1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2021 K.Kosako
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31
/* NOTE(review): presumably the initial allocation size handed to ops_init();
   the use site is not in this part of the file -- confirm. */
#define OPS_INIT_SIZE  8

/* True when the node has the IGNORECASE status and
   NODE_STRING_IS_CRUDE() is false for it. */
#define NODE_IS_REAL_IGNORECASE(node) \
  (NODE_IS_IGNORECASE(node) && !NODE_STRING_IS_CRUDE(node))

/* A minimum/maximum length pair. */
typedef struct {
  OnigLen min;
  OnigLen max;
} MinMaxLen;

/* A minimum/maximum character-length pair.
   min_is_sure is a TRUE/FALSE flag qualifying `min`; the mmcl_* helpers
   clear it whenever one of the combined lengths is not sure. */
typedef struct {
  OnigLen min;
  OnigLen max;
  int min_is_sure;
} MinMaxCharLen;

/* Default case fold flag; read and written through
   onig_get_default_case_fold_flag() / onig_set_default_case_fold_flag(). */
OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;

static OnigLen node_min_byte_len(Node* node, ParseEnv* env);
51
52 #if 0
/* Growable stack of int values.
   n     : number of values currently stored
   alloc : number of slots allocated in v
   v     : heap array holding the values */
typedef struct {
  int n;
  int alloc;
  int* v;
} int_stack;
58
59 static int
60 make_int_stack(int_stack** rs, int init_size)
61 {
62 int_stack* s;
63 int* v;
64
65 *rs = 0;
66
67 s = xmalloc(sizeof(*s));
68 if (IS_NULL(s)) return ONIGERR_MEMORY;
69
70 v = (int* )xmalloc(sizeof(int) * init_size);
71 if (IS_NULL(v)) {
72 xfree(s);
73 return ONIGERR_MEMORY;
74 }
75
76 s->n = 0;
77 s->alloc = init_size;
78 s->v = v;
79
80 *rs = s;
81 return ONIG_NORMAL;
82 }
83
84 static void
85 free_int_stack(int_stack* s)
86 {
87 if (IS_NOT_NULL(s)) {
88 if (IS_NOT_NULL(s->v))
89 xfree(s->v);
90 xfree(s);
91 }
92 }
93
94 static int
95 int_stack_push(int_stack* s, int v)
96 {
97 if (s->n >= s->alloc) {
98 int new_size = s->alloc * 2;
99 int* nv = (int* )xrealloc(s->v, sizeof(int) * new_size);
100 if (IS_NULL(nv)) return ONIGERR_MEMORY;
101
102 s->alloc = new_size;
103 s->v = nv;
104 }
105
106 s->v[s->n] = v;
107 s->n++;
108 return ONIG_NORMAL;
109 }
110
111 static int
112 int_stack_pop(int_stack* s)
113 {
114 int v;
115
116 #ifdef ONIG_DEBUG
117 if (s->n <= 0) {
118 fprintf(DBGFP, "int_stack_pop: fail empty. %p\n", s);
119 return 0;
120 }
121 #endif
122
123 v = s->v[s->n];
124 s->n--;
125 return v;
126 }
127 #endif
128
129 static int
ops_init(regex_t * reg,int init_alloc_size)130 ops_init(regex_t* reg, int init_alloc_size)
131 {
132 Operation* p;
133 size_t size;
134
135 if (init_alloc_size <= 0)
136 return ONIGERR_PARSER_BUG;
137
138 size = sizeof(Operation) * init_alloc_size;
139 p = (Operation* )xrealloc(reg->ops, size);
140 CHECK_NULL_RETURN_MEMERR(p);
141 reg->ops = p;
142 #ifdef USE_DIRECT_THREADED_CODE
143 {
144 enum OpCode* cp;
145 size = sizeof(enum OpCode) * init_alloc_size;
146 cp = (enum OpCode* )xrealloc(reg->ocs, size);
147 CHECK_NULL_RETURN_MEMERR(cp);
148 reg->ocs = cp;
149 }
150 #endif
151
152 reg->ops_curr = 0; /* !!! not yet done ops_new() */
153 reg->ops_alloc = init_alloc_size;
154 reg->ops_used = 0;
155
156 return ONIG_NORMAL;
157 }
158
159 static int
ops_resize(regex_t * reg,int n)160 ops_resize(regex_t* reg, int n)
161 {
162 #ifdef USE_DIRECT_THREADED_CODE
163 enum OpCode* cp;
164 #endif
165 Operation* p;
166 size_t size;
167
168 if (n == reg->ops_alloc) return ONIG_NORMAL;
169 if (n <= 0) return ONIGERR_PARSER_BUG;
170
171 size = sizeof(Operation) * n;
172 p = (Operation* )xrealloc(reg->ops, size);
173 CHECK_NULL_RETURN_MEMERR(p);
174 reg->ops = p;
175
176 #ifdef USE_DIRECT_THREADED_CODE
177 size = sizeof(enum OpCode) * n;
178 cp = (enum OpCode* )xrealloc(reg->ocs, size);
179 CHECK_NULL_RETURN_MEMERR(cp);
180 reg->ocs = cp;
181 #endif
182
183 reg->ops_alloc = n;
184 if (reg->ops_used == 0)
185 reg->ops_curr = 0;
186 else
187 reg->ops_curr = reg->ops + (reg->ops_used - 1);
188
189 return ONIG_NORMAL;
190 }
191
192 static int
ops_new(regex_t * reg)193 ops_new(regex_t* reg)
194 {
195 if (reg->ops_used >= reg->ops_alloc) {
196 int r = ops_resize(reg, reg->ops_alloc << 1);
197 if (r != ONIG_NORMAL) return r;
198 }
199
200 reg->ops_curr = reg->ops + reg->ops_used;
201 reg->ops_used++;
202
203 xmemset(reg->ops_curr, 0, sizeof(Operation));
204 return ONIG_NORMAL;
205 }
206
207 static int
is_in_string_pool(regex_t * reg,UChar * s)208 is_in_string_pool(regex_t* reg, UChar* s)
209 {
210 return (s >= reg->string_pool && s < reg->string_pool_end);
211 }
212
213 static void
ops_free(regex_t * reg)214 ops_free(regex_t* reg)
215 {
216 int i;
217
218 if (IS_NULL(reg->ops)) return ;
219
220 for (i = 0; i < (int )reg->ops_used; i++) {
221 enum OpCode opcode;
222 Operation* op;
223
224 op = reg->ops + i;
225
226 #ifdef USE_DIRECT_THREADED_CODE
227 opcode = *(reg->ocs + i);
228 #else
229 opcode = op->opcode;
230 #endif
231
232 switch (opcode) {
233 case OP_STR_MBN:
234 if (! is_in_string_pool(reg, op->exact_len_n.s))
235 xfree(op->exact_len_n.s);
236 break;
237 case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N:
238 if (! is_in_string_pool(reg, op->exact_n.s))
239 xfree(op->exact_n.s);
240 break;
241 case OP_STR_1: case OP_STR_2: case OP_STR_3: case OP_STR_4:
242 case OP_STR_5: case OP_STR_MB2N1: case OP_STR_MB2N2:
243 case OP_STR_MB2N3:
244 break;
245
246 case OP_CCLASS_NOT: case OP_CCLASS:
247 xfree(op->cclass.bsp);
248 break;
249
250 case OP_CCLASS_MB_NOT: case OP_CCLASS_MB:
251 xfree(op->cclass_mb.mb);
252 break;
253 case OP_CCLASS_MIX_NOT: case OP_CCLASS_MIX:
254 xfree(op->cclass_mix.mb);
255 xfree(op->cclass_mix.bsp);
256 break;
257
258 case OP_BACKREF1: case OP_BACKREF2: case OP_BACKREF_N: case OP_BACKREF_N_IC:
259 break;
260 case OP_BACKREF_MULTI: case OP_BACKREF_MULTI_IC:
261 case OP_BACKREF_CHECK:
262 #ifdef USE_BACKREF_WITH_LEVEL
263 case OP_BACKREF_WITH_LEVEL:
264 case OP_BACKREF_WITH_LEVEL_IC:
265 case OP_BACKREF_CHECK_WITH_LEVEL:
266 #endif
267 if (op->backref_general.num != 1)
268 xfree(op->backref_general.ns);
269 break;
270
271 default:
272 break;
273 }
274 }
275
276 xfree(reg->ops);
277 #ifdef USE_DIRECT_THREADED_CODE
278 xfree(reg->ocs);
279 reg->ocs = 0;
280 #endif
281
282 reg->ops = 0;
283 reg->ops_curr = 0;
284 reg->ops_alloc = 0;
285 reg->ops_used = 0;
286 }
287
288 static int
ops_calc_size_of_string_pool(regex_t * reg)289 ops_calc_size_of_string_pool(regex_t* reg)
290 {
291 int i;
292 int total;
293
294 if (IS_NULL(reg->ops)) return 0;
295
296 total = 0;
297 for (i = 0; i < (int )reg->ops_used; i++) {
298 enum OpCode opcode;
299 Operation* op;
300
301 op = reg->ops + i;
302 #ifdef USE_DIRECT_THREADED_CODE
303 opcode = *(reg->ocs + i);
304 #else
305 opcode = op->opcode;
306 #endif
307
308 switch (opcode) {
309 case OP_STR_MBN:
310 total += op->exact_len_n.len * op->exact_len_n.n;
311 break;
312 case OP_STR_N:
313 case OP_STR_MB2N:
314 total += op->exact_n.n * 2;
315 break;
316 case OP_STR_MB3N:
317 total += op->exact_n.n * 3;
318 break;
319
320 default:
321 break;
322 }
323 }
324
325 return total;
326 }
327
328 static int
ops_make_string_pool(regex_t * reg)329 ops_make_string_pool(regex_t* reg)
330 {
331 int i;
332 int len;
333 int size;
334 UChar* pool;
335 UChar* curr;
336
337 size = ops_calc_size_of_string_pool(reg);
338 if (size <= 0) {
339 return 0;
340 }
341
342 curr = pool = (UChar* )xmalloc((size_t )size);
343 CHECK_NULL_RETURN_MEMERR(pool);
344
345 for (i = 0; i < (int )reg->ops_used; i++) {
346 enum OpCode opcode;
347 Operation* op;
348
349 op = reg->ops + i;
350 #ifdef USE_DIRECT_THREADED_CODE
351 opcode = *(reg->ocs + i);
352 #else
353 opcode = op->opcode;
354 #endif
355
356 switch (opcode) {
357 case OP_STR_MBN:
358 len = op->exact_len_n.len * op->exact_len_n.n;
359 xmemcpy(curr, op->exact_len_n.s, len);
360 xfree(op->exact_len_n.s);
361 op->exact_len_n.s = curr;
362 curr += len;
363 break;
364 case OP_STR_N:
365 len = op->exact_n.n;
366 copy:
367 xmemcpy(curr, op->exact_n.s, len);
368 xfree(op->exact_n.s);
369 op->exact_n.s = curr;
370 curr += len;
371 break;
372 case OP_STR_MB2N:
373 len = op->exact_n.n * 2;
374 goto copy;
375 break;
376 case OP_STR_MB3N:
377 len = op->exact_n.n * 3;
378 goto copy;
379 break;
380
381 default:
382 break;
383 }
384 }
385
386 reg->string_pool = pool;
387 reg->string_pool_end = pool + size;
388 return 0;
389 }
390
391 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)392 onig_get_default_case_fold_flag(void)
393 {
394 return OnigDefaultCaseFoldFlag;
395 }
396
397 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)398 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
399 {
400 OnigDefaultCaseFoldFlag = case_fold_flag;
401 return 0;
402 }
403
404 static int
len_multiply_cmp(OnigLen x,int y,OnigLen v)405 len_multiply_cmp(OnigLen x, int y, OnigLen v)
406 {
407 if (x == 0 || y == 0) return -1;
408
409 if (x < INFINITE_LEN / y) {
410 OnigLen xy = x * (OnigLen )y;
411 if (xy > v) return 1;
412 else {
413 if (xy == v) return 0;
414 else return -1;
415 }
416 }
417 else
418 return v == INFINITE_LEN ? 0 : 1;
419 }
420
421 extern int
onig_positive_int_multiply(int x,int y)422 onig_positive_int_multiply(int x, int y)
423 {
424 if (x == 0 || y == 0) return 0;
425
426 if (x < ONIG_INT_MAX / y)
427 return x * y;
428 else
429 return -1;
430 }
431
432
433 static void
node_swap(Node * a,Node * b)434 node_swap(Node* a, Node* b)
435 {
436 Node c;
437
438 c = *a; *a = *b; *b = c;
439
440 if (NODE_TYPE(a) == NODE_STRING) {
441 StrNode* sn = STR_(a);
442 if (sn->capacity == 0) {
443 int len = (int )(sn->end - sn->s);
444 sn->s = sn->buf;
445 sn->end = sn->s + len;
446 }
447 }
448
449 if (NODE_TYPE(b) == NODE_STRING) {
450 StrNode* sn = STR_(b);
451 if (sn->capacity == 0) {
452 int len = (int )(sn->end - sn->s);
453 sn->s = sn->buf;
454 sn->end = sn->s + len;
455 }
456 }
457 }
458
459 static int
node_list_len(Node * list)460 node_list_len(Node* list)
461 {
462 int len;
463
464 len = 1;
465 while (IS_NOT_NULL(NODE_CDR(list))) {
466 list = NODE_CDR(list);
467 len++;
468 }
469
470 return len;
471 }
472
473 static Node*
node_list_add(Node * list,Node * x)474 node_list_add(Node* list, Node* x)
475 {
476 Node *n;
477
478 n = onig_node_new_list(x, NULL);
479 if (IS_NULL(n)) return NULL_NODE;
480
481 if (IS_NOT_NULL(list)) {
482 while (IS_NOT_NULL(NODE_CDR(list)))
483 list = NODE_CDR(list);
484
485 NODE_CDR(list) = n;
486 }
487
488 return n;
489 }
490
491 static int
node_str_node_cat(Node * node,Node * add)492 node_str_node_cat(Node* node, Node* add)
493 {
494 int r;
495
496 if (NODE_STATUS(node) != NODE_STATUS(add))
497 return ONIGERR_TYPE_BUG;
498
499 if (STR_(node)->flag != STR_(add)->flag)
500 return ONIGERR_TYPE_BUG;
501
502 r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end);
503 if (r != 0) return r;
504
505 return 0;
506 }
507
508 static void
node_conv_to_str_node(Node * node,Node * ref_node)509 node_conv_to_str_node(Node* node, Node* ref_node)
510 {
511 xmemset(node, 0, sizeof(*node));
512 NODE_SET_TYPE(node, NODE_STRING);
513 NODE_STATUS(node) = NODE_STATUS(ref_node);
514
515 STR_(node)->flag = STR_(ref_node)->flag;
516 STR_(node)->s = STR_(node)->buf;
517 STR_(node)->end = STR_(node)->buf;
518 STR_(node)->capacity = 0;
519 }
520
521 static OnigLen
distance_add(OnigLen d1,OnigLen d2)522 distance_add(OnigLen d1, OnigLen d2)
523 {
524 if (d1 == INFINITE_LEN || d2 == INFINITE_LEN)
525 return INFINITE_LEN;
526 else {
527 if (d1 <= INFINITE_LEN - d2) return d1 + d2;
528 else return INFINITE_LEN;
529 }
530 }
531
532 static OnigLen
distance_multiply(OnigLen d,int m)533 distance_multiply(OnigLen d, int m)
534 {
535 if (m == 0) return 0;
536
537 if (d < INFINITE_LEN / m)
538 return d * m;
539 else
540 return INFINITE_LEN;
541 }
542
543 static int
bitset_is_empty(BitSetRef bs)544 bitset_is_empty(BitSetRef bs)
545 {
546 int i;
547
548 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) {
549 if (bs[i] != 0) return 0;
550 }
551 return 1;
552 }
553
554 #ifdef USE_CALL
555
556 static int
unset_addr_list_init(UnsetAddrList * list,int size)557 unset_addr_list_init(UnsetAddrList* list, int size)
558 {
559 UnsetAddr* p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
560 CHECK_NULL_RETURN_MEMERR(p);
561
562 list->num = 0;
563 list->alloc = size;
564 list->us = p;
565 return 0;
566 }
567
568 static void
unset_addr_list_end(UnsetAddrList * list)569 unset_addr_list_end(UnsetAddrList* list)
570 {
571 if (IS_NOT_NULL(list->us))
572 xfree(list->us);
573 }
574
575 static int
unset_addr_list_add(UnsetAddrList * list,int offset,struct _Node * node)576 unset_addr_list_add(UnsetAddrList* list, int offset, struct _Node* node)
577 {
578 UnsetAddr* p;
579 int size;
580
581 if (list->num >= list->alloc) {
582 size = list->alloc * 2;
583 p = (UnsetAddr* )xrealloc(list->us, sizeof(UnsetAddr) * size);
584 CHECK_NULL_RETURN_MEMERR(p);
585 list->alloc = size;
586 list->us = p;
587 }
588
589 list->us[list->num].offset = offset;
590 list->us[list->num].target = node;
591 list->num++;
592 return 0;
593 }
594 #endif /* USE_CALL */
595
/* Non-negative results of node_char_len1() / node_char_len(). */
enum CharLenReturnType {
  CHAR_LEN_NORMAL = 0, /* fixed or variable */
  /* set by node_char_len1() for a NODE_ALT evaluated at level 1 whose
     merged length is not fixed while each alternative after the first is
     individually fixed */
  CHAR_LEN_TOP_ALT_FIXED = 1
};
600
601 static int
mmcl_fixed(MinMaxCharLen * c)602 mmcl_fixed(MinMaxCharLen* c)
603 {
604 return (c->min == c->max && c->min != INFINITE_LEN);
605 }
606
607 static void
mmcl_set(MinMaxCharLen * l,OnigLen len)608 mmcl_set(MinMaxCharLen* l, OnigLen len)
609 {
610 l->min = len;
611 l->max = len;
612 l->min_is_sure = TRUE;
613 }
614
615 static void
mmcl_set_min_max(MinMaxCharLen * l,OnigLen min,OnigLen max,int min_is_sure)616 mmcl_set_min_max(MinMaxCharLen* l, OnigLen min, OnigLen max, int min_is_sure)
617 {
618 l->min = min;
619 l->max = max;
620 l->min_is_sure = min_is_sure;
621 }
622
623 static void
mmcl_add(MinMaxCharLen * to,MinMaxCharLen * add)624 mmcl_add(MinMaxCharLen* to, MinMaxCharLen* add)
625 {
626 to->min = distance_add(to->min, add->min);
627 to->max = distance_add(to->max, add->max);
628
629 to->min_is_sure = add->min_is_sure != FALSE && to->min_is_sure != FALSE;
630 }
631
632 static void
mmcl_multiply(MinMaxCharLen * to,int m)633 mmcl_multiply(MinMaxCharLen* to, int m)
634 {
635 to->min = distance_multiply(to->min, m);
636 to->max = distance_multiply(to->max, m);
637 }
638
639 static void
mmcl_repeat_range_multiply(MinMaxCharLen * to,int mlow,int mhigh)640 mmcl_repeat_range_multiply(MinMaxCharLen* to, int mlow, int mhigh)
641 {
642 to->min = distance_multiply(to->min, mlow);
643
644 if (IS_INFINITE_REPEAT(mhigh))
645 to->max = INFINITE_LEN;
646 else
647 to->max = distance_multiply(to->max, mhigh);
648 }
649
650 static void
mmcl_alt_merge(MinMaxCharLen * to,MinMaxCharLen * alt)651 mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt)
652 {
653 if (to->min > alt->min) {
654 to->min = alt->min;
655 to->min_is_sure = alt->min_is_sure;
656 }
657 else if (to->min == alt->min) {
658 if (alt->min_is_sure != FALSE)
659 to->min_is_sure = TRUE;
660 }
661
662 if (to->max < alt->max) to->max = alt->max;
663 }
664
665 #ifndef ONIG_DONT_OPTIMIZE
666
667 static int
mml_is_equal(MinMaxLen * a,MinMaxLen * b)668 mml_is_equal(MinMaxLen* a, MinMaxLen* b)
669 {
670 return a->min == b->min && a->max == b->max;
671 }
672
673 static void
mml_set_min_max(MinMaxLen * l,OnigLen min,OnigLen max)674 mml_set_min_max(MinMaxLen* l, OnigLen min, OnigLen max)
675 {
676 l->min = min;
677 l->max = max;
678 }
679
680 static void
mml_clear(MinMaxLen * l)681 mml_clear(MinMaxLen* l)
682 {
683 l->min = l->max = 0;
684 }
685
686 static void
mml_copy(MinMaxLen * to,MinMaxLen * from)687 mml_copy(MinMaxLen* to, MinMaxLen* from)
688 {
689 to->min = from->min;
690 to->max = from->max;
691 }
692
693 static void
mml_add(MinMaxLen * to,MinMaxLen * add)694 mml_add(MinMaxLen* to, MinMaxLen* add)
695 {
696 to->min = distance_add(to->min, add->min);
697 to->max = distance_add(to->max, add->max);
698 }
699
700 static void
mml_alt_merge(MinMaxLen * to,MinMaxLen * alt)701 mml_alt_merge(MinMaxLen* to, MinMaxLen* alt)
702 {
703 if (to->min > alt->min) to->min = alt->min;
704 if (to->max < alt->max) to->max = alt->max;
705 }
706
707 #endif
708
/*
 * Compute the minimum and maximum length, counted in characters, of the
 * pattern rooted at `node` and store the result in *ci.
 *
 *  node  : root of the (sub)tree to measure
 *  reg   : regex being compiled; reg->enc is used to step over characters
 *  ci    : output; written on success
 *  env   : parse environment (case fold flag, capture group table)
 *  level : depth of the caller.  It is incremented on entry, so the call
 *          issued by node_char_len() (which passes 0) runs with level == 1.
 *
 * Returns a negative ONIGERR_* code on failure.  Otherwise returns
 * CHAR_LEN_NORMAL, or CHAR_LEN_TOP_ALT_FIXED for a NODE_ALT handled at
 * level 1 (see that case below).
 */
static int
node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ParseEnv* env,
               int level)
{
  MinMaxCharLen tci;   /* scratch result for a child node */
  int r = CHAR_LEN_NORMAL;

  level++;

  switch (NODE_TYPE(node)) {
  case NODE_LIST:
    /* concatenation: the first element initialises *ci, the others are
       added to it */
    {
      int first = TRUE;
      do {
        r = node_char_len1(NODE_CAR(node), reg, &tci, env, level);
        if (r < 0) break;
        if (first == TRUE) {
          *ci = tci;
          first = FALSE;
        }
        else
          mmcl_add(ci, &tci);
      } while (IS_NOT_NULL(node = NODE_CDR(node)));
    }
    break;

  case NODE_ALT:
    /* alternation: the first alternative is measured directly into *ci,
       the others are merged in */
    {
      int fixed;

      r = node_char_len1(NODE_CAR(node), reg, ci, env, level);
      if (r < 0) break;

      /* NOTE(review): `fixed` only reflects the alternatives after the
         first one; the first alternative is not tested with mmcl_fixed().
         Confirm this is intended. */
      fixed = TRUE;
      while (IS_NOT_NULL(node = NODE_CDR(node))) {
        r = node_char_len1(NODE_CAR(node), reg, &tci, env, level);
        if (r < 0) break;
        if (! mmcl_fixed(&tci))
          fixed = FALSE;
        mmcl_alt_merge(ci, &tci);
      }
      if (r < 0) break;

      r = CHAR_LEN_NORMAL;
      if (mmcl_fixed(ci)) break;

      /* merged length is not fixed; report the special code only for the
         alternation handled at level 1 */
      if (fixed == TRUE && level == 1) {
        r = CHAR_LEN_TOP_ALT_FIXED;
      }
    }
    break;

  case NODE_STRING:
    {
      OnigLen clen;
      StrNode* sn = STR_(node);
      UChar *s = sn->s;

      if (NODE_IS_REAL_IGNORECASE(node) &&
          CASE_FOLD_IS_NOT_ASCII_ONLY(env->case_fold_flag)) {
        /* Such a case is possible.
           ex. /(?i)(?<=\1)(a)/
           The backref node refers to a capture group that has not been
           tuned yet.
         */
        r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
        break;
      }

      /* count characters by stepping enclen() bytes at a time */
      clen = 0;
      while (s < sn->end) {
        s += enclen(reg->enc, s);
        clen = distance_add(clen, 1);
      }
      mmcl_set(ci, clen);
    }
    break;

  case NODE_QUANT:
    {
      QuantNode* qn = QUANT_(node);

      if (qn->lower == qn->upper) {
        /* exact repeat count */
        if (qn->upper == 0) {
          mmcl_set(ci, 0);
        }
        else {
          r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
          if (r < 0) break;
          mmcl_multiply(ci, qn->lower);
        }
      }
      else {
        /* ranged repeat: min scaled by lower, max by upper (or infinite) */
        r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
        if (r < 0) break;
        mmcl_repeat_range_multiply(ci, qn->lower, qn->upper);
      }
    }
    break;

#ifdef USE_CALL
  case NODE_CALL:
    /* a recursive call is treated as 0..infinite with an unsure minimum */
    if (NODE_IS_RECURSION(node))
      mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE);
    else
      r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
    break;
#endif

  case NODE_CTYPE:
  case NODE_CCLASS:
    /* exactly one character */
    mmcl_set(ci, 1);
    break;

  case NODE_BAG:
    {
      BagNode* en = BAG_(node);

      switch (en->type) {
      case BAG_MEMORY:
        if (NODE_IS_FIXED_CLEN(node)) {
          /* reuse the lengths cached on the node by an earlier visit */
          mmcl_set_min_max(ci, en->min_char_len, en->max_char_len,
                           NODE_IS_FIXED_CLEN_MIN_SURE(node));
        }
        else {
          if (NODE_IS_MARK1(node)) {
            /* this group is already being measured further up the call
               chain */
            mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE);
          }
          else {
            /* MARK1 is set only while the body is being measured */
            NODE_STATUS_ADD(node, MARK1);
            r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
            NODE_STATUS_REMOVE(node, MARK1);
            if (r < 0) break;

            /* cache the result on the node */
            en->min_char_len = ci->min;
            en->max_char_len = ci->max;
            NODE_STATUS_ADD(node, FIXED_CLEN);
            if (ci->min_is_sure != FALSE)
              NODE_STATUS_ADD(node, FIXED_CLEN_MIN_SURE);
          }
        }
        /* can't optimize look-behind if capture exists. */
        ci->min_is_sure = FALSE;
        break;
      case BAG_OPTION:
      case BAG_STOP_BACKTRACK:
        r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
        break;
      case BAG_IF_ELSE:
        /* (body + Then) merged with Else as alternatives; a missing Else
           counts as length 0 */
        {
          MinMaxCharLen eci;

          r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
          if (r < 0) break;

          if (IS_NOT_NULL(en->te.Then)) {
            r = node_char_len1(en->te.Then, reg, &tci, env, level);
            if (r < 0) break;
            mmcl_add(ci, &tci);
          }

          if (IS_NOT_NULL(en->te.Else)) {
            r = node_char_len1(en->te.Else, reg, &eci, env, level);
            if (r < 0) break;
          }
          else {
            mmcl_set(&eci, 0);
          }

          mmcl_alt_merge(ci, &eci);
        }
        break;
      default: /* never come here */
        r = ONIGERR_PARSER_BUG;
        break;
      }
    }
    break;

  case NODE_GIMMICK:
    mmcl_set(ci, 0);
    break;

  case NODE_ANCHOR:
  zero:   /* also reached from NODE_BACKREF for checker nodes */
    mmcl_set(ci, 0);
    /* can't optimize look-behind if anchor exists. */
    ci->min_is_sure = FALSE;
    break;

  case NODE_BACKREF:
    if (NODE_IS_CHECKER(node))
      goto zero;

    if (NODE_IS_RECURSION(node)) {
#ifdef USE_BACKREF_WITH_LEVEL
      if (NODE_IS_NEST_LEVEL(node)) {
        mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE);
        break;
      }
#endif

      mmcl_set_min_max(ci, 0, 0, FALSE);
      break;
    }

    /* measure every referenced group and merge them as alternatives; a
       group whose length is not fixed makes the minimum unsure */
    {
      int i;
      int* backs;
      MemEnv* mem_env = PARSEENV_MEMENV(env);
      BackRefNode* br = BACKREF_(node);

      backs = BACKREFS_P(br);
      r = node_char_len1(mem_env[backs[0]].mem_node, reg, ci, env, level);
      if (r < 0) break;
      if (! mmcl_fixed(ci)) ci->min_is_sure = FALSE;

      for (i = 1; i < br->back_num; i++) {
        r = node_char_len1(mem_env[backs[i]].mem_node, reg, &tci, env, level);
        if (r < 0) break;
        if (! mmcl_fixed(&tci)) tci.min_is_sure = FALSE;
        mmcl_alt_merge(ci, &tci);
      }
    }
    break;

  default: /* never come here */
    r = ONIGERR_PARSER_BUG;
    break;
  }

  return r;
}
942
943 static int
node_char_len(Node * node,regex_t * reg,MinMaxCharLen * ci,ParseEnv * env)944 node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ParseEnv* env)
945 {
946 return node_char_len1(node, reg, ci, env, 0);
947 }
948
949
950 static int
add_op(regex_t * reg,int opcode)951 add_op(regex_t* reg, int opcode)
952 {
953 int r;
954
955 r = ops_new(reg);
956 if (r != ONIG_NORMAL) return r;
957
958 #ifdef USE_DIRECT_THREADED_CODE
959 *(reg->ocs + (reg->ops_curr - reg->ops)) = opcode;
960 #else
961 reg->ops_curr->opcode = opcode;
962 #endif
963
964 return 0;
965 }
966
/* Forward declarations; both functions are defined later in the file. */
static int compile_length_tree(Node* node, regex_t* reg);
static int compile_tree(Node* node, regex_t* reg, ParseEnv* env);


/* True for the string opcodes whose operation stores a pointer to the
   string together with a count (OP_STR_N, OP_STR_MB2N, OP_STR_MB3N and
   OP_STR_MBN). */
#define IS_NEED_STR_LEN_OP(op) \
   ((op) == OP_STR_N || (op) == OP_STR_MB2N ||\
    (op) == OP_STR_MB3N || (op) == OP_STR_MBN)
974
975 static int
select_str_opcode(int mb_len,int str_len)976 select_str_opcode(int mb_len, int str_len)
977 {
978 int op;
979
980 switch (mb_len) {
981 case 1:
982 switch (str_len) {
983 case 1: op = OP_STR_1; break;
984 case 2: op = OP_STR_2; break;
985 case 3: op = OP_STR_3; break;
986 case 4: op = OP_STR_4; break;
987 case 5: op = OP_STR_5; break;
988 default: op = OP_STR_N; break;
989 }
990 break;
991
992 case 2:
993 switch (str_len) {
994 case 1: op = OP_STR_MB2N1; break;
995 case 2: op = OP_STR_MB2N2; break;
996 case 3: op = OP_STR_MB2N3; break;
997 default: op = OP_STR_MB2N; break;
998 }
999 break;
1000
1001 case 3:
1002 op = OP_STR_MB3N;
1003 break;
1004
1005 default:
1006 op = OP_STR_MBN;
1007 break;
1008 }
1009
1010 return op;
1011 }
1012
1013 static int
is_strict_real_node(Node * node)1014 is_strict_real_node(Node* node)
1015 {
1016 switch (NODE_TYPE(node)) {
1017 case NODE_STRING:
1018 {
1019 StrNode* sn = STR_(node);
1020 return (sn->end != sn->s);
1021 }
1022 break;
1023
1024 case NODE_CCLASS:
1025 case NODE_CTYPE:
1026 return 1;
1027 break;
1028
1029 default:
1030 return 0;
1031 break;
1032 }
1033 }
1034
1035 static int
compile_quant_body_with_empty_check(QuantNode * qn,regex_t * reg,ParseEnv * env)1036 compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ParseEnv* env)
1037 {
1038 int r;
1039 int saved_num_empty_check;
1040 int emptiness;
1041 Node* body;
1042
1043 body = NODE_BODY((Node* )qn);
1044 emptiness = qn->emptiness;
1045 saved_num_empty_check = reg->num_empty_check;
1046
1047 if (emptiness != BODY_IS_NOT_EMPTY) {
1048 r = add_op(reg, OP_EMPTY_CHECK_START);
1049 if (r != 0) return r;
1050 COP(reg)->empty_check_start.mem = reg->num_empty_check; /* NULL CHECK ID */
1051 reg->num_empty_check++;
1052 }
1053
1054 r = compile_tree(body, reg, env);
1055 if (r != 0) return r;
1056
1057 if (emptiness != BODY_IS_NOT_EMPTY) {
1058 if (emptiness == BODY_MAY_BE_EMPTY)
1059 r = add_op(reg, OP_EMPTY_CHECK_END);
1060 else if (emptiness == BODY_MAY_BE_EMPTY_MEM) {
1061 if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0 && qn->empty_status_mem != 0) {
1062 r = add_op(reg, OP_EMPTY_CHECK_END_MEMST);
1063 if (r != 0) return r;
1064 COP(reg)->empty_check_end.empty_status_mem = qn->empty_status_mem;
1065 }
1066 else
1067 r = add_op(reg, OP_EMPTY_CHECK_END);
1068 }
1069 #ifdef USE_CALL
1070 else if (emptiness == BODY_MAY_BE_EMPTY_REC) {
1071 r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH);
1072 if (r != 0) return r;
1073 COP(reg)->empty_check_end.empty_status_mem = qn->empty_status_mem;
1074 }
1075 #endif
1076
1077 if (r != 0) return r;
1078 COP(reg)->empty_check_end.mem = saved_num_empty_check; /* NULL CHECK ID */
1079 }
1080 return r;
1081 }
1082
1083 #ifdef USE_CALL
1084 static int
compile_call(CallNode * node,regex_t * reg,ParseEnv * env)1085 compile_call(CallNode* node, regex_t* reg, ParseEnv* env)
1086 {
1087 int r;
1088 int offset;
1089
1090 r = add_op(reg, OP_CALL);
1091 if (r != 0) return r;
1092
1093 COP(reg)->call.addr = 0; /* dummy addr. */
1094 #ifdef ONIG_DEBUG_MATCH_COUNTER
1095 COP(reg)->call.called_mem = node->called_gnum;
1096 #endif
1097
1098 offset = COP_CURR_OFFSET_BYTES(reg, call.addr);
1099 r = unset_addr_list_add(env->unset_addr_list, offset, NODE_CALL_BODY(node));
1100 return r;
1101 }
1102 #endif
1103
1104 static int
compile_tree_n_times(Node * node,int n,regex_t * reg,ParseEnv * env)1105 compile_tree_n_times(Node* node, int n, regex_t* reg, ParseEnv* env)
1106 {
1107 int i, r;
1108
1109 for (i = 0; i < n; i++) {
1110 r = compile_tree(node, reg, env);
1111 if (r != 0) return r;
1112 }
1113 return 0;
1114 }
1115
1116 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED)1117 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
1118 regex_t* reg ARG_UNUSED)
1119 {
1120 return 1;
1121 }
1122
1123 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg)1124 add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg)
1125 {
1126 int op;
1127 int r;
1128 int byte_len;
1129 UChar* p;
1130 UChar* end;
1131
1132 op = select_str_opcode(mb_len, str_len);
1133 r = add_op(reg, op);
1134 if (r != 0) return r;
1135
1136 byte_len = mb_len * str_len;
1137 end = s + byte_len;
1138
1139 if (op == OP_STR_MBN) {
1140 p = onigenc_strdup(reg->enc, s, end);
1141 CHECK_NULL_RETURN_MEMERR(p);
1142
1143 COP(reg)->exact_len_n.len = mb_len;
1144 COP(reg)->exact_len_n.n = str_len;
1145 COP(reg)->exact_len_n.s = p;
1146 }
1147 else if (IS_NEED_STR_LEN_OP(op)) {
1148 p = onigenc_strdup(reg->enc, s, end);
1149 CHECK_NULL_RETURN_MEMERR(p);
1150 COP(reg)->exact_n.n = str_len;
1151 COP(reg)->exact_n.s = p;
1152 }
1153 else {
1154 xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s));
1155 xmemcpy(COP(reg)->exact.s, s, (size_t )byte_len);
1156 }
1157
1158 return 0;
1159 }
1160
1161 static int
compile_length_string_node(Node * node,regex_t * reg)1162 compile_length_string_node(Node* node, regex_t* reg)
1163 {
1164 int rlen, r, len, prev_len, slen;
1165 UChar *p, *prev;
1166 StrNode* sn;
1167 OnigEncoding enc = reg->enc;
1168
1169 sn = STR_(node);
1170 if (sn->end <= sn->s)
1171 return 0;
1172
1173 p = prev = sn->s;
1174 prev_len = enclen(enc, p);
1175 p += prev_len;
1176 slen = 1;
1177 rlen = 0;
1178
1179 for (; p < sn->end; ) {
1180 len = enclen(enc, p);
1181 if (len == prev_len) {
1182 slen++;
1183 }
1184 else {
1185 r = add_compile_string_length(prev, prev_len, slen, reg);
1186 rlen += r;
1187 prev = p;
1188 slen = 1;
1189 prev_len = len;
1190 }
1191 p += len;
1192 }
1193
1194 r = add_compile_string_length(prev, prev_len, slen, reg);
1195 rlen += r;
1196 return rlen;
1197 }
1198
1199 static int
compile_length_string_crude_node(StrNode * sn,regex_t * reg)1200 compile_length_string_crude_node(StrNode* sn, regex_t* reg)
1201 {
1202 if (sn->end <= sn->s)
1203 return 0;
1204
1205 return add_compile_string_length(sn->s, 1 /* sb */, (int )(sn->end - sn->s),
1206 reg);
1207 }
1208
1209 static int
compile_string_node(Node * node,regex_t * reg)1210 compile_string_node(Node* node, regex_t* reg)
1211 {
1212 int r, len, prev_len, slen;
1213 UChar *p, *prev, *end;
1214 StrNode* sn;
1215 OnigEncoding enc = reg->enc;
1216
1217 sn = STR_(node);
1218 if (sn->end <= sn->s)
1219 return 0;
1220
1221 end = sn->end;
1222
1223 p = prev = sn->s;
1224 prev_len = enclen(enc, p);
1225 p += prev_len;
1226 slen = 1;
1227
1228 for (; p < end; ) {
1229 len = enclen(enc, p);
1230 if (len == prev_len) {
1231 slen++;
1232 }
1233 else {
1234 r = add_compile_string(prev, prev_len, slen, reg);
1235 if (r != 0) return r;
1236
1237 prev = p;
1238 slen = 1;
1239 prev_len = len;
1240 }
1241
1242 p += len;
1243 }
1244
1245 return add_compile_string(prev, prev_len, slen, reg);
1246 }
1247
1248 static int
compile_string_crude_node(StrNode * sn,regex_t * reg)1249 compile_string_crude_node(StrNode* sn, regex_t* reg)
1250 {
1251 if (sn->end <= sn->s)
1252 return 0;
1253
1254 return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg);
1255 }
1256
1257 static void*
set_multi_byte_cclass(BBuf * mbuf,regex_t * reg)1258 set_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
1259 {
1260 size_t len;
1261 void* p;
1262
1263 len = (size_t )mbuf->used;
1264 p = xmalloc(len);
1265 if (IS_NULL(p)) return NULL;
1266
1267 xmemcpy(p, mbuf->p, len);
1268 return p;
1269 }
1270
1271 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)1272 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
1273 {
1274 return 1;
1275 }
1276
1277 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)1278 compile_cclass_node(CClassNode* cc, regex_t* reg)
1279 {
1280 int r;
1281
1282 if (IS_NULL(cc->mbuf)) {
1283 r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_NOT : OP_CCLASS);
1284 if (r != 0) return r;
1285
1286 COP(reg)->cclass.bsp = xmalloc(SIZE_BITSET);
1287 CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass.bsp);
1288 xmemcpy(COP(reg)->cclass.bsp, cc->bs, SIZE_BITSET);
1289 }
1290 else {
1291 void* p;
1292
1293 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
1294 r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MB_NOT : OP_CCLASS_MB);
1295 if (r != 0) return r;
1296
1297 p = set_multi_byte_cclass(cc->mbuf, reg);
1298 CHECK_NULL_RETURN_MEMERR(p);
1299 COP(reg)->cclass_mb.mb = p;
1300 }
1301 else {
1302 r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MIX_NOT : OP_CCLASS_MIX);
1303 if (r != 0) return r;
1304
1305 COP(reg)->cclass_mix.bsp = xmalloc(SIZE_BITSET);
1306 CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass_mix.bsp);
1307 xmemcpy(COP(reg)->cclass_mix.bsp, cc->bs, SIZE_BITSET);
1308
1309 p = set_multi_byte_cclass(cc->mbuf, reg);
1310 CHECK_NULL_RETURN_MEMERR(p);
1311 COP(reg)->cclass_mix.mb = p;
1312 }
1313 }
1314
1315 return 0;
1316 }
1317
1318 static void
set_addr_in_repeat_range(regex_t * reg)1319 set_addr_in_repeat_range(regex_t* reg)
1320 {
1321 int i;
1322
1323 for (i = 0; i < reg->num_repeat; i++) {
1324 RepeatRange* p = reg->repeat_range + i;
1325 int offset = p->u.offset;
1326 p->u.pcode = reg->ops + offset;
1327 }
1328 }
1329
1330 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper,int ops_index)1331 entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index)
1332 {
1333 #define REPEAT_RANGE_ALLOC 4
1334
1335 RepeatRange* p;
1336
1337 if (reg->repeat_range_alloc == 0) {
1338 p = (RepeatRange* )xmalloc(sizeof(RepeatRange) * REPEAT_RANGE_ALLOC);
1339 CHECK_NULL_RETURN_MEMERR(p);
1340 reg->repeat_range = p;
1341 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
1342 }
1343 else if (reg->repeat_range_alloc <= id) {
1344 int n;
1345 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
1346 p = (RepeatRange* )xrealloc(reg->repeat_range, sizeof(RepeatRange) * n);
1347 CHECK_NULL_RETURN_MEMERR(p);
1348 reg->repeat_range = p;
1349 reg->repeat_range_alloc = n;
1350 }
1351 else {
1352 p = reg->repeat_range;
1353 }
1354
1355 p[id].lower = lower;
1356 p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper);
1357 p[id].u.offset = ops_index;
1358 return 0;
1359 }
1360
1361 static int
compile_range_repeat_node(QuantNode * qn,int target_len,int emptiness,regex_t * reg,ParseEnv * env)1362 compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness,
1363 regex_t* reg, ParseEnv* env)
1364 {
1365 int r;
1366 int num_repeat = reg->num_repeat++;
1367
1368 r = add_op(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
1369 if (r != 0) return r;
1370
1371 COP(reg)->repeat.id = num_repeat;
1372 COP(reg)->repeat.addr = SIZE_INC + target_len + OPSIZE_REPEAT_INC;
1373
1374 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper,
1375 COP_CURR_OFFSET(reg) + OPSIZE_REPEAT);
1376 if (r != 0) return r;
1377
1378 r = compile_quant_body_with_empty_check(qn, reg, env);
1379 if (r != 0) return r;
1380
1381 r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
1382 if (r != 0) return r;
1383
1384 COP(reg)->repeat_inc.id = num_repeat;
1385 return r;
1386 }
1387
1388 static int
is_anychar_infinite_greedy(QuantNode * qn)1389 is_anychar_infinite_greedy(QuantNode* qn)
1390 {
1391 if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) &&
1392 NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn)))
1393 return 1;
1394 else
1395 return 0;
1396 }
1397
1398 #define QUANTIFIER_EXPAND_LIMIT_SIZE 10
1399 #define CKN_ON (ckn > 0)
1400
1401 static int
compile_length_quantifier_node(QuantNode * qn,regex_t * reg)1402 compile_length_quantifier_node(QuantNode* qn, regex_t* reg)
1403 {
1404 int len, mod_tlen;
1405 int infinite = IS_INFINITE_REPEAT(qn->upper);
1406 enum BodyEmptyType emptiness = qn->emptiness;
1407 int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1408
1409 if (tlen < 0) return tlen;
1410 if (tlen == 0) return 0;
1411
1412 /* anychar repeat */
1413 if (is_anychar_infinite_greedy(qn)) {
1414 if (qn->lower <= 1 ||
1415 len_multiply_cmp((OnigLen )tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) {
1416 if (IS_NOT_NULL(qn->next_head_exact))
1417 return OPSIZE_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
1418 else
1419 return OPSIZE_ANYCHAR_STAR + tlen * qn->lower;
1420 }
1421 }
1422
1423 mod_tlen = tlen;
1424 if (emptiness != BODY_IS_NOT_EMPTY)
1425 mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END;
1426
1427 if (infinite &&
1428 (qn->lower <= 1 ||
1429 len_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1430 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1431 len = OPSIZE_JUMP;
1432 }
1433 else {
1434 len = tlen * qn->lower;
1435 }
1436
1437 if (qn->greedy) {
1438 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1439 if (IS_NOT_NULL(qn->head_exact))
1440 len += OPSIZE_PUSH_OR_JUMP_EXACT1 + mod_tlen + OPSIZE_JUMP;
1441 else
1442 #endif
1443 if (IS_NOT_NULL(qn->next_head_exact))
1444 len += OPSIZE_PUSH_IF_PEEK_NEXT + mod_tlen + OPSIZE_JUMP;
1445 else
1446 len += OPSIZE_PUSH + mod_tlen + OPSIZE_JUMP;
1447 }
1448 else
1449 len += OPSIZE_JUMP + mod_tlen + OPSIZE_PUSH;
1450 }
1451 else if (qn->upper == 0) {
1452 if (qn->include_referred != 0) { /* /(?<n>..){0}/ */
1453 len = OPSIZE_JUMP + tlen;
1454 }
1455 else
1456 len = 0;
1457 }
1458 else if (!infinite && qn->greedy &&
1459 (qn->upper == 1 ||
1460 len_multiply_cmp((OnigLen )tlen + OPSIZE_PUSH, qn->upper,
1461 QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1462 len = tlen * qn->lower;
1463 len += (OPSIZE_PUSH + tlen) * (qn->upper - qn->lower);
1464 }
1465 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1466 len = OPSIZE_PUSH + OPSIZE_JUMP + tlen;
1467 }
1468 else {
1469 len = OPSIZE_REPEAT_INC + mod_tlen + OPSIZE_REPEAT;
1470 }
1471
1472 return len;
1473 }
1474
1475 static int
compile_quantifier_node(QuantNode * qn,regex_t * reg,ParseEnv * env)1476 compile_quantifier_node(QuantNode* qn, regex_t* reg, ParseEnv* env)
1477 {
1478 int i, r, mod_tlen;
1479 int infinite = IS_INFINITE_REPEAT(qn->upper);
1480 enum BodyEmptyType emptiness = qn->emptiness;
1481 int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1482
1483 if (tlen < 0) return tlen;
1484 if (tlen == 0) return 0;
1485
1486 if (is_anychar_infinite_greedy(qn) &&
1487 (qn->lower <= 1 ||
1488 len_multiply_cmp((OnigLen )tlen, qn->lower,
1489 QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1490 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1491 if (r != 0) return r;
1492 if (IS_NOT_NULL(qn->next_head_exact)) {
1493 r = add_op(reg, NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)) ?
1494 OP_ANYCHAR_ML_STAR_PEEK_NEXT : OP_ANYCHAR_STAR_PEEK_NEXT);
1495 if (r != 0) return r;
1496
1497 COP(reg)->anychar_star_peek_next.c = STR_(qn->next_head_exact)->s[0];
1498 return 0;
1499 }
1500 else {
1501 r = add_op(reg, NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)) ?
1502 OP_ANYCHAR_ML_STAR : OP_ANYCHAR_STAR);
1503 return r;
1504 }
1505 }
1506
1507 mod_tlen = tlen;
1508 if (emptiness != BODY_IS_NOT_EMPTY)
1509 mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END;
1510
1511 if (infinite &&
1512 (qn->lower <= 1 ||
1513 len_multiply_cmp((OnigLen )tlen, qn->lower,
1514 QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1515 int addr;
1516
1517 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1518 r = add_op(reg, OP_JUMP);
1519 if (r != 0) return r;
1520 if (qn->greedy) {
1521 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1522 if (IS_NOT_NULL(qn->head_exact))
1523 COP(reg)->jump.addr = OPSIZE_PUSH_OR_JUMP_EXACT1 + SIZE_INC;
1524 else
1525 #endif
1526 if (IS_NOT_NULL(qn->next_head_exact))
1527 COP(reg)->jump.addr = OPSIZE_PUSH_IF_PEEK_NEXT + SIZE_INC;
1528 else
1529 COP(reg)->jump.addr = OPSIZE_PUSH + SIZE_INC;
1530 }
1531 else {
1532 COP(reg)->jump.addr = OPSIZE_JUMP + SIZE_INC;
1533 }
1534 }
1535 else {
1536 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1537 if (r != 0) return r;
1538 }
1539
1540 if (qn->greedy) {
1541 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1542 if (IS_NOT_NULL(qn->head_exact)) {
1543 r = add_op(reg, OP_PUSH_OR_JUMP_EXACT1);
1544 if (r != 0) return r;
1545 COP(reg)->push_or_jump_exact1.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1546 COP(reg)->push_or_jump_exact1.c = STR_(qn->head_exact)->s[0];
1547
1548 r = compile_quant_body_with_empty_check(qn, reg, env);
1549 if (r != 0) return r;
1550
1551 addr = -(mod_tlen + (int )OPSIZE_PUSH_OR_JUMP_EXACT1);
1552 }
1553 else
1554 #endif
1555 if (IS_NOT_NULL(qn->next_head_exact)) {
1556 r = add_op(reg, OP_PUSH_IF_PEEK_NEXT);
1557 if (r != 0) return r;
1558 COP(reg)->push_if_peek_next.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1559 COP(reg)->push_if_peek_next.c = STR_(qn->next_head_exact)->s[0];
1560
1561 r = compile_quant_body_with_empty_check(qn, reg, env);
1562 if (r != 0) return r;
1563
1564 addr = -(mod_tlen + (int )OPSIZE_PUSH_IF_PEEK_NEXT);
1565 }
1566 else {
1567 r = add_op(reg, OP_PUSH);
1568 if (r != 0) return r;
1569 COP(reg)->push.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1570
1571 r = compile_quant_body_with_empty_check(qn, reg, env);
1572 if (r != 0) return r;
1573
1574 addr = -(mod_tlen + (int )OPSIZE_PUSH);
1575 }
1576
1577 r = add_op(reg, OP_JUMP);
1578 if (r != 0) return r;
1579 COP(reg)->jump.addr = addr;
1580 }
1581 else {
1582 r = add_op(reg, OP_JUMP);
1583 if (r != 0) return r;
1584 COP(reg)->jump.addr = mod_tlen + SIZE_INC;
1585
1586 r = compile_quant_body_with_empty_check(qn, reg, env);
1587 if (r != 0) return r;
1588
1589 r = add_op(reg, OP_PUSH);
1590 if (r != 0) return r;
1591 COP(reg)->push.addr = -mod_tlen;
1592 }
1593 }
1594 else if (qn->upper == 0) {
1595 if (qn->include_referred != 0) { /* /(?<n>..){0}/ */
1596 r = add_op(reg, OP_JUMP);
1597 if (r != 0) return r;
1598 COP(reg)->jump.addr = tlen + SIZE_INC;
1599
1600 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1601 }
1602 else {
1603 /* Nothing output */
1604 r = 0;
1605 }
1606 }
1607 else if (! infinite && qn->greedy &&
1608 (qn->upper == 1 ||
1609 len_multiply_cmp((OnigLen )tlen + OPSIZE_PUSH, qn->upper,
1610 QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1611 int n = qn->upper - qn->lower;
1612
1613 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1614 if (r != 0) return r;
1615
1616 for (i = 0; i < n; i++) {
1617 int v = onig_positive_int_multiply(n - i, tlen + OPSIZE_PUSH);
1618 if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1619
1620 r = add_op(reg, OP_PUSH);
1621 if (r != 0) return r;
1622 COP(reg)->push.addr = v;
1623
1624 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1625 if (r != 0) return r;
1626 }
1627 }
1628 else if (! qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1629 r = add_op(reg, OP_PUSH);
1630 if (r != 0) return r;
1631 COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP;
1632
1633 r = add_op(reg, OP_JUMP);
1634 if (r != 0) return r;
1635 COP(reg)->jump.addr = tlen + SIZE_INC;
1636
1637 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1638 }
1639 else {
1640 r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env);
1641 }
1642 return r;
1643 }
1644
1645 static int
compile_length_option_node(BagNode * node,regex_t * reg)1646 compile_length_option_node(BagNode* node, regex_t* reg)
1647 {
1648 int tlen;
1649
1650 tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1651
1652 return tlen;
1653 }
1654
1655 static int
compile_option_node(BagNode * node,regex_t * reg,ParseEnv * env)1656 compile_option_node(BagNode* node, regex_t* reg, ParseEnv* env)
1657 {
1658 int r;
1659
1660 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1661
1662 return r;
1663 }
1664
1665 static int
compile_length_bag_node(BagNode * node,regex_t * reg)1666 compile_length_bag_node(BagNode* node, regex_t* reg)
1667 {
1668 int len;
1669 int tlen;
1670
1671 if (node->type == BAG_OPTION)
1672 return compile_length_option_node(node, reg);
1673
1674 if (NODE_BAG_BODY(node)) {
1675 tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1676 if (tlen < 0) return tlen;
1677 }
1678 else
1679 tlen = 0;
1680
1681 switch (node->type) {
1682 case BAG_MEMORY:
1683 #ifdef USE_CALL
1684
1685 if (node->m.regnum == 0 && NODE_IS_CALLED(node)) {
1686 len = tlen + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN;
1687 return len;
1688 }
1689
1690 if (NODE_IS_CALLED(node)) {
1691 len = OPSIZE_MEM_START_PUSH + tlen
1692 + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN;
1693 if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1694 len += (NODE_IS_RECURSION(node)
1695 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH);
1696 else
1697 len += (NODE_IS_RECURSION(node)
1698 ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END);
1699 }
1700 else if (NODE_IS_RECURSION(node)) {
1701 len = OPSIZE_MEM_START_PUSH;
1702 len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)
1703 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_REC);
1704 }
1705 else
1706 #endif
1707 {
1708 if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum))
1709 len = OPSIZE_MEM_START_PUSH;
1710 else
1711 len = OPSIZE_MEM_START;
1712
1713 len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)
1714 ? OPSIZE_MEM_END_PUSH : OPSIZE_MEM_END);
1715 }
1716 break;
1717
1718 case BAG_STOP_BACKTRACK:
1719 if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1720 int v;
1721 QuantNode* qn;
1722
1723 qn = QUANT_(NODE_BAG_BODY(node));
1724 tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1725 if (tlen < 0) return tlen;
1726
1727 v = onig_positive_int_multiply(qn->lower, tlen);
1728 if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1729 len = v + OPSIZE_PUSH + tlen + OPSIZE_POP + OPSIZE_JUMP;
1730 }
1731 else {
1732 len = OPSIZE_MARK + tlen + OPSIZE_CUT_TO_MARK;
1733 }
1734 break;
1735
1736 case BAG_IF_ELSE:
1737 {
1738 Node* cond = NODE_BAG_BODY(node);
1739 Node* Then = node->te.Then;
1740 Node* Else = node->te.Else;
1741
1742 len = compile_length_tree(cond, reg);
1743 if (len < 0) return len;
1744 len += OPSIZE_PUSH + OPSIZE_MARK + OPSIZE_CUT_TO_MARK;
1745
1746 if (IS_NOT_NULL(Then)) {
1747 tlen = compile_length_tree(Then, reg);
1748 if (tlen < 0) return tlen;
1749 len += tlen;
1750 }
1751
1752 len += OPSIZE_JUMP + OPSIZE_CUT_TO_MARK;
1753
1754 if (IS_NOT_NULL(Else)) {
1755 tlen = compile_length_tree(Else, reg);
1756 if (tlen < 0) return tlen;
1757 len += tlen;
1758 }
1759 }
1760 break;
1761
1762 case BAG_OPTION:
1763 /* never come here, but set for escape warning */
1764 len = 0;
1765 break;
1766 }
1767
1768 return len;
1769 }
1770
1771 static int
compile_bag_memory_node(BagNode * node,regex_t * reg,ParseEnv * env)1772 compile_bag_memory_node(BagNode* node, regex_t* reg, ParseEnv* env)
1773 {
1774 int r;
1775
1776 #ifdef USE_CALL
1777 if (NODE_IS_CALLED(node)) {
1778 int len;
1779
1780 r = add_op(reg, OP_CALL);
1781 if (r != 0) return r;
1782
1783 node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + OPSIZE_JUMP;
1784 NODE_STATUS_ADD(node, FIXED_ADDR);
1785 COP(reg)->call.addr = (int )node->m.called_addr;
1786
1787 if (node->m.regnum == 0) {
1788 len = compile_length_tree(NODE_BAG_BODY(node), reg);
1789 len += OPSIZE_RETURN;
1790
1791 r = add_op(reg, OP_JUMP);
1792 if (r != 0) return r;
1793 COP(reg)->jump.addr = len + SIZE_INC;
1794
1795 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1796 if (r != 0) return r;
1797
1798 r = add_op(reg, OP_RETURN);
1799 return r;
1800 }
1801 else {
1802 len = compile_length_tree(NODE_BAG_BODY(node), reg);
1803 len += (OPSIZE_MEM_START_PUSH + OPSIZE_RETURN);
1804 if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1805 len += (NODE_IS_RECURSION(node)
1806 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH);
1807 else
1808 len += (NODE_IS_RECURSION(node) ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END);
1809
1810 r = add_op(reg, OP_JUMP);
1811 if (r != 0) return r;
1812 COP(reg)->jump.addr = len + SIZE_INC;
1813 }
1814 }
1815 #endif
1816
1817 if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum))
1818 r = add_op(reg, OP_MEM_START_PUSH);
1819 else
1820 r = add_op(reg, OP_MEM_START);
1821 if (r != 0) return r;
1822 COP(reg)->memory_start.num = node->m.regnum;
1823
1824 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1825 if (r != 0) return r;
1826
1827 #ifdef USE_CALL
1828 if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1829 r = add_op(reg, (NODE_IS_RECURSION(node)
1830 ? OP_MEM_END_PUSH_REC : OP_MEM_END_PUSH));
1831 else
1832 r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEM_END_REC : OP_MEM_END));
1833 if (r != 0) return r;
1834 COP(reg)->memory_end.num = node->m.regnum;
1835
1836 if (NODE_IS_CALLED(node)) {
1837 r = add_op(reg, OP_RETURN);
1838 }
1839 #else
1840 if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1841 r = add_op(reg, OP_MEM_END_PUSH);
1842 else
1843 r = add_op(reg, OP_MEM_END);
1844 if (r != 0) return r;
1845 COP(reg)->memory_end.num = node->m.regnum;
1846 #endif
1847
1848 return r;
1849 }
1850
1851 static int
compile_bag_node(BagNode * node,regex_t * reg,ParseEnv * env)1852 compile_bag_node(BagNode* node, regex_t* reg, ParseEnv* env)
1853 {
1854 int r, len;
1855
1856 switch (node->type) {
1857 case BAG_MEMORY:
1858 r = compile_bag_memory_node(node, reg, env);
1859 break;
1860
1861 case BAG_OPTION:
1862 r = compile_option_node(node, reg, env);
1863 break;
1864
1865 case BAG_STOP_BACKTRACK:
1866 if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1867 QuantNode* qn = QUANT_(NODE_BAG_BODY(node));
1868 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1869 if (r != 0) return r;
1870
1871 len = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1872 if (len < 0) return len;
1873
1874 r = add_op(reg, OP_PUSH);
1875 if (r != 0) return r;
1876 COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP + OPSIZE_JUMP;
1877
1878 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1879 if (r != 0) return r;
1880 r = add_op(reg, OP_POP);
1881 if (r != 0) return r;
1882
1883 r = add_op(reg, OP_JUMP);
1884 if (r != 0) return r;
1885 COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP);
1886 }
1887 else {
1888 MemNumType mid;
1889
1890 ID_ENTRY(env, mid);
1891 r = add_op(reg, OP_MARK);
1892 if (r != 0) return r;
1893 COP(reg)->mark.id = mid;
1894 COP(reg)->mark.save_pos = 0;
1895
1896 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1897 if (r != 0) return r;
1898 r = add_op(reg, OP_CUT_TO_MARK);
1899 if (r != 0) return r;
1900 COP(reg)->cut_to_mark.id = mid;
1901 COP(reg)->cut_to_mark.restore_pos = 0;
1902 }
1903 break;
1904
1905 case BAG_IF_ELSE:
1906 {
1907 int cond_len, then_len, else_len, jump_len;
1908 MemNumType mid;
1909 Node* cond = NODE_BAG_BODY(node);
1910 Node* Then = node->te.Then;
1911 Node* Else = node->te.Else;
1912
1913 ID_ENTRY(env, mid);
1914
1915 r = add_op(reg, OP_MARK);
1916 if (r != 0) return r;
1917 COP(reg)->mark.id = mid;
1918 COP(reg)->mark.save_pos = 0;
1919
1920 cond_len = compile_length_tree(cond, reg);
1921 if (cond_len < 0) return cond_len;
1922 if (IS_NOT_NULL(Then)) {
1923 then_len = compile_length_tree(Then, reg);
1924 if (then_len < 0) return then_len;
1925 }
1926 else
1927 then_len = 0;
1928
1929 jump_len = cond_len + then_len + OPSIZE_CUT_TO_MARK + OPSIZE_JUMP;
1930
1931 r = add_op(reg, OP_PUSH);
1932 if (r != 0) return r;
1933 COP(reg)->push.addr = SIZE_INC + jump_len;
1934
1935 r = compile_tree(cond, reg, env);
1936 if (r != 0) return r;
1937 r = add_op(reg, OP_CUT_TO_MARK);
1938 if (r != 0) return r;
1939 COP(reg)->cut_to_mark.id = mid;
1940 COP(reg)->cut_to_mark.restore_pos = 0;
1941
1942 if (IS_NOT_NULL(Then)) {
1943 r = compile_tree(Then, reg, env);
1944 if (r != 0) return r;
1945 }
1946
1947 if (IS_NOT_NULL(Else)) {
1948 else_len = compile_length_tree(Else, reg);
1949 if (else_len < 0) return else_len;
1950 }
1951 else
1952 else_len = 0;
1953
1954 r = add_op(reg, OP_JUMP);
1955 if (r != 0) return r;
1956 COP(reg)->jump.addr = OPSIZE_CUT_TO_MARK + else_len + SIZE_INC;
1957
1958 r = add_op(reg, OP_CUT_TO_MARK);
1959 if (r != 0) return r;
1960 COP(reg)->cut_to_mark.id = mid;
1961 COP(reg)->cut_to_mark.restore_pos = 0;
1962
1963 if (IS_NOT_NULL(Else)) {
1964 r = compile_tree(Else, reg, env);
1965 }
1966 }
1967 break;
1968 }
1969
1970 return r;
1971 }
1972
1973 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1974 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1975 {
1976 int len;
1977 int tlen = 0;
1978
1979 if (IS_NOT_NULL(NODE_ANCHOR_BODY(node))) {
1980 tlen = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1981 if (tlen < 0) return tlen;
1982 }
1983
1984 switch (node->type) {
1985 case ANCR_PREC_READ:
1986 len = OPSIZE_MARK + tlen + OPSIZE_CUT_TO_MARK;
1987 break;
1988 case ANCR_PREC_READ_NOT:
1989 len = OPSIZE_PUSH + OPSIZE_MARK + tlen + OPSIZE_POP_TO_MARK + OPSIZE_POP + OPSIZE_FAIL;
1990 break;
1991 case ANCR_LOOK_BEHIND:
1992 if (node->char_min_len == node->char_max_len)
1993 len = OPSIZE_MARK + OPSIZE_STEP_BACK_START + tlen + OPSIZE_CUT_TO_MARK;
1994 else {
1995 len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_UPDATE_VAR + OPSIZE_FAIL + OPSIZE_JUMP + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_CUT_TO_MARK + OPSIZE_UPDATE_VAR;
1996
1997 if (IS_NOT_NULL(node->lead_node)) {
1998 int llen = compile_length_tree(node->lead_node, reg);
1999 if (llen < 0) return llen;
2000
2001 len += OPSIZE_MOVE + llen;
2002 }
2003 }
2004 break;
2005 case ANCR_LOOK_BEHIND_NOT:
2006 if (node->char_min_len == node->char_max_len)
2007 len = OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + tlen + OPSIZE_POP_TO_MARK + OPSIZE_FAIL + OPSIZE_POP;
2008 else {
2009 len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_POP;
2010
2011 if (IS_NOT_NULL(node->lead_node)) {
2012 int llen = compile_length_tree(node->lead_node, reg);
2013 if (llen < 0) return llen;
2014
2015 len += OPSIZE_MOVE + llen;
2016 }
2017 }
2018 break;
2019
2020 case ANCR_WORD_BOUNDARY:
2021 case ANCR_NO_WORD_BOUNDARY:
2022 #ifdef USE_WORD_BEGIN_END
2023 case ANCR_WORD_BEGIN:
2024 case ANCR_WORD_END:
2025 #endif
2026 len = OPSIZE_WORD_BOUNDARY;
2027 break;
2028
2029 case ANCR_TEXT_SEGMENT_BOUNDARY:
2030 case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
2031 len = SIZE_OPCODE;
2032 break;
2033
2034 default:
2035 len = SIZE_OPCODE;
2036 break;
2037 }
2038
2039 return len;
2040 }
2041
2042 static int
compile_anchor_look_behind_node(AnchorNode * node,regex_t * reg,ParseEnv * env)2043 compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ParseEnv* env)
2044 {
2045 int r;
2046
2047 if (node->char_min_len == node->char_max_len) {
2048 MemNumType mid;
2049
2050 ID_ENTRY(env, mid);
2051 r = add_op(reg, OP_MARK);
2052 if (r != 0) return r;
2053 COP(reg)->mark.id = mid;
2054 COP(reg)->mark.save_pos = FALSE;
2055
2056 r = add_op(reg, OP_STEP_BACK_START);
2057 if (r != 0) return r;
2058 COP(reg)->step_back_start.initial = node->char_min_len;
2059 COP(reg)->step_back_start.remaining = 0;
2060 COP(reg)->step_back_start.addr = 1;
2061
2062 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2063 if (r != 0) return r;
2064
2065 r = add_op(reg, OP_CUT_TO_MARK);
2066 if (r != 0) return r;
2067 COP(reg)->cut_to_mark.id = mid;
2068 COP(reg)->cut_to_mark.restore_pos = FALSE;
2069 }
2070 else {
2071 MemNumType mid1, mid2;
2072 OnigLen diff;
2073
2074 if (IS_NOT_NULL(node->lead_node)) {
2075 MinMaxCharLen ci;
2076
2077 r = node_char_len(node->lead_node, reg, &ci, env);
2078 if (r < 0) return r;
2079 r = add_op(reg, OP_MOVE);
2080 if (r != 0) return r;
2081 COP(reg)->move.n = -((RelPositionType )ci.min);
2082 r = compile_tree(node->lead_node, reg, env);
2083 if (r != 0) return r;
2084 }
2085
2086 ID_ENTRY(env, mid1);
2087 r = add_op(reg, OP_SAVE_VAL);
2088 if (r != 0) return r;
2089 COP(reg)->save_val.type = SAVE_RIGHT_RANGE;
2090 COP(reg)->save_val.id = mid1;
2091
2092 r = add_op(reg, OP_UPDATE_VAR);
2093 if (r != 0) return r;
2094 COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_TO_S;
2095
2096 ID_ENTRY(env, mid2);
2097 r = add_op(reg, OP_MARK);
2098 if (r != 0) return r;
2099 COP(reg)->mark.id = mid2;
2100 COP(reg)->mark.save_pos = FALSE;
2101
2102 r = add_op(reg, OP_PUSH);
2103 if (r != 0) return r;
2104 COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP;
2105
2106 r = add_op(reg, OP_JUMP);
2107 if (r != 0) return r;
2108 COP(reg)->jump.addr = SIZE_INC + OPSIZE_UPDATE_VAR + OPSIZE_FAIL;
2109
2110 r = add_op(reg, OP_UPDATE_VAR);
2111 if (r != 0) return r;
2112 COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2113 COP(reg)->update_var.id = mid1;
2114 COP(reg)->update_var.clear = FALSE;
2115 r = add_op(reg, OP_FAIL);
2116 if (r != 0) return r;
2117
2118 r = add_op(reg, OP_STEP_BACK_START);
2119 if (r != 0) return r;
2120
2121 if (node->char_max_len != INFINITE_LEN)
2122 diff = node->char_max_len - node->char_min_len;
2123 else
2124 diff = INFINITE_LEN;
2125
2126 COP(reg)->step_back_start.initial = node->char_min_len;
2127 COP(reg)->step_back_start.remaining = diff;
2128 COP(reg)->step_back_start.addr = 2;
2129
2130 r = add_op(reg, OP_STEP_BACK_NEXT);
2131 if (r != 0) return r;
2132
2133 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2134 if (r != 0) return r;
2135
2136 r = add_op(reg, OP_CHECK_POSITION);
2137 if (r != 0) return r;
2138 COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE;
2139
2140 r = add_op(reg, OP_CUT_TO_MARK);
2141 if (r != 0) return r;
2142 COP(reg)->cut_to_mark.id = mid2;
2143 COP(reg)->cut_to_mark.restore_pos = FALSE;
2144
2145 r = add_op(reg, OP_UPDATE_VAR);
2146 if (r != 0) return r;
2147 COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2148 COP(reg)->update_var.id = mid1;
2149 COP(reg)->update_var.clear = TRUE;
2150 }
2151
2152 return r;
2153 }
2154
2155 static int
compile_anchor_look_behind_not_node(AnchorNode * node,regex_t * reg,ParseEnv * env)2156 compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg,
2157 ParseEnv* env)
2158 {
2159 int r;
2160 int len;
2161
2162 len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
2163
2164 if (node->char_min_len == node->char_max_len) {
2165 MemNumType mid;
2166
2167 ID_ENTRY(env, mid);
2168 r = add_op(reg, OP_MARK);
2169 if (r != 0) return r;
2170 COP(reg)->mark.id = mid;
2171 COP(reg)->mark.save_pos = FALSE;
2172
2173 r = add_op(reg, OP_PUSH);
2174 if (r != 0) return r;
2175 COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + len + OPSIZE_POP_TO_MARK + OPSIZE_FAIL;
2176
2177 r = add_op(reg, OP_STEP_BACK_START);
2178 if (r != 0) return r;
2179 COP(reg)->step_back_start.initial = node->char_min_len;
2180 COP(reg)->step_back_start.remaining = 0;
2181 COP(reg)->step_back_start.addr = 1;
2182
2183 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2184 if (r != 0) return r;
2185
2186 r = add_op(reg, OP_POP_TO_MARK);
2187 if (r != 0) return r;
2188 COP(reg)->pop_to_mark.id = mid;
2189 r = add_op(reg, OP_FAIL);
2190 if (r != 0) return r;
2191 r = add_op(reg, OP_POP);
2192 }
2193 else {
2194 MemNumType mid1, mid2;
2195 OnigLen diff;
2196
2197 ID_ENTRY(env, mid1);
2198 r = add_op(reg, OP_SAVE_VAL);
2199 if (r != 0) return r;
2200 COP(reg)->save_val.type = SAVE_RIGHT_RANGE;
2201 COP(reg)->save_val.id = mid1;
2202
2203 r = add_op(reg, OP_UPDATE_VAR);
2204 if (r != 0) return r;
2205 COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_TO_S;
2206
2207 ID_ENTRY(env, mid2);
2208 r = add_op(reg, OP_MARK);
2209 if (r != 0) return r;
2210 COP(reg)->mark.id = mid2;
2211 COP(reg)->mark.save_pos = FALSE;
2212
2213 r = add_op(reg, OP_PUSH);
2214 if (r != 0) return r;
2215 COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + len + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL;
2216
2217 if (IS_NOT_NULL(node->lead_node)) {
2218 int clen;
2219 MinMaxCharLen ci;
2220
2221 clen = compile_length_tree(node->lead_node, reg);
2222 COP(reg)->push.addr += OPSIZE_MOVE + clen;
2223
2224 r = node_char_len(node->lead_node, reg, &ci, env);
2225 if (r < 0) return r;
2226 r = add_op(reg, OP_MOVE);
2227 if (r != 0) return r;
2228 COP(reg)->move.n = -((RelPositionType )ci.min);
2229
2230 r = compile_tree(node->lead_node, reg, env);
2231 if (r != 0) return r;
2232 }
2233
2234 r = add_op(reg, OP_STEP_BACK_START);
2235 if (r != 0) return r;
2236
2237 if (node->char_max_len != INFINITE_LEN)
2238 diff = node->char_max_len - node->char_min_len;
2239 else
2240 diff = INFINITE_LEN;
2241
2242 COP(reg)->step_back_start.initial = node->char_min_len;
2243 COP(reg)->step_back_start.remaining = diff;
2244 COP(reg)->step_back_start.addr = 2;
2245
2246 r = add_op(reg, OP_STEP_BACK_NEXT);
2247 if (r != 0) return r;
2248
2249 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2250 if (r != 0) return r;
2251
2252 r = add_op(reg, OP_CHECK_POSITION);
2253 if (r != 0) return r;
2254 COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE;
2255
2256 r = add_op(reg, OP_POP_TO_MARK);
2257 if (r != 0) return r;
2258 COP(reg)->pop_to_mark.id = mid2;
2259
2260 r = add_op(reg, OP_UPDATE_VAR);
2261 if (r != 0) return r;
2262 COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2263 COP(reg)->update_var.id = mid1;
2264 COP(reg)->update_var.clear = FALSE;
2265
2266 r = add_op(reg, OP_POP); /* pop save val */
2267 if (r != 0) return r;
2268 r = add_op(reg, OP_FAIL);
2269 if (r != 0) return r;
2270
2271 r = add_op(reg, OP_UPDATE_VAR);
2272 if (r != 0) return r;
2273 COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2274 COP(reg)->update_var.id = mid1;
2275 COP(reg)->update_var.clear = FALSE;
2276
2277 r = add_op(reg, OP_POP); /* pop mark */
2278 if (r != 0) return r;
2279 r = add_op(reg, OP_POP); /* pop save val */
2280 }
2281
2282 return r;
2283 }
2284
2285 static int
compile_anchor_node(AnchorNode * node,regex_t * reg,ParseEnv * env)2286 compile_anchor_node(AnchorNode* node, regex_t* reg, ParseEnv* env)
2287 {
2288 int r, len;
2289 enum OpCode op;
2290 MemNumType mid;
2291
2292 switch (node->type) {
2293 case ANCR_BEGIN_BUF: r = add_op(reg, OP_BEGIN_BUF); break;
2294 case ANCR_END_BUF: r = add_op(reg, OP_END_BUF); break;
2295 case ANCR_BEGIN_LINE: r = add_op(reg, OP_BEGIN_LINE); break;
2296 case ANCR_END_LINE: r = add_op(reg, OP_END_LINE); break;
2297 case ANCR_SEMI_END_BUF: r = add_op(reg, OP_SEMI_END_BUF); break;
2298 case ANCR_BEGIN_POSITION:
2299 r = add_op(reg, OP_CHECK_POSITION);
2300 if (r != 0) return r;
2301 COP(reg)->check_position.type = CHECK_POSITION_SEARCH_START;
2302 break;
2303
2304 case ANCR_WORD_BOUNDARY:
2305 op = OP_WORD_BOUNDARY;
2306 word:
2307 r = add_op(reg, op);
2308 if (r != 0) return r;
2309 COP(reg)->word_boundary.mode = (ModeType )node->ascii_mode;
2310 break;
2311
2312 case ANCR_NO_WORD_BOUNDARY:
2313 op = OP_NO_WORD_BOUNDARY; goto word;
2314 break;
2315 #ifdef USE_WORD_BEGIN_END
2316 case ANCR_WORD_BEGIN:
2317 op = OP_WORD_BEGIN; goto word;
2318 break;
2319 case ANCR_WORD_END:
2320 op = OP_WORD_END; goto word;
2321 break;
2322 #endif
2323
2324 case ANCR_TEXT_SEGMENT_BOUNDARY:
2325 case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
2326 {
2327 enum TextSegmentBoundaryType type;
2328
2329 r = add_op(reg, OP_TEXT_SEGMENT_BOUNDARY);
2330 if (r != 0) return r;
2331
2332 type = EXTENDED_GRAPHEME_CLUSTER_BOUNDARY;
2333 #ifdef USE_UNICODE_WORD_BREAK
2334 if (NODE_IS_TEXT_SEGMENT_WORD(node))
2335 type = WORD_BOUNDARY;
2336 #endif
2337
2338 COP(reg)->text_segment_boundary.type = type;
2339 COP(reg)->text_segment_boundary.not =
2340 (node->type == ANCR_NO_TEXT_SEGMENT_BOUNDARY ? 1 : 0);
2341 }
2342 break;
2343
2344 case ANCR_PREC_READ:
2345 {
2346 ID_ENTRY(env, mid);
2347 r = add_op(reg, OP_MARK);
2348 if (r != 0) return r;
2349 COP(reg)->mark.id = mid;
2350 COP(reg)->mark.save_pos = TRUE;
2351
2352 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2353 if (r != 0) return r;
2354
2355 r = add_op(reg, OP_CUT_TO_MARK);
2356 if (r != 0) return r;
2357 COP(reg)->cut_to_mark.id = mid;
2358 COP(reg)->cut_to_mark.restore_pos = TRUE;
2359 }
2360 break;
2361
2362 case ANCR_PREC_READ_NOT:
2363 {
2364 len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
2365 if (len < 0) return len;
2366
2367 ID_ENTRY(env, mid);
2368 r = add_op(reg, OP_PUSH);
2369 if (r != 0) return r;
2370 COP(reg)->push.addr = SIZE_INC + OPSIZE_MARK + len +
2371 OPSIZE_POP_TO_MARK + OPSIZE_POP + OPSIZE_FAIL;
2372
2373 r = add_op(reg, OP_MARK);
2374 if (r != 0) return r;
2375 COP(reg)->mark.id = mid;
2376 COP(reg)->mark.save_pos = FALSE;
2377
2378 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2379 if (r != 0) return r;
2380
2381 r = add_op(reg, OP_POP_TO_MARK);
2382 if (r != 0) return r;
2383 COP(reg)->pop_to_mark.id = mid;
2384
2385 r = add_op(reg, OP_POP);
2386 if (r != 0) return r;
2387 r = add_op(reg, OP_FAIL);
2388 }
2389 break;
2390
2391 case ANCR_LOOK_BEHIND:
2392 r = compile_anchor_look_behind_node(node, reg, env);
2393 break;
2394
2395 case ANCR_LOOK_BEHIND_NOT:
2396 r = compile_anchor_look_behind_not_node(node, reg, env);
2397 break;
2398
2399 default:
2400 return ONIGERR_TYPE_BUG;
2401 break;
2402 }
2403
2404 return r;
2405 }
2406
2407 static int
compile_gimmick_node(GimmickNode * node,regex_t * reg)2408 compile_gimmick_node(GimmickNode* node, regex_t* reg)
2409 {
2410 int r = 0;
2411
2412 switch (node->type) {
2413 case GIMMICK_FAIL:
2414 r = add_op(reg, OP_FAIL);
2415 break;
2416
2417 case GIMMICK_SAVE:
2418 r = add_op(reg, OP_SAVE_VAL);
2419 if (r != 0) return r;
2420 COP(reg)->save_val.type = node->detail_type;
2421 COP(reg)->save_val.id = node->id;
2422 break;
2423
2424 case GIMMICK_UPDATE_VAR:
2425 r = add_op(reg, OP_UPDATE_VAR);
2426 if (r != 0) return r;
2427 COP(reg)->update_var.type = node->detail_type;
2428 COP(reg)->update_var.id = node->id;
2429 COP(reg)->update_var.clear = FALSE;
2430 break;
2431
2432 #ifdef USE_CALLOUT
2433 case GIMMICK_CALLOUT:
2434 switch (node->detail_type) {
2435 case ONIG_CALLOUT_OF_CONTENTS:
2436 case ONIG_CALLOUT_OF_NAME:
2437 {
2438 if (node->detail_type == ONIG_CALLOUT_OF_NAME) {
2439 r = add_op(reg, OP_CALLOUT_NAME);
2440 if (r != 0) return r;
2441 COP(reg)->callout_name.id = node->id;
2442 COP(reg)->callout_name.num = node->num;
2443 }
2444 else {
2445 r = add_op(reg, OP_CALLOUT_CONTENTS);
2446 if (r != 0) return r;
2447 COP(reg)->callout_contents.num = node->num;
2448 }
2449 }
2450 break;
2451
2452 default:
2453 r = ONIGERR_TYPE_BUG;
2454 break;
2455 }
2456 #endif
2457 }
2458
2459 return r;
2460 }
2461
2462 static int
compile_length_gimmick_node(GimmickNode * node,regex_t * reg)2463 compile_length_gimmick_node(GimmickNode* node, regex_t* reg)
2464 {
2465 int len;
2466
2467 switch (node->type) {
2468 case GIMMICK_FAIL:
2469 len = OPSIZE_FAIL;
2470 break;
2471
2472 case GIMMICK_SAVE:
2473 len = OPSIZE_SAVE_VAL;
2474 break;
2475
2476 case GIMMICK_UPDATE_VAR:
2477 len = OPSIZE_UPDATE_VAR;
2478 break;
2479
2480 #ifdef USE_CALLOUT
2481 case GIMMICK_CALLOUT:
2482 switch (node->detail_type) {
2483 case ONIG_CALLOUT_OF_CONTENTS:
2484 len = OPSIZE_CALLOUT_CONTENTS;
2485 break;
2486 case ONIG_CALLOUT_OF_NAME:
2487 len = OPSIZE_CALLOUT_NAME;
2488 break;
2489
2490 default:
2491 len = ONIGERR_TYPE_BUG;
2492 break;
2493 }
2494 break;
2495 #endif
2496 }
2497
2498 return len;
2499 }
2500
2501 static int
compile_length_tree(Node * node,regex_t * reg)2502 compile_length_tree(Node* node, regex_t* reg)
2503 {
2504 int len, r;
2505
2506 switch (NODE_TYPE(node)) {
2507 case NODE_LIST:
2508 len = 0;
2509 do {
2510 r = compile_length_tree(NODE_CAR(node), reg);
2511 if (r < 0) return r;
2512 len += r;
2513 } while (IS_NOT_NULL(node = NODE_CDR(node)));
2514 r = len;
2515 break;
2516
2517 case NODE_ALT:
2518 {
2519 int n;
2520
2521 n = r = 0;
2522 do {
2523 r += compile_length_tree(NODE_CAR(node), reg);
2524 n++;
2525 } while (IS_NOT_NULL(node = NODE_CDR(node)));
2526 r += (OPSIZE_PUSH + OPSIZE_JUMP) * (n - 1);
2527 }
2528 break;
2529
2530 case NODE_STRING:
2531 if (NODE_STRING_IS_CRUDE(node))
2532 r = compile_length_string_crude_node(STR_(node), reg);
2533 else
2534 r = compile_length_string_node(node, reg);
2535 break;
2536
2537 case NODE_CCLASS:
2538 r = compile_length_cclass_node(CCLASS_(node), reg);
2539 break;
2540
2541 case NODE_CTYPE:
2542 r = SIZE_OPCODE;
2543 break;
2544
2545 case NODE_BACKREF:
2546 r = OPSIZE_BACKREF;
2547 break;
2548
2549 #ifdef USE_CALL
2550 case NODE_CALL:
2551 r = OPSIZE_CALL;
2552 break;
2553 #endif
2554
2555 case NODE_QUANT:
2556 r = compile_length_quantifier_node(QUANT_(node), reg);
2557 break;
2558
2559 case NODE_BAG:
2560 r = compile_length_bag_node(BAG_(node), reg);
2561 break;
2562
2563 case NODE_ANCHOR:
2564 r = compile_length_anchor_node(ANCHOR_(node), reg);
2565 break;
2566
2567 case NODE_GIMMICK:
2568 r = compile_length_gimmick_node(GIMMICK_(node), reg);
2569 break;
2570
2571 default:
2572 return ONIGERR_TYPE_BUG;
2573 break;
2574 }
2575
2576 return r;
2577 }
2578
2579 static int
compile_tree(Node * node,regex_t * reg,ParseEnv * env)2580 compile_tree(Node* node, regex_t* reg, ParseEnv* env)
2581 {
2582 int n, len, pos, r = 0;
2583
2584 switch (NODE_TYPE(node)) {
2585 case NODE_LIST:
2586 do {
2587 r = compile_tree(NODE_CAR(node), reg, env);
2588 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2589 break;
2590
2591 case NODE_ALT:
2592 {
2593 Node* x = node;
2594 len = 0;
2595 do {
2596 len += compile_length_tree(NODE_CAR(x), reg);
2597 if (IS_NOT_NULL(NODE_CDR(x))) {
2598 len += OPSIZE_PUSH + OPSIZE_JUMP;
2599 }
2600 } while (IS_NOT_NULL(x = NODE_CDR(x)));
2601 pos = COP_CURR_OFFSET(reg) + 1 + len; /* goal position */
2602
2603 do {
2604 len = compile_length_tree(NODE_CAR(node), reg);
2605 if (IS_NOT_NULL(NODE_CDR(node))) {
2606 enum OpCode push = NODE_IS_SUPER(node) ? OP_PUSH_SUPER : OP_PUSH;
2607 r = add_op(reg, push);
2608 if (r != 0) break;
2609 COP(reg)->push.addr = SIZE_INC + len + OPSIZE_JUMP;
2610 }
2611 r = compile_tree(NODE_CAR(node), reg, env);
2612 if (r != 0) break;
2613 if (IS_NOT_NULL(NODE_CDR(node))) {
2614 len = pos - (COP_CURR_OFFSET(reg) + 1);
2615 r = add_op(reg, OP_JUMP);
2616 if (r != 0) break;
2617 COP(reg)->jump.addr = len;
2618 }
2619 } while (IS_NOT_NULL(node = NODE_CDR(node)));
2620 }
2621 break;
2622
2623 case NODE_STRING:
2624 if (NODE_STRING_IS_CRUDE(node))
2625 r = compile_string_crude_node(STR_(node), reg);
2626 else
2627 r = compile_string_node(node, reg);
2628 break;
2629
2630 case NODE_CCLASS:
2631 r = compile_cclass_node(CCLASS_(node), reg);
2632 break;
2633
2634 case NODE_CTYPE:
2635 {
2636 int op;
2637
2638 switch (CTYPE_(node)->ctype) {
2639 case CTYPE_ANYCHAR:
2640 r = add_op(reg, NODE_IS_MULTILINE(node) ? OP_ANYCHAR_ML : OP_ANYCHAR);
2641 break;
2642
2643 case ONIGENC_CTYPE_WORD:
2644 if (CTYPE_(node)->ascii_mode == 0) {
2645 op = CTYPE_(node)->not != 0 ? OP_NO_WORD : OP_WORD;
2646 }
2647 else {
2648 op = CTYPE_(node)->not != 0 ? OP_NO_WORD_ASCII : OP_WORD_ASCII;
2649 }
2650 r = add_op(reg, op);
2651 break;
2652
2653 default:
2654 return ONIGERR_TYPE_BUG;
2655 break;
2656 }
2657 }
2658 break;
2659
2660 case NODE_BACKREF:
2661 {
2662 BackRefNode* br = BACKREF_(node);
2663
2664 if (NODE_IS_CHECKER(node)) {
2665 #ifdef USE_BACKREF_WITH_LEVEL
2666 if (NODE_IS_NEST_LEVEL(node)) {
2667 r = add_op(reg, OP_BACKREF_CHECK_WITH_LEVEL);
2668 if (r != 0) return r;
2669 COP(reg)->backref_general.nest_level = br->nest_level;
2670 }
2671 else
2672 #endif
2673 {
2674 r = add_op(reg, OP_BACKREF_CHECK);
2675 if (r != 0) return r;
2676 }
2677 goto add_bacref_mems;
2678 }
2679 else {
2680 #ifdef USE_BACKREF_WITH_LEVEL
2681 if (NODE_IS_NEST_LEVEL(node)) {
2682 if (NODE_IS_IGNORECASE(node))
2683 r = add_op(reg, OP_BACKREF_WITH_LEVEL_IC);
2684 else
2685 r = add_op(reg, OP_BACKREF_WITH_LEVEL);
2686
2687 if (r != 0) return r;
2688 COP(reg)->backref_general.nest_level = br->nest_level;
2689 goto add_bacref_mems;
2690 }
2691 else
2692 #endif
2693 if (br->back_num == 1) {
2694 n = br->back_static[0];
2695 if (NODE_IS_IGNORECASE(node)) {
2696 r = add_op(reg, OP_BACKREF_N_IC);
2697 if (r != 0) return r;
2698 COP(reg)->backref_n.n1 = n;
2699 }
2700 else {
2701 switch (n) {
2702 case 1: r = add_op(reg, OP_BACKREF1); break;
2703 case 2: r = add_op(reg, OP_BACKREF2); break;
2704 default:
2705 r = add_op(reg, OP_BACKREF_N);
2706 if (r != 0) return r;
2707 COP(reg)->backref_n.n1 = n;
2708 break;
2709 }
2710 }
2711 }
2712 else {
2713 int num;
2714 int* p;
2715
2716 r = add_op(reg, NODE_IS_IGNORECASE(node) ?
2717 OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI);
2718 if (r != 0) return r;
2719
2720 add_bacref_mems:
2721 num = br->back_num;
2722 COP(reg)->backref_general.num = num;
2723 if (num == 1) {
2724 COP(reg)->backref_general.n1 = br->back_static[0];
2725 }
2726 else {
2727 int i, j;
2728 MemNumType* ns;
2729
2730 ns = xmalloc(sizeof(MemNumType) * num);
2731 CHECK_NULL_RETURN_MEMERR(ns);
2732 COP(reg)->backref_general.ns = ns;
2733 p = BACKREFS_P(br);
2734 for (i = num - 1, j = 0; i >= 0; i--, j++) {
2735 ns[j] = p[i];
2736 }
2737 }
2738 }
2739 }
2740 }
2741 break;
2742
2743 #ifdef USE_CALL
2744 case NODE_CALL:
2745 r = compile_call(CALL_(node), reg, env);
2746 break;
2747 #endif
2748
2749 case NODE_QUANT:
2750 r = compile_quantifier_node(QUANT_(node), reg, env);
2751 break;
2752
2753 case NODE_BAG:
2754 r = compile_bag_node(BAG_(node), reg, env);
2755 break;
2756
2757 case NODE_ANCHOR:
2758 r = compile_anchor_node(ANCHOR_(node), reg, env);
2759 break;
2760
2761 case NODE_GIMMICK:
2762 r = compile_gimmick_node(GIMMICK_(node), reg);
2763 break;
2764
2765 default:
2766 #ifdef ONIG_DEBUG
2767 fprintf(DBGFP, "compile_tree: undefined node type %d\n", NODE_TYPE(node));
2768 #endif
2769 break;
2770 }
2771
2772 return r;
2773 }
2774
2775 static int
make_named_capture_number_map(Node ** plink,GroupNumMap * map,int * counter)2776 make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter)
2777 {
2778 int r;
2779 Node* node = *plink;
2780
2781 switch (NODE_TYPE(node)) {
2782 case NODE_LIST:
2783 case NODE_ALT:
2784 do {
2785 r = make_named_capture_number_map(&(NODE_CAR(node)), map, counter);
2786 } while (r >= 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2787 if (r < 0) return r;
2788 break;
2789
2790 case NODE_QUANT:
2791 {
2792 Node** ptarget = &(NODE_BODY(node));
2793 r = make_named_capture_number_map(ptarget, map, counter);
2794 if (r < 0) return r;
2795 if (r == 1 && NODE_TYPE(*ptarget) == NODE_QUANT) {
2796 return onig_reduce_nested_quantifier(node);
2797 }
2798 }
2799 break;
2800
2801 case NODE_BAG:
2802 {
2803 BagNode* en = BAG_(node);
2804 if (en->type == BAG_MEMORY) {
2805 if (NODE_IS_NAMED_GROUP(node)) {
2806 (*counter)++;
2807 map[en->m.regnum].new_val = *counter;
2808 en->m.regnum = *counter;
2809 r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter);
2810 if (r < 0) return r;
2811 }
2812 else {
2813 *plink = NODE_BODY(node);
2814 NODE_BODY(node) = NULL_NODE;
2815 onig_node_free(node);
2816 r = make_named_capture_number_map(plink, map, counter);
2817 if (r < 0) return r;
2818 return 1;
2819 }
2820 }
2821 else if (en->type == BAG_IF_ELSE) {
2822 r = make_named_capture_number_map(&(NODE_BAG_BODY(en)), map, counter);
2823 if (r < 0) return r;
2824 if (IS_NOT_NULL(en->te.Then)) {
2825 r = make_named_capture_number_map(&(en->te.Then), map, counter);
2826 if (r < 0) return r;
2827 }
2828 if (IS_NOT_NULL(en->te.Else)) {
2829 r = make_named_capture_number_map(&(en->te.Else), map, counter);
2830 if (r < 0) return r;
2831 }
2832 }
2833 else {
2834 r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter);
2835 if (r < 0) return r;
2836 }
2837 }
2838 break;
2839
2840 case NODE_ANCHOR:
2841 if (IS_NOT_NULL(NODE_BODY(node))) {
2842 r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter);
2843 if (r < 0) return r;
2844 }
2845 break;
2846
2847 default:
2848 break;
2849 }
2850
2851 return 0;
2852 }
2853
2854 static int
renumber_backref_node(Node * node,GroupNumMap * map)2855 renumber_backref_node(Node* node, GroupNumMap* map)
2856 {
2857 int i, pos, n, old_num;
2858 int *backs;
2859 BackRefNode* bn = BACKREF_(node);
2860
2861 if (! NODE_IS_BY_NAME(node))
2862 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2863
2864 old_num = bn->back_num;
2865 if (IS_NULL(bn->back_dynamic))
2866 backs = bn->back_static;
2867 else
2868 backs = bn->back_dynamic;
2869
2870 for (i = 0, pos = 0; i < old_num; i++) {
2871 n = map[backs[i]].new_val;
2872 if (n > 0) {
2873 backs[pos] = n;
2874 pos++;
2875 }
2876 }
2877
2878 bn->back_num = pos;
2879 return 0;
2880 }
2881
2882 static int
renumber_backref_traverse(Node * node,GroupNumMap * map)2883 renumber_backref_traverse(Node* node, GroupNumMap* map)
2884 {
2885 int r = 0;
2886
2887 switch (NODE_TYPE(node)) {
2888 case NODE_LIST:
2889 case NODE_ALT:
2890 do {
2891 r = renumber_backref_traverse(NODE_CAR(node), map);
2892 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2893 break;
2894
2895 case NODE_QUANT:
2896 r = renumber_backref_traverse(NODE_BODY(node), map);
2897 break;
2898
2899 case NODE_BAG:
2900 {
2901 BagNode* en = BAG_(node);
2902
2903 r = renumber_backref_traverse(NODE_BODY(node), map);
2904 if (r != 0) return r;
2905
2906 if (en->type == BAG_IF_ELSE) {
2907 if (IS_NOT_NULL(en->te.Then)) {
2908 r = renumber_backref_traverse(en->te.Then, map);
2909 if (r != 0) return r;
2910 }
2911 if (IS_NOT_NULL(en->te.Else)) {
2912 r = renumber_backref_traverse(en->te.Else, map);
2913 if (r != 0) return r;
2914 }
2915 }
2916 }
2917 break;
2918
2919 case NODE_BACKREF:
2920 r = renumber_backref_node(node, map);
2921 break;
2922
2923 case NODE_ANCHOR:
2924 if (IS_NOT_NULL(NODE_BODY(node)))
2925 r = renumber_backref_traverse(NODE_BODY(node), map);
2926 break;
2927
2928 default:
2929 break;
2930 }
2931
2932 return r;
2933 }
2934
2935 static int
numbered_ref_check(Node * node)2936 numbered_ref_check(Node* node)
2937 {
2938 int r = 0;
2939
2940 switch (NODE_TYPE(node)) {
2941 case NODE_LIST:
2942 case NODE_ALT:
2943 do {
2944 r = numbered_ref_check(NODE_CAR(node));
2945 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2946 break;
2947
2948 case NODE_ANCHOR:
2949 if (IS_NULL(NODE_BODY(node)))
2950 break;
2951 /* fall */
2952 case NODE_QUANT:
2953 r = numbered_ref_check(NODE_BODY(node));
2954 break;
2955
2956 case NODE_BAG:
2957 {
2958 BagNode* en = BAG_(node);
2959
2960 r = numbered_ref_check(NODE_BODY(node));
2961 if (r != 0) return r;
2962
2963 if (en->type == BAG_IF_ELSE) {
2964 if (IS_NOT_NULL(en->te.Then)) {
2965 r = numbered_ref_check(en->te.Then);
2966 if (r != 0) return r;
2967 }
2968 if (IS_NOT_NULL(en->te.Else)) {
2969 r = numbered_ref_check(en->te.Else);
2970 if (r != 0) return r;
2971 }
2972 }
2973 }
2974
2975 break;
2976
2977 case NODE_BACKREF:
2978 if (! NODE_IS_BY_NAME(node))
2979 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2980 break;
2981
2982 default:
2983 break;
2984 }
2985
2986 return r;
2987 }
2988
2989 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ParseEnv * env)2990 disable_noname_group_capture(Node** root, regex_t* reg, ParseEnv* env)
2991 {
2992 int r, i, pos, counter;
2993 MemStatusType loc;
2994 GroupNumMap* map;
2995
2996 map = (GroupNumMap* )xalloca(sizeof(GroupNumMap) * (env->num_mem + 1));
2997 CHECK_NULL_RETURN_MEMERR(map);
2998 for (i = 1; i <= env->num_mem; i++) {
2999 map[i].new_val = 0;
3000 }
3001 counter = 0;
3002 r = make_named_capture_number_map(root, map, &counter);
3003 if (r < 0) return r;
3004
3005 r = renumber_backref_traverse(*root, map);
3006 if (r != 0) return r;
3007
3008 for (i = 1, pos = 1; i <= env->num_mem; i++) {
3009 if (map[i].new_val > 0) {
3010 PARSEENV_MEMENV(env)[pos] = PARSEENV_MEMENV(env)[i];
3011 pos++;
3012 }
3013 }
3014
3015 loc = env->cap_history;
3016 MEM_STATUS_CLEAR(env->cap_history);
3017 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
3018 if (MEM_STATUS_AT(loc, i)) {
3019 MEM_STATUS_ON_SIMPLE(env->cap_history, map[i].new_val);
3020 }
3021 }
3022
3023 env->num_mem = env->num_named;
3024 reg->num_mem = env->num_named;
3025
3026 return onig_renumber_name_table(reg, map);
3027 }
3028
#ifdef USE_CALL
/*
 * Patch the recorded slots in reg->ops with the called address of their
 * target group.
 *
 * Each list entry holds a byte offset into reg->ops and a target node.
 * Entries whose target has no fixed address are skipped, unless the
 * target is marked as called, which is reported as ONIGERR_PARSER_BUG.
 * Returns 0 otherwise.
 */
static int
fix_unset_addr_list(UnsetAddrList* uslist, regex_t* reg)
{
  int idx, offset;
  AbsAddrType* slot;

  for (idx = 0; idx < uslist->num; idx++) {
    if (! NODE_IS_FIXED_ADDR(uslist->us[idx].target)) {
      if (NODE_IS_CALLED(uslist->us[idx].target))
        return ONIGERR_PARSER_BUG;

      /* CASE: called node doesn't have called address.
         ex. /((|a\g<1>)(.){0}){0}\g<3>/
         group-1 is not called, but is compiled into bytecode
         because group-3 is referred to from outside.
      */
      continue;
    }

    offset = uslist->us[idx].offset;
    slot   = (AbsAddrType* )((char* )reg->ops + offset);
    *slot  = BAG_(uslist->us[idx].target)->m.called_addr;
  }

  return 0;
}
#endif
3062
3063 /* x is not included y ==> 1 : 0 */
3064 static int
is_exclusive(Node * x,Node * y,regex_t * reg)3065 is_exclusive(Node* x, Node* y, regex_t* reg)
3066 {
3067 int i, len;
3068 OnigCodePoint code;
3069 UChar *p;
3070 NodeType ytype;
3071
3072 retry:
3073 ytype = NODE_TYPE(y);
3074 switch (NODE_TYPE(x)) {
3075 case NODE_CTYPE:
3076 {
3077 if (CTYPE_(x)->ctype == CTYPE_ANYCHAR ||
3078 CTYPE_(y)->ctype == CTYPE_ANYCHAR)
3079 break;
3080
3081 switch (ytype) {
3082 case NODE_CTYPE:
3083 if (CTYPE_(y)->ctype == CTYPE_(x)->ctype &&
3084 CTYPE_(y)->not != CTYPE_(x)->not &&
3085 CTYPE_(y)->ascii_mode == CTYPE_(x)->ascii_mode)
3086 return 1;
3087 else
3088 return 0;
3089 break;
3090
3091 case NODE_CCLASS:
3092 swap:
3093 {
3094 Node* tmp;
3095 tmp = x; x = y; y = tmp;
3096 goto retry;
3097 }
3098 break;
3099
3100 case NODE_STRING:
3101 goto swap;
3102 break;
3103
3104 default:
3105 break;
3106 }
3107 }
3108 break;
3109
3110 case NODE_CCLASS:
3111 {
3112 int range;
3113 CClassNode* xc = CCLASS_(x);
3114
3115 switch (ytype) {
3116 case NODE_CTYPE:
3117 switch (CTYPE_(y)->ctype) {
3118 case CTYPE_ANYCHAR:
3119 return 0;
3120 break;
3121
3122 case ONIGENC_CTYPE_WORD:
3123 if (CTYPE_(y)->not == 0) {
3124 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
3125 range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
3126 for (i = 0; i < range; i++) {
3127 if (BITSET_AT(xc->bs, i)) {
3128 if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
3129 }
3130 }
3131 return 1;
3132 }
3133 return 0;
3134 }
3135 else {
3136 if (IS_NOT_NULL(xc->mbuf)) return 0;
3137 if (IS_NCCLASS_NOT(xc)) return 0;
3138
3139 range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
3140 for (i = 0; i < range; i++) {
3141 if (! ONIGENC_IS_CODE_WORD(reg->enc, i)) {
3142 if (BITSET_AT(xc->bs, i))
3143 return 0;
3144 }
3145 }
3146 for (i = range; i < SINGLE_BYTE_SIZE; i++) {
3147 if (BITSET_AT(xc->bs, i)) return 0;
3148 }
3149 return 1;
3150 }
3151 break;
3152
3153 default:
3154 break;
3155 }
3156 break;
3157
3158 case NODE_CCLASS:
3159 {
3160 int v;
3161 CClassNode* yc = CCLASS_(y);
3162
3163 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
3164 v = BITSET_AT(xc->bs, i);
3165 if ((v != 0 && !IS_NCCLASS_NOT(xc)) || (v == 0 && IS_NCCLASS_NOT(xc))) {
3166 v = BITSET_AT(yc->bs, i);
3167 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
3168 (v == 0 && IS_NCCLASS_NOT(yc)))
3169 return 0;
3170 }
3171 }
3172 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
3173 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
3174 return 1;
3175 return 0;
3176 }
3177 break;
3178
3179 case NODE_STRING:
3180 goto swap;
3181 break;
3182
3183 default:
3184 break;
3185 }
3186 }
3187 break;
3188
3189 case NODE_STRING:
3190 {
3191 StrNode* xs = STR_(x);
3192
3193 if (NODE_STRING_LEN(x) == 0)
3194 break;
3195
3196 switch (ytype) {
3197 case NODE_CTYPE:
3198 switch (CTYPE_(y)->ctype) {
3199 case CTYPE_ANYCHAR:
3200 break;
3201
3202 case ONIGENC_CTYPE_WORD:
3203 if (CTYPE_(y)->ascii_mode == 0) {
3204 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
3205 return CTYPE_(y)->not;
3206 else
3207 return !(CTYPE_(y)->not);
3208 }
3209 else {
3210 if (ONIGENC_IS_MBC_WORD_ASCII(reg->enc, xs->s, xs->end))
3211 return CTYPE_(y)->not;
3212 else
3213 return !(CTYPE_(y)->not);
3214 }
3215 break;
3216 default:
3217 break;
3218 }
3219 break;
3220
3221 case NODE_CCLASS:
3222 {
3223 CClassNode* cc = CCLASS_(y);
3224
3225 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
3226 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
3227 return onig_is_code_in_cc(reg->enc, code, cc) == 0;
3228 }
3229 break;
3230
3231 case NODE_STRING:
3232 {
3233 UChar *q;
3234 StrNode* ys = STR_(y);
3235
3236 len = NODE_STRING_LEN(x);
3237 if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y);
3238
3239 for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
3240 if (*p != *q) return 1;
3241 }
3242 }
3243 break;
3244
3245 default:
3246 break;
3247 }
3248 }
3249 break;
3250
3251 default:
3252 break;
3253 }
3254
3255 return 0;
3256 }
3257
3258 static Node*
get_tree_head_literal(Node * node,int exact,regex_t * reg)3259 get_tree_head_literal(Node* node, int exact, regex_t* reg)
3260 {
3261 Node* n = NULL_NODE;
3262
3263 switch (NODE_TYPE(node)) {
3264 case NODE_BACKREF:
3265 case NODE_ALT:
3266 #ifdef USE_CALL
3267 case NODE_CALL:
3268 #endif
3269 break;
3270
3271 case NODE_CTYPE:
3272 if (CTYPE_(node)->ctype == CTYPE_ANYCHAR)
3273 break;
3274 /* fall */
3275 case NODE_CCLASS:
3276 if (exact == 0) {
3277 n = node;
3278 }
3279 break;
3280
3281 case NODE_LIST:
3282 n = get_tree_head_literal(NODE_CAR(node), exact, reg);
3283 break;
3284
3285 case NODE_STRING:
3286 {
3287 StrNode* sn = STR_(node);
3288
3289 if (sn->end <= sn->s)
3290 break;
3291
3292 if (exact == 0 || !NODE_IS_REAL_IGNORECASE(node)) {
3293 n = node;
3294 }
3295 }
3296 break;
3297
3298 case NODE_QUANT:
3299 {
3300 QuantNode* qn = QUANT_(node);
3301 if (qn->lower > 0) {
3302 if (IS_NOT_NULL(qn->head_exact))
3303 n = qn->head_exact;
3304 else
3305 n = get_tree_head_literal(NODE_BODY(node), exact, reg);
3306 }
3307 }
3308 break;
3309
3310 case NODE_BAG:
3311 {
3312 BagNode* en = BAG_(node);
3313 switch (en->type) {
3314 case BAG_OPTION:
3315 case BAG_MEMORY:
3316 case BAG_STOP_BACKTRACK:
3317 case BAG_IF_ELSE:
3318 n = get_tree_head_literal(NODE_BODY(node), exact, reg);
3319 break;
3320 }
3321 }
3322 break;
3323
3324 case NODE_ANCHOR:
3325 if (ANCHOR_(node)->type == ANCR_PREC_READ)
3326 n = get_tree_head_literal(NODE_BODY(node), exact, reg);
3327 break;
3328
3329 case NODE_GIMMICK:
3330 default:
3331 break;
3332 }
3333
3334 return n;
3335 }
3336
/* Result codes of get_tree_tail_literal(). */
enum GetValue {
  GET_VALUE_NONE   = -1,  /* no tail literal can be determined */
  GET_VALUE_IGNORE =  0,  /* node contributes nothing; in a list the
                             preceding element is examined instead */
  GET_VALUE_FOUND  =  1   /* a literal node was stored through rnode */
};
3342
3343 static int
get_tree_tail_literal(Node * node,Node ** rnode,regex_t * reg)3344 get_tree_tail_literal(Node* node, Node** rnode, regex_t* reg)
3345 {
3346 int r;
3347
3348 switch (NODE_TYPE(node)) {
3349 case NODE_LIST:
3350 if (IS_NULL(NODE_CDR(node))) {
3351 r = get_tree_tail_literal(NODE_CAR(node), rnode, reg);
3352 }
3353 else {
3354 r = get_tree_tail_literal(NODE_CDR(node), rnode, reg);
3355 if (r == GET_VALUE_IGNORE) {
3356 r = get_tree_tail_literal(NODE_CAR(node), rnode, reg);
3357 }
3358 }
3359 break;
3360
3361 #ifdef USE_CALL
3362 case NODE_CALL:
3363 r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3364 break;
3365 #endif
3366
3367 case NODE_CTYPE:
3368 if (CTYPE_(node)->ctype == CTYPE_ANYCHAR) {
3369 r = GET_VALUE_NONE;
3370 break;
3371 }
3372 /* fall */
3373 case NODE_CCLASS:
3374 *rnode = node;
3375 r = GET_VALUE_FOUND;
3376 break;
3377
3378 case NODE_STRING:
3379 {
3380 StrNode* sn = STR_(node);
3381
3382 if (sn->end <= sn->s) {
3383 r = GET_VALUE_IGNORE;
3384 break;
3385 }
3386
3387 if (NODE_IS_REAL_IGNORECASE(node)) {
3388 r = GET_VALUE_NONE;
3389 break;
3390 }
3391
3392 *rnode = node;
3393 r = GET_VALUE_FOUND;
3394 }
3395 break;
3396
3397 case NODE_QUANT:
3398 {
3399 QuantNode* qn = QUANT_(node);
3400 if (qn->lower != 0) {
3401 r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3402 }
3403 else
3404 r = GET_VALUE_NONE;
3405 }
3406 break;
3407
3408 case NODE_BAG:
3409 {
3410 BagNode* en = BAG_(node);
3411
3412 if (en->type == BAG_MEMORY) {
3413 if (NODE_IS_MARK1(node))
3414 r = GET_VALUE_NONE;
3415 else {
3416 NODE_STATUS_ADD(node, MARK1);
3417 r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3418 NODE_STATUS_REMOVE(node, MARK1);
3419 }
3420 }
3421 else {
3422 r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3423 }
3424 }
3425 break;
3426
3427 case NODE_ANCHOR:
3428 case NODE_GIMMICK:
3429 r = GET_VALUE_IGNORE;
3430 break;
3431
3432 case NODE_ALT:
3433 case NODE_BACKREF:
3434 default:
3435 r = GET_VALUE_NONE;
3436 break;
3437 }
3438
3439 return r;
3440 }
3441
3442 static int
check_called_node_in_look_behind(Node * node,int not)3443 check_called_node_in_look_behind(Node* node, int not)
3444 {
3445 int r;
3446
3447 r = 0;
3448
3449 switch (NODE_TYPE(node)) {
3450 case NODE_LIST:
3451 case NODE_ALT:
3452 do {
3453 r = check_called_node_in_look_behind(NODE_CAR(node), not);
3454 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3455 break;
3456
3457 case NODE_QUANT:
3458 r = check_called_node_in_look_behind(NODE_BODY(node), not);
3459 break;
3460
3461 case NODE_BAG:
3462 {
3463 BagNode* en = BAG_(node);
3464
3465 if (en->type == BAG_MEMORY) {
3466 if (NODE_IS_MARK1(node))
3467 return 0;
3468 else {
3469 NODE_STATUS_ADD(node, MARK1);
3470 r = check_called_node_in_look_behind(NODE_BODY(node), not);
3471 NODE_STATUS_REMOVE(node, MARK1);
3472 }
3473 }
3474 else {
3475 r = check_called_node_in_look_behind(NODE_BODY(node), not);
3476 if (r == 0 && en->type == BAG_IF_ELSE) {
3477 if (IS_NOT_NULL(en->te.Then)) {
3478 r = check_called_node_in_look_behind(en->te.Then, not);
3479 if (r != 0) break;
3480 }
3481 if (IS_NOT_NULL(en->te.Else)) {
3482 r = check_called_node_in_look_behind(en->te.Else, not);
3483 }
3484 }
3485 }
3486 }
3487 break;
3488
3489 case NODE_ANCHOR:
3490 if (IS_NOT_NULL(NODE_BODY(node)))
3491 r = check_called_node_in_look_behind(NODE_BODY(node), not);
3492 break;
3493
3494 case NODE_GIMMICK:
3495 if (NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) != 0)
3496 return 1;
3497 break;
3498
3499 default:
3500 break;
3501 }
3502
3503 return r;
3504 }
3505
/* allowed node types in look-behind
   (bit set tested against NODE_TYPE2BIT(type) in check_node_in_look_behind) */
#define ALLOWED_TYPE_IN_LB \
  ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \
  | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \
  | NODE_BIT_CALL | NODE_BIT_BACKREF | NODE_BIT_GIMMICK)

/* allowed bag types, one bit per BAG_* value (tested as 1<<en->type);
   the _NOT variant is selected when `not` is 1 and omits BAG_MEMORY */
#define ALLOWED_BAG_IN_LB ( 1<<BAG_MEMORY | 1<<BAG_OPTION | 1<<BAG_STOP_BACKTRACK | 1<<BAG_IF_ELSE )
#define ALLOWED_BAG_IN_LB_NOT ( 1<<BAG_OPTION | 1<<BAG_STOP_BACKTRACK | 1<<BAG_IF_ELSE )

/* allowed anchor types when `not` is 0 */
#define ALLOWED_ANCHOR_IN_LB \
  ( ANCR_LOOK_BEHIND | ANCR_BEGIN_LINE | ANCR_END_LINE | ANCR_BEGIN_BUF \
  | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY | ANCR_NO_WORD_BOUNDARY \
  | ANCR_WORD_BEGIN | ANCR_WORD_END \
  | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )

/* allowed anchor types when `not` is 1; additionally admits
   ANCR_LOOK_BEHIND_NOT */
#define ALLOWED_ANCHOR_IN_LB_NOT \
  ( ANCR_LOOK_BEHIND | ANCR_LOOK_BEHIND_NOT | ANCR_BEGIN_LINE \
  | ANCR_END_LINE | ANCR_BEGIN_BUF | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY \
  | ANCR_NO_WORD_BOUNDARY | ANCR_WORD_BEGIN | ANCR_WORD_END \
  | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
3526
3527
3528 static int
check_node_in_look_behind(Node * node,int not,int * used)3529 check_node_in_look_behind(Node* node, int not, int* used)
3530 {
3531 static unsigned int
3532 bag_mask[2] = { ALLOWED_BAG_IN_LB, ALLOWED_BAG_IN_LB_NOT };
3533
3534 static unsigned int
3535 anchor_mask[2] = { ALLOWED_ANCHOR_IN_LB, ALLOWED_ANCHOR_IN_LB_NOT };
3536
3537 NodeType type;
3538 int r = 0;
3539
3540 type = NODE_TYPE(node);
3541 if ((NODE_TYPE2BIT(type) & ALLOWED_TYPE_IN_LB) == 0)
3542 return 1;
3543
3544 switch (type) {
3545 case NODE_LIST:
3546 case NODE_ALT:
3547 do {
3548 r = check_node_in_look_behind(NODE_CAR(node), not, used);
3549 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3550 break;
3551
3552 case NODE_QUANT:
3553 r = check_node_in_look_behind(NODE_BODY(node), not, used);
3554 break;
3555
3556 case NODE_BAG:
3557 {
3558 BagNode* en = BAG_(node);
3559 if (((1<<en->type) & bag_mask[not]) == 0)
3560 return 1;
3561
3562 r = check_node_in_look_behind(NODE_BODY(node), not, used);
3563 if (r != 0) break;
3564
3565 if (en->type == BAG_MEMORY) {
3566 if (NODE_IS_BACKREF(node) || NODE_IS_CALLED(node)
3567 || NODE_IS_REFERENCED(node))
3568 *used = TRUE;
3569 }
3570 else if (en->type == BAG_IF_ELSE) {
3571 if (IS_NOT_NULL(en->te.Then)) {
3572 r = check_node_in_look_behind(en->te.Then, not, used);
3573 if (r != 0) break;
3574 }
3575 if (IS_NOT_NULL(en->te.Else)) {
3576 r = check_node_in_look_behind(en->te.Else, not, used);
3577 }
3578 }
3579 }
3580 break;
3581
3582 case NODE_ANCHOR:
3583 type = ANCHOR_(node)->type;
3584 if ((type & anchor_mask[not]) == 0)
3585 return 1;
3586
3587 if (IS_NOT_NULL(NODE_BODY(node)))
3588 r = check_node_in_look_behind(NODE_BODY(node), not, used);
3589 break;
3590
3591 case NODE_GIMMICK:
3592 if (NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) != 0)
3593 return 1;
3594 break;
3595
3596 case NODE_CALL:
3597 r = check_called_node_in_look_behind(NODE_BODY(node), not);
3598 break;
3599
3600 default:
3601 break;
3602 }
3603 return r;
3604 }
3605
3606 static OnigLen
node_min_byte_len(Node * node,ParseEnv * env)3607 node_min_byte_len(Node* node, ParseEnv* env)
3608 {
3609 OnigLen len;
3610 OnigLen tmin;
3611
3612 len = 0;
3613 switch (NODE_TYPE(node)) {
3614 case NODE_BACKREF:
3615 if (! NODE_IS_CHECKER(node)) {
3616 int i;
3617 int* backs;
3618 MemEnv* mem_env = PARSEENV_MEMENV(env);
3619 BackRefNode* br = BACKREF_(node);
3620 if (NODE_IS_RECURSION(node)) break;
3621
3622 backs = BACKREFS_P(br);
3623 len = node_min_byte_len(mem_env[backs[0]].mem_node, env);
3624 for (i = 1; i < br->back_num; i++) {
3625 tmin = node_min_byte_len(mem_env[backs[i]].mem_node, env);
3626 if (len > tmin) len = tmin;
3627 }
3628 }
3629 break;
3630
3631 #ifdef USE_CALL
3632 case NODE_CALL:
3633 {
3634 Node* t = NODE_BODY(node);
3635 if (NODE_IS_FIXED_MIN(t))
3636 len = BAG_(t)->min_len;
3637 else
3638 len = node_min_byte_len(t, env);
3639 }
3640 break;
3641 #endif
3642
3643 case NODE_LIST:
3644 do {
3645 tmin = node_min_byte_len(NODE_CAR(node), env);
3646 len = distance_add(len, tmin);
3647 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3648 break;
3649
3650 case NODE_ALT:
3651 {
3652 Node *x, *y;
3653 y = node;
3654 do {
3655 x = NODE_CAR(y);
3656 tmin = node_min_byte_len(x, env);
3657 if (y == node) len = tmin;
3658 else if (len > tmin) len = tmin;
3659 } while (IS_NOT_NULL(y = NODE_CDR(y)));
3660 }
3661 break;
3662
3663 case NODE_STRING:
3664 {
3665 StrNode* sn = STR_(node);
3666 len = (int )(sn->end - sn->s);
3667 }
3668 break;
3669
3670 case NODE_CTYPE:
3671 case NODE_CCLASS:
3672 len = ONIGENC_MBC_MINLEN(env->enc);
3673 break;
3674
3675 case NODE_QUANT:
3676 {
3677 QuantNode* qn = QUANT_(node);
3678
3679 if (qn->lower > 0) {
3680 len = node_min_byte_len(NODE_BODY(node), env);
3681 len = distance_multiply(len, qn->lower);
3682 }
3683 }
3684 break;
3685
3686 case NODE_BAG:
3687 {
3688 BagNode* en = BAG_(node);
3689 switch (en->type) {
3690 case BAG_MEMORY:
3691 if (NODE_IS_FIXED_MIN(node))
3692 len = en->min_len;
3693 else {
3694 if (NODE_IS_MARK1(node))
3695 len = 0; /* recursive */
3696 else {
3697 NODE_STATUS_ADD(node, MARK1);
3698 len = node_min_byte_len(NODE_BODY(node), env);
3699 NODE_STATUS_REMOVE(node, MARK1);
3700
3701 en->min_len = len;
3702 NODE_STATUS_ADD(node, FIXED_MIN);
3703 }
3704 }
3705 break;
3706
3707 case BAG_OPTION:
3708 case BAG_STOP_BACKTRACK:
3709 len = node_min_byte_len(NODE_BODY(node), env);
3710 break;
3711 case BAG_IF_ELSE:
3712 {
3713 OnigLen elen;
3714
3715 len = node_min_byte_len(NODE_BODY(node), env);
3716 if (IS_NOT_NULL(en->te.Then))
3717 len += node_min_byte_len(en->te.Then, env);
3718 if (IS_NOT_NULL(en->te.Else))
3719 elen = node_min_byte_len(en->te.Else, env);
3720 else elen = 0;
3721
3722 if (elen < len) len = elen;
3723 }
3724 break;
3725 }
3726 }
3727 break;
3728
3729 case NODE_GIMMICK:
3730 {
3731 GimmickNode* g = GIMMICK_(node);
3732 if (g->type == GIMMICK_FAIL) {
3733 len = INFINITE_LEN;
3734 break;
3735 }
3736 }
3737 /* fall */
3738 case NODE_ANCHOR:
3739 default:
3740 break;
3741 }
3742
3743 return len;
3744 }
3745
3746 static int
check_backrefs(Node * node,ParseEnv * env)3747 check_backrefs(Node* node, ParseEnv* env)
3748 {
3749 int r;
3750
3751 switch (NODE_TYPE(node)) {
3752 case NODE_LIST:
3753 case NODE_ALT:
3754 do {
3755 r = check_backrefs(NODE_CAR(node), env);
3756 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3757 break;
3758
3759 case NODE_ANCHOR:
3760 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3761 r = 0;
3762 break;
3763 }
3764 /* fall */
3765 case NODE_QUANT:
3766 r = check_backrefs(NODE_BODY(node), env);
3767 break;
3768
3769 case NODE_BAG:
3770 r = check_backrefs(NODE_BODY(node), env);
3771 {
3772 BagNode* en = BAG_(node);
3773
3774 if (en->type == BAG_IF_ELSE) {
3775 if (r != 0) return r;
3776 if (IS_NOT_NULL(en->te.Then)) {
3777 r = check_backrefs(en->te.Then, env);
3778 if (r != 0) return r;
3779 }
3780 if (IS_NOT_NULL(en->te.Else)) {
3781 r = check_backrefs(en->te.Else, env);
3782 }
3783 }
3784 }
3785 break;
3786
3787 case NODE_BACKREF:
3788 {
3789 int i;
3790 BackRefNode* br = BACKREF_(node);
3791 int* backs = BACKREFS_P(br);
3792 MemEnv* mem_env = PARSEENV_MEMENV(env);
3793
3794 for (i = 0; i < br->back_num; i++) {
3795 if (backs[i] > env->num_mem)
3796 return ONIGERR_INVALID_BACKREF;
3797
3798 NODE_STATUS_ADD(mem_env[backs[i]].mem_node, BACKREF);
3799 }
3800 r = 0;
3801 }
3802 break;
3803
3804 default:
3805 r = 0;
3806 break;
3807 }
3808
3809 return r;
3810 }
3811
3812 static int
set_empty_repeat_node_trav(Node * node,Node * empty,ParseEnv * env)3813 set_empty_repeat_node_trav(Node* node, Node* empty, ParseEnv* env)
3814 {
3815 int r;
3816
3817 switch (NODE_TYPE(node)) {
3818 case NODE_LIST:
3819 case NODE_ALT:
3820 do {
3821 r = set_empty_repeat_node_trav(NODE_CAR(node), empty, env);
3822 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3823 break;
3824
3825 case NODE_ANCHOR:
3826 {
3827 AnchorNode* an = ANCHOR_(node);
3828
3829 if (! ANCHOR_HAS_BODY(an)) {
3830 r = 0;
3831 break;
3832 }
3833
3834 switch (an->type) {
3835 case ANCR_PREC_READ:
3836 case ANCR_LOOK_BEHIND:
3837 empty = NULL_NODE;
3838 break;
3839 default:
3840 break;
3841 }
3842 r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3843 }
3844 break;
3845
3846 case NODE_QUANT:
3847 {
3848 QuantNode* qn = QUANT_(node);
3849
3850 if (qn->emptiness != BODY_IS_NOT_EMPTY) empty = node;
3851 r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3852 }
3853 break;
3854
3855 case NODE_BAG:
3856 if (IS_NOT_NULL(NODE_BODY(node))) {
3857 r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3858 if (r != 0) return r;
3859 }
3860 {
3861 BagNode* en = BAG_(node);
3862
3863 r = 0;
3864 if (en->type == BAG_MEMORY) {
3865 if (NODE_IS_BACKREF(node)) {
3866 if (IS_NOT_NULL(empty))
3867 PARSEENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty;
3868 }
3869 }
3870 else if (en->type == BAG_IF_ELSE) {
3871 if (IS_NOT_NULL(en->te.Then)) {
3872 r = set_empty_repeat_node_trav(en->te.Then, empty, env);
3873 if (r != 0) return r;
3874 }
3875 if (IS_NOT_NULL(en->te.Else)) {
3876 r = set_empty_repeat_node_trav(en->te.Else, empty, env);
3877 }
3878 }
3879 }
3880 break;
3881
3882 default:
3883 r = 0;
3884 break;
3885 }
3886
3887 return r;
3888 }
3889
3890 static int
is_ancestor_node(Node * node,Node * me)3891 is_ancestor_node(Node* node, Node* me)
3892 {
3893 Node* parent;
3894
3895 while ((parent = NODE_PARENT(me)) != NULL_NODE) {
3896 if (parent == node) return 1;
3897 me = parent;
3898 }
3899 return 0;
3900 }
3901
3902 static void
set_empty_status_check_trav(Node * node,ParseEnv * env)3903 set_empty_status_check_trav(Node* node, ParseEnv* env)
3904 {
3905 switch (NODE_TYPE(node)) {
3906 case NODE_LIST:
3907 case NODE_ALT:
3908 do {
3909 set_empty_status_check_trav(NODE_CAR(node), env);
3910 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3911 break;
3912
3913 case NODE_ANCHOR:
3914 {
3915 AnchorNode* an = ANCHOR_(node);
3916
3917 if (! ANCHOR_HAS_BODY(an)) break;
3918 set_empty_status_check_trav(NODE_BODY(node), env);
3919 }
3920 break;
3921
3922 case NODE_QUANT:
3923 set_empty_status_check_trav(NODE_BODY(node), env);
3924 break;
3925
3926 case NODE_BAG:
3927 if (IS_NOT_NULL(NODE_BODY(node)))
3928 set_empty_status_check_trav(NODE_BODY(node), env);
3929 {
3930 BagNode* en = BAG_(node);
3931
3932 if (en->type == BAG_IF_ELSE) {
3933 if (IS_NOT_NULL(en->te.Then)) {
3934 set_empty_status_check_trav(en->te.Then, env);
3935 }
3936 if (IS_NOT_NULL(en->te.Else)) {
3937 set_empty_status_check_trav(en->te.Else, env);
3938 }
3939 }
3940 }
3941 break;
3942
3943 case NODE_BACKREF:
3944 {
3945 int i;
3946 int* backs;
3947 MemEnv* mem_env = PARSEENV_MEMENV(env);
3948 BackRefNode* br = BACKREF_(node);
3949 backs = BACKREFS_P(br);
3950 for (i = 0; i < br->back_num; i++) {
3951 Node* ernode = mem_env[backs[i]].empty_repeat_node;
3952 if (IS_NOT_NULL(ernode)) {
3953 if (! is_ancestor_node(ernode, node)) {
3954 MEM_STATUS_LIMIT_ON(QUANT_(ernode)->empty_status_mem, backs[i]);
3955 NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK);
3956 NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK);
3957 }
3958 }
3959 }
3960 }
3961 break;
3962
3963 default:
3964 break;
3965 }
3966 }
3967
3968 static void
set_parent_node_trav(Node * node,Node * parent)3969 set_parent_node_trav(Node* node, Node* parent)
3970 {
3971 NODE_PARENT(node) = parent;
3972
3973 switch (NODE_TYPE(node)) {
3974 case NODE_LIST:
3975 case NODE_ALT:
3976 do {
3977 set_parent_node_trav(NODE_CAR(node), node);
3978 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3979 break;
3980
3981 case NODE_ANCHOR:
3982 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) break;
3983 set_parent_node_trav(NODE_BODY(node), node);
3984 break;
3985
3986 case NODE_QUANT:
3987 set_parent_node_trav(NODE_BODY(node), node);
3988 break;
3989
3990 case NODE_BAG:
3991 if (IS_NOT_NULL(NODE_BODY(node)))
3992 set_parent_node_trav(NODE_BODY(node), node);
3993 {
3994 BagNode* en = BAG_(node);
3995
3996 if (en->type == BAG_IF_ELSE) {
3997 if (IS_NOT_NULL(en->te.Then))
3998 set_parent_node_trav(en->te.Then, node);
3999 if (IS_NOT_NULL(en->te.Else)) {
4000 set_parent_node_trav(en->te.Else, node);
4001 }
4002 }
4003 }
4004 break;
4005
4006 default:
4007 break;
4008 }
4009 }
4010
4011
4012 #ifdef USE_CALL
4013
4014 #define RECURSION_EXIST (1<<0)
4015 #define RECURSION_MUST (1<<1)
4016 #define RECURSION_INFINITE (1<<2)
4017
/* Examine a subtree for recursion back into a group marked MARK1.
 *
 * node: subtree being examined.
 * env:  parse environment, forwarded to node_min_byte_len().
 * head: non-zero while no preceding element with a non-zero minimum byte
 *       length has been seen; cleared once such an element is passed.
 *
 * Returns a bit set built from RECURSION_EXIST, RECURSION_MUST and
 * RECURSION_INFINITE, or a negative value passed through from a nested
 * call.  A result containing RECURSION_INFINITE is returned immediately,
 * without being merged into the accumulated flags.
 */
static int
infinite_recursive_call_check(Node* node, ParseEnv* env, int head)
{
  int ret;
  int r = 0;

  switch (NODE_TYPE(node)) {
  case NODE_LIST:
    {
      Node *x;
      OnigLen min;

      x = node;
      do {
        ret = infinite_recursive_call_check(NODE_CAR(x), env, head);
        if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
        r |= ret;
        /* once an element has a non-zero minimum length, the following
           elements are no longer at the head */
        if (head != 0) {
          min = node_min_byte_len(NODE_CAR(x), env);
          if (min != 0) head = 0;
        }
      } while (IS_NOT_NULL(x = NODE_CDR(x)));
    }
    break;

  case NODE_ALT:
    {
      int must;

      /* RECURSION_MUST survives only if every alternative reports it */
      must = RECURSION_MUST;
      do {
        ret = infinite_recursive_call_check(NODE_CAR(node), env, head);
        if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;

        r |= (ret & RECURSION_EXIST);
        must &= ret;
      } while (IS_NOT_NULL(node = NODE_CDR(node)));
      r |= must;
    }
    break;

  case NODE_QUANT:
    /* a quantifier with upper == 0 is not examined; r stays 0 */
    if (QUANT_(node)->upper == 0) break;

    r = infinite_recursive_call_check(NODE_BODY(node), env, head);
    if (r < 0) return r;
    if ((r & RECURSION_MUST) != 0) {
      /* lower == 0: the body may be skipped, so recursion is not a must */
      if (QUANT_(node)->lower == 0)
        r &= ~RECURSION_MUST;
    }
    break;

  case NODE_ANCHOR:
    if (! ANCHOR_HAS_BODY(ANCHOR_(node)))
      break;
    /* fall */
  case NODE_CALL:
    r = infinite_recursive_call_check(NODE_BODY(node), env, head);
    break;

  case NODE_BAG:
    {
      BagNode* en = BAG_(node);

      if (en->type == BAG_MEMORY) {
        /* MARK2: this group is already being examined further up the
           call chain; report nothing */
        if (NODE_IS_MARK2(node))
          return 0;
        /* MARK1: reached the group the caller marked before starting;
           with head still set the result includes RECURSION_INFINITE */
        else if (NODE_IS_MARK1(node))
          return (head == 0 ? RECURSION_EXIST | RECURSION_MUST
                  : RECURSION_EXIST | RECURSION_MUST | RECURSION_INFINITE);
        else {
          NODE_STATUS_ADD(node, MARK2);
          r = infinite_recursive_call_check(NODE_BODY(node), env, head);
          NODE_STATUS_REMOVE(node, MARK2);
        }
      }
      else if (en->type == BAG_IF_ELSE) {
        int eret;

        /* condition part */
        ret = infinite_recursive_call_check(NODE_BODY(node), env, head);
        if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
        r |= ret;
        if (IS_NOT_NULL(en->te.Then)) {
          OnigLen min;
          if (head != 0) {
            min = node_min_byte_len(NODE_BODY(node), env);
          }
          else min = 0;

          /* Then is at the head only if the condition has zero minimum
             length */
          ret = infinite_recursive_call_check(en->te.Then, env, min != 0 ? 0:head);
          if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
          r |= ret;
        }
        if (IS_NOT_NULL(en->te.Else)) {
          eret = infinite_recursive_call_check(en->te.Else, env, head);
          if (eret < 0 || (eret & RECURSION_INFINITE) != 0) return eret;
          r |= (eret & RECURSION_EXIST);
          if ((eret & RECURSION_MUST) == 0)
            r &= ~RECURSION_MUST;
        }
        else {
          /* no Else branch: RECURSION_MUST is dropped */
          r &= ~RECURSION_MUST;
        }
      }
      else {
        r = infinite_recursive_call_check(NODE_BODY(node), env, head);
      }
    }
    break;

  default:
    break;
  }

  return r;
}
4134
4135 static int
infinite_recursive_call_check_trav(Node * node,ParseEnv * env)4136 infinite_recursive_call_check_trav(Node* node, ParseEnv* env)
4137 {
4138 int r;
4139
4140 switch (NODE_TYPE(node)) {
4141 case NODE_LIST:
4142 case NODE_ALT:
4143 do {
4144 r = infinite_recursive_call_check_trav(NODE_CAR(node), env);
4145 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4146 break;
4147
4148 case NODE_ANCHOR:
4149 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
4150 r = 0;
4151 break;
4152 }
4153 /* fall */
4154 case NODE_QUANT:
4155 r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
4156 break;
4157
4158 case NODE_BAG:
4159 {
4160 BagNode* en = BAG_(node);
4161
4162 if (en->type == BAG_MEMORY) {
4163 if (NODE_IS_RECURSION(node) && NODE_IS_CALLED(node)) {
4164 int ret;
4165
4166 NODE_STATUS_ADD(node, MARK1);
4167
4168 ret = infinite_recursive_call_check(NODE_BODY(node), env, 1);
4169 if (ret < 0) return ret;
4170 else if ((ret & (RECURSION_MUST | RECURSION_INFINITE)) != 0)
4171 return ONIGERR_NEVER_ENDING_RECURSION;
4172
4173 NODE_STATUS_REMOVE(node, MARK1);
4174 }
4175 }
4176 else if (en->type == BAG_IF_ELSE) {
4177 if (IS_NOT_NULL(en->te.Then)) {
4178 r = infinite_recursive_call_check_trav(en->te.Then, env);
4179 if (r != 0) return r;
4180 }
4181 if (IS_NOT_NULL(en->te.Else)) {
4182 r = infinite_recursive_call_check_trav(en->te.Else, env);
4183 if (r != 0) return r;
4184 }
4185 }
4186 }
4187
4188 r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
4189 break;
4190
4191 default:
4192 r = 0;
4193 break;
4194 }
4195
4196 return r;
4197 }
4198
4199 static int
recursive_call_check(Node * node)4200 recursive_call_check(Node* node)
4201 {
4202 int r;
4203
4204 switch (NODE_TYPE(node)) {
4205 case NODE_LIST:
4206 case NODE_ALT:
4207 r = 0;
4208 do {
4209 r |= recursive_call_check(NODE_CAR(node));
4210 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4211 break;
4212
4213 case NODE_ANCHOR:
4214 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
4215 r = 0;
4216 break;
4217 }
4218 /* fall */
4219 case NODE_QUANT:
4220 r = recursive_call_check(NODE_BODY(node));
4221 break;
4222
4223 case NODE_CALL:
4224 r = recursive_call_check(NODE_BODY(node));
4225 if (r != 0) {
4226 if (NODE_IS_MARK1(NODE_BODY(node)))
4227 NODE_STATUS_ADD(node, RECURSION);
4228 }
4229 break;
4230
4231 case NODE_BAG:
4232 {
4233 BagNode* en = BAG_(node);
4234
4235 if (en->type == BAG_MEMORY) {
4236 if (NODE_IS_MARK2(node))
4237 return 0;
4238 else if (NODE_IS_MARK1(node))
4239 return 1; /* recursion */
4240 else {
4241 NODE_STATUS_ADD(node, MARK2);
4242 r = recursive_call_check(NODE_BODY(node));
4243 NODE_STATUS_REMOVE(node, MARK2);
4244 }
4245 }
4246 else if (en->type == BAG_IF_ELSE) {
4247 r = 0;
4248 if (IS_NOT_NULL(en->te.Then)) {
4249 r |= recursive_call_check(en->te.Then);
4250 }
4251 if (IS_NOT_NULL(en->te.Else)) {
4252 r |= recursive_call_check(en->te.Else);
4253 }
4254 r |= recursive_call_check(NODE_BODY(node));
4255 }
4256 else {
4257 r = recursive_call_check(NODE_BODY(node));
4258 }
4259 }
4260 break;
4261
4262 default:
4263 r = 0;
4264 break;
4265 }
4266
4267 return r;
4268 }
4269
4270 #define IN_RECURSION (1<<0)
4271 #define FOUND_CALLED_NODE 1
4272
/* Traverse the tree and tag recursive memory groups.
 *
 * node:  subtree being traversed.
 * env:   parse environment; env->backtrack_mem receives the bit of every
 *        group found to be recursive.
 * state: IN_RECURSION is set while inside a group already tagged RECURSION.
 *
 * Returns FOUND_CALLED_NODE if the subtree contains a memory group flagged
 * CALLED, 0 otherwise.  In the list/alt case a negative value from a nested
 * call is returned as is.
 */
static int
recursive_call_check_trav(Node* node, ParseEnv* env, int state)
{
  int r = 0;

  switch (NODE_TYPE(node)) {
  case NODE_LIST:
  case NODE_ALT:
    {
      int ret;
      do {
        ret = recursive_call_check_trav(NODE_CAR(node), env, state);
        if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
        else if (ret < 0) return ret;
      } while (IS_NOT_NULL(node = NODE_CDR(node)));
    }
    break;

  case NODE_QUANT:
    r = recursive_call_check_trav(NODE_BODY(node), env, state);
    /* a quantifier with upper == 0 that contains a called group is
       flagged via include_referred */
    if (QUANT_(node)->upper == 0) {
      if (r == FOUND_CALLED_NODE)
        QUANT_(node)->include_referred = 1;
    }
    break;

  case NODE_ANCHOR:
    {
      AnchorNode* an = ANCHOR_(node);
      if (ANCHOR_HAS_BODY(an))
        r = recursive_call_check_trav(NODE_ANCHOR_BODY(an), env, state);
    }
    break;

  case NODE_BAG:
    {
      int ret;
      int state1;
      BagNode* en = BAG_(node);

      if (en->type == BAG_MEMORY) {
        if (NODE_IS_CALLED(node)) {
          r = FOUND_CALLED_NODE;
          /* jumps into the else-if body below: a called group gets the
             same recursion test as a group inside a recursion */
          goto check_recursion;
        }
        else if ((state & IN_RECURSION) != 0) {
        check_recursion:
          if (! NODE_IS_RECURSION(node)) {
            /* MARK1 identifies this group as the search target for
               recursive_call_check() */
            NODE_STATUS_ADD(node, MARK1);
            ret = recursive_call_check(NODE_BODY(node));
            if (ret != 0) {
              NODE_STATUS_ADD(node, RECURSION);
              MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
            }
            NODE_STATUS_REMOVE(node, MARK1);
          }
        }
      }

      /* children of a recursive group are traversed with IN_RECURSION */
      state1 = state;
      if (NODE_IS_RECURSION(node))
        state1 |= IN_RECURSION;

      ret = recursive_call_check_trav(NODE_BODY(node), env, state1);
      if (ret == FOUND_CALLED_NODE)
        r = FOUND_CALLED_NODE;

      if (en->type == BAG_IF_ELSE) {
        if (IS_NOT_NULL(en->te.Then)) {
          ret = recursive_call_check_trav(en->te.Then, env, state1);
          if (ret == FOUND_CALLED_NODE)
            r = FOUND_CALLED_NODE;
        }
        if (IS_NOT_NULL(en->te.Else)) {
          ret = recursive_call_check_trav(en->te.Else, env, state1);
          if (ret == FOUND_CALLED_NODE)
            r = FOUND_CALLED_NODE;
        }
      }
    }
    break;

  default:
    break;
  }

  return r;
}
4361
4362 #endif
4363
4364 static void
remove_from_list(Node * prev,Node * a)4365 remove_from_list(Node* prev, Node* a)
4366 {
4367 if (NODE_CDR(prev) != a) return ;
4368
4369 NODE_CDR(prev) = NODE_CDR(a);
4370 NODE_CDR(a) = NULL_NODE;
4371 }
4372
/* Merge runs of adjacent string nodes inside lists, recursively.
 *
 * Within a NODE_LIST, a string element is concatenated onto the preceding
 * string element when both have the same StrNode flag and the same node
 * status; the now redundant list cell is unlinked and freed.  Other node
 * types are descended into.
 *
 * When USE_CHECK_VALIDITY_OF_STRING_IN_TREE is defined, each resulting
 * string is also validated against `enc`, and
 * ONIGERR_INVALID_WIDE_CHAR_VALUE is returned for an invalid one.
 *
 * Returns 0 on success or a non-zero error code.
 */
static int
reduce_string_list(Node* node, OnigEncoding enc)
{
  int r = 0;

  switch (NODE_TYPE(node)) {
  case NODE_LIST:
    {
      Node* prev;       /* string node that starts the current run, or null */
      Node* curr;       /* element held by the current list cell */
      Node* prev_node;  /* last list cell that was kept */
      Node* next_node;  /* successor, saved before the cell may be freed */

      prev = NULL_NODE;
      do {
        next_node = NODE_CDR(node);
        curr = NODE_CAR(node);
        if (NODE_TYPE(curr) == NODE_STRING) {
          if (IS_NULL(prev)
              || STR_(curr)->flag != STR_(prev)->flag
              || NODE_STATUS(curr) != NODE_STATUS(prev)) {
            /* start a new run at this element */
            prev = curr;
            prev_node = node;
          }
          else {
            /* append to the run, then drop the cell that held curr */
            r = node_str_node_cat(prev, curr);
            if (r != 0) return r;
            remove_from_list(prev_node, node);
            onig_node_free(node);
          }
        }
        else {
          if (IS_NOT_NULL(prev)) {
#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE
            StrNode* sn = STR_(prev);
            if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end))
              return ONIGERR_INVALID_WIDE_CHAR_VALUE;
#endif
            /* a non-string element ends the current run */
            prev = NULL_NODE;
          }
          r = reduce_string_list(curr, enc);
          if (r != 0) return r;
          prev_node = node;
        }

        node = next_node;
      } while (r == 0 && IS_NOT_NULL(node));

#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE
      /* validate the run that reached the end of the list */
      if (IS_NOT_NULL(prev)) {
        StrNode* sn = STR_(prev);
        if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end))
          return ONIGERR_INVALID_WIDE_CHAR_VALUE;
      }
#endif
    }
    break;

  case NODE_ALT:
    do {
      r = reduce_string_list(NODE_CAR(node), enc);
    } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
    break;

#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE
  case NODE_STRING:
    {
      StrNode* sn = STR_(node);
      if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end))
        return ONIGERR_INVALID_WIDE_CHAR_VALUE;
    }
    break;
#endif

  case NODE_ANCHOR:
    if (IS_NULL(NODE_BODY(node)))
      break;
    /* fall */
  case NODE_QUANT:
    r = reduce_string_list(NODE_BODY(node), enc);
    break;

  case NODE_BAG:
    {
      BagNode* en = BAG_(node);

      r = reduce_string_list(NODE_BODY(node), enc);
      if (r != 0) return r;

      if (en->type == BAG_IF_ELSE) {
        if (IS_NOT_NULL(en->te.Then)) {
          r = reduce_string_list(en->te.Then, enc);
          if (r != 0) return r;
        }
        if (IS_NOT_NULL(en->te.Else)) {
          r = reduce_string_list(en->te.Else, enc);
          if (r != 0) return r;
        }
      }
    }
    break;

  default:
    break;
  }

  return r;
}
4481
4482
4483 #define IN_ALT (1<<0)
4484 #define IN_NOT (1<<1)
4485 #define IN_REAL_REPEAT (1<<2)
4486 #define IN_VAR_REPEAT (1<<3)
4487 #define IN_ZERO_REPEAT (1<<4)
4488 #define IN_MULTI_ENTRY (1<<5)
4489 #define IN_PREC_READ (1<<6)
4490 #define IN_LOOK_BEHIND (1<<7)
4491 #define IN_PEEK (1<<8)
4492
4493 /* divide different length alternatives in look-behind.
4494 (?<=A|B) ==> (?<=A)|(?<=B)
4495 (?<!A|B) ==> (?<!A)(?<!B)
4496 */
4497 static int
divide_look_behind_alternatives(Node * node)4498 divide_look_behind_alternatives(Node* node)
4499 {
4500 int r;
4501 int anc_type;
4502 Node *head, *np, *insert_node;
4503 AnchorNode* an;
4504
4505 an = ANCHOR_(node);
4506 anc_type = an->type;
4507
4508 head = NODE_ANCHOR_BODY(an);
4509 np = NODE_CAR(head);
4510 node_swap(node, head);
4511 NODE_CAR(node) = head;
4512 NODE_BODY(head) = np;
4513
4514 np = node;
4515 while (IS_NOT_NULL(np = NODE_CDR(np))) {
4516 r = onig_node_copy(&insert_node, head);
4517 if (r != 0) return r;
4518 CHECK_NULL_RETURN_MEMERR(insert_node);
4519 NODE_BODY(insert_node) = NODE_CAR(np);
4520 NODE_CAR(np) = insert_node;
4521 }
4522
4523 if (anc_type == ANCR_LOOK_BEHIND_NOT) {
4524 np = node;
4525 do {
4526 NODE_SET_TYPE(np, NODE_LIST); /* alt -> list */
4527 } while (IS_NOT_NULL(np = NODE_CDR(np)));
4528 }
4529 return 0;
4530 }
4531
4532 static int
node_reduce_in_look_behind(Node * node)4533 node_reduce_in_look_behind(Node* node)
4534 {
4535 NodeType type;
4536 Node* body;
4537
4538 if (NODE_TYPE(node) != NODE_QUANT) return 0;
4539
4540 body = NODE_BODY(node);
4541 type = NODE_TYPE(body);
4542 if (type == NODE_STRING || type == NODE_CTYPE ||
4543 type == NODE_CCLASS || type == NODE_BACKREF) {
4544 QuantNode* qn = QUANT_(node);
4545 qn->upper = qn->lower;
4546 if (qn->upper == 0)
4547 return 1; /* removed */
4548 }
4549
4550 return 0;
4551 }
4552
4553 static int
list_reduce_in_look_behind(Node * node)4554 list_reduce_in_look_behind(Node* node)
4555 {
4556 int r;
4557
4558 switch (NODE_TYPE(node)) {
4559 case NODE_QUANT:
4560 r = node_reduce_in_look_behind(node);
4561 if (r > 0) r = 0;
4562 break;
4563
4564 case NODE_LIST:
4565 do {
4566 r = node_reduce_in_look_behind(NODE_CAR(node));
4567 if (r <= 0) break;
4568 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4569 break;
4570
4571 default:
4572 r = 0;
4573 break;
4574 }
4575
4576 return r;
4577 }
4578
4579 static int
alt_reduce_in_look_behind(Node * node,regex_t * reg,ParseEnv * env)4580 alt_reduce_in_look_behind(Node* node, regex_t* reg, ParseEnv* env)
4581 {
4582 int r;
4583
4584 switch (NODE_TYPE(node)) {
4585 case NODE_ALT:
4586 do {
4587 r = list_reduce_in_look_behind(NODE_CAR(node));
4588 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4589 break;
4590
4591 default:
4592 r = list_reduce_in_look_behind(node);
4593 break;
4594 }
4595
4596 return r;
4597 }
4598
4599 static int tune_tree(Node* node, regex_t* reg, int state, ParseEnv* env);
4600
/* Tune a look-behind anchor node.
 *
 * Steps, in order: validate the body with check_node_in_look_behind();
 * tune the body with IN_LOOK_BEHIND (plus IN_NOT for a negative
 * look-behind) added to the state; reduce leading quantifiers; compute the
 * body's character length; then either replace the node, split it into
 * per-branch look-behinds, or record the min/max character length and a
 * copy of the tail literal on the anchor.
 *
 * Returns 0 (ONIG_NORMAL) on success,
 * ONIGERR_INVALID_LOOK_BEHIND_PATTERN for an unsupported body, or another
 * non-zero value passed through from a helper.
 */
static int
tune_look_behind(Node* node, regex_t* reg, int state, ParseEnv* env)
{
  int r;
  int state1;
  int used;
  MinMaxCharLen ci;
  Node* body;
  AnchorNode* an = ANCHOR_(node);

  used = FALSE;
  r = check_node_in_look_behind(NODE_ANCHOR_BODY(an),
                                an->type == ANCR_LOOK_BEHIND_NOT ? 1 : 0,
                                &used);
  if (r < 0) return r;
  if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;

  if (an->type == ANCR_LOOK_BEHIND_NOT)
    state1 = state | IN_NOT | IN_LOOK_BEHIND;
  else
    state1 = state | IN_LOOK_BEHIND;

  body = NODE_ANCHOR_BODY(an);
  /* Execute tune_tree(body) before call node_char_len().
     Because case-fold expansion must be done before node_char_len().
   */
  r = tune_tree(body, reg, state1, env);
  if (r != 0) return r;

  r = alt_reduce_in_look_behind(body, reg, env);
  if (r != 0) return r;

  r = node_char_len(body, reg, &ci, env);
  if (r >= 0) {
    /* #177: overflow in onigenc_step_back() */
    if ((ci.max != INFINITE_LEN && ci.max > LOOK_BEHIND_MAX_CHAR_LEN)
      || ci.min > LOOK_BEHIND_MAX_CHAR_LEN) {
      return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
    }

    /* body has a certain minimum length of 0 and `used` was not set by
       the check above: replace the whole node with a fail node (negative
       look-behind) or an empty node (positive look-behind) */
    if (ci.min == 0 && ci.min_is_sure != FALSE && used == FALSE) {
      if (an->type == ANCR_LOOK_BEHIND_NOT)
        r = onig_node_reset_fail(node);
      else
        r = onig_node_reset_empty(node);

      return r;
    }

    if (r == CHAR_LEN_TOP_ALT_FIXED) {
      if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND)) {
        /* split into one look-behind per branch and tune the result */
        r = divide_look_behind_alternatives(node);
        if (r == 0)
          r = tune_tree(node, reg, state, env);
      }
      else if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND))
        goto normal;
      else
        r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
    }
    else { /* CHAR_LEN_NORMAL */
    normal:
      if (ci.min == INFINITE_LEN) {
        r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
      }
      else {
        /* differing min/max needs variable-length look-behind syntax */
        if (ci.min != ci.max &&
            ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND)) {
          r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
        }
        else {
          Node* tail;

          /* check lead_node is already set by double call after
             divide_look_behind_alternatives() */
          if (IS_NULL(an->lead_node)) {
            an->char_min_len = ci.min;
            an->char_max_len = ci.max;
            r = get_tree_tail_literal(body, &tail, reg);
            if (r == GET_VALUE_FOUND) {
              r = onig_node_copy(&(an->lead_node), tail);
              if (r != 0) return r;
            }
          }
          /* any other result of get_tree_tail_literal() is discarded */
          r = ONIG_NORMAL;
        }
      }
    }
  }

  return r;
}
4693
/* Optimize `node` using knowledge of the node that follows it.
 *
 * Memory groups are looked through (their body is examined instead).  For
 * a greedy quantifier with an infinite upper bound:
 *  - with USE_QUANT_PEEK_NEXT, the head literal of `next_node` is stored
 *    in next_head_exact, unless a group flagged CALLED was passed through;
 *  - if lower <= 1, the body is a strict real node, and the head literals
 *    of the body and of `next_node` are exclusive, the quantifier is
 *    wrapped in a BAG_STOP_BACKTRACK bag.
 *
 * Returns 0, or a memory error via CHECK_NULL_RETURN_MEMERR when the bag
 * cannot be allocated.
 */
static int
tune_next(Node* node, Node* next_node, regex_t* reg)
{
  int called;
  NodeType type;

  called = FALSE;

 retry:
  type = NODE_TYPE(node);
  if (type == NODE_QUANT) {
    QuantNode* qn = QUANT_(node);
    if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) {
#ifdef USE_QUANT_PEEK_NEXT
      if (called == FALSE) {
        Node* n = get_tree_head_literal(next_node, 1, reg);
        /* '\0': for UTF-16BE etc... */
        if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') {
          qn->next_head_exact = n;
        }
      }
#endif
      /* automatic possessification a*b ==> (?>a*)b */
      if (qn->lower <= 1) {
        if (is_strict_real_node(NODE_BODY(node))) {
          Node *x, *y;
          x = get_tree_head_literal(NODE_BODY(node), 0, reg);
          if (IS_NOT_NULL(x)) {
            y = get_tree_head_literal(next_node, 0, reg);
            if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) {
              Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK);
              CHECK_NULL_RETURN_MEMERR(en);
              NODE_STATUS_ADD(en, STRICT_REAL_REPEAT);
              /* presumably exchanges the contents of the two nodes, so
                 that `node` becomes the bag and `en` the quantifier */
              node_swap(node, en);
              NODE_BODY(node) = en;
            }
          }
        }
      }
    }
  }
  else if (type == NODE_BAG) {
    BagNode* en = BAG_(node);
    if (en->type == BAG_MEMORY) {
      /* remember that a called group was passed on the way down */
      if (NODE_IS_CALLED(node))
        called = TRUE;
      node = NODE_BODY(node);
      goto retry;
    }
  }
  return 0;
}
4746
4747
4748 static int
is_all_code_len_1_items(int n,OnigCaseFoldCodeItem items[])4749 is_all_code_len_1_items(int n, OnigCaseFoldCodeItem items[])
4750 {
4751 int i;
4752
4753 for (i = 0; i < n; i++) {
4754 OnigCaseFoldCodeItem* item = items + i;
4755 if (item->code_len != 1) return 0;
4756 }
4757
4758 return 1;
4759 }
4760
4761 static int
get_min_max_byte_len_case_fold_items(int n,OnigCaseFoldCodeItem items[],OnigLen * rmin,OnigLen * rmax)4762 get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[],
4763 OnigLen* rmin, OnigLen* rmax)
4764 {
4765 int i;
4766 OnigLen len, minlen, maxlen;
4767
4768 minlen = INFINITE_LEN;
4769 maxlen = 0;
4770 for (i = 0; i < n; i++) {
4771 OnigCaseFoldCodeItem* item = items + i;
4772
4773 len = item->byte_len;
4774 if (len < minlen) minlen = len;
4775 if (len > maxlen) maxlen = len;
4776 }
4777
4778 *rmin = minlen;
4779 *rmax = maxlen;
4780 return 0;
4781 }
4782
4783 static int
make_code_list_to_string(Node ** rnode,OnigEncoding enc,int n,OnigCodePoint codes[])4784 make_code_list_to_string(Node** rnode, OnigEncoding enc,
4785 int n, OnigCodePoint codes[])
4786 {
4787 int r, i, len;
4788 Node* node;
4789 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4790
4791 *rnode = NULL_NODE;
4792 node = onig_node_new_str(NULL, NULL);
4793 CHECK_NULL_RETURN_MEMERR(node);
4794
4795 for (i = 0; i < n; i++) {
4796 len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf);
4797 if (len < 0) {
4798 r = len;
4799 goto err;
4800 }
4801
4802 r = onig_node_str_cat(node, buf, buf + len);
4803 if (r != 0) goto err;
4804 }
4805
4806 *rnode = node;
4807 return 0;
4808
4809 err:
4810 onig_node_free(node);
4811 return r;
4812 }
4813
4814 static int
unravel_cf_node_add(Node ** rlist,Node * add)4815 unravel_cf_node_add(Node** rlist, Node* add)
4816 {
4817 Node *list;
4818
4819 list = *rlist;
4820 if (IS_NULL(list)) {
4821 list = onig_node_new_list(add, NULL);
4822 CHECK_NULL_RETURN_MEMERR(list);
4823 *rlist = list;
4824 }
4825 else {
4826 Node* r = node_list_add(list, add);
4827 CHECK_NULL_RETURN_MEMERR(r);
4828 }
4829
4830 return 0;
4831 }
4832
4833 static int
unravel_cf_string_add(Node ** rlist,Node ** rsn,UChar * s,UChar * end,unsigned int flag)4834 unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end,
4835 unsigned int flag)
4836 {
4837 int r;
4838 Node *sn, *list;
4839
4840 list = *rlist;
4841 sn = *rsn;
4842
4843 if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) {
4844 r = onig_node_str_cat(sn, s, end);
4845 }
4846 else {
4847 sn = onig_node_new_str(s, end);
4848 CHECK_NULL_RETURN_MEMERR(sn);
4849
4850 STR_(sn)->flag = flag;
4851 r = unravel_cf_node_add(&list, sn);
4852 }
4853
4854 if (r == 0) {
4855 *rlist = list;
4856 *rsn = sn;
4857 }
4858 return r;
4859 }
4860
4861 static int
unravel_cf_string_alt_or_cc_add(Node ** rlist,int n,OnigCaseFoldCodeItem items[],OnigEncoding enc,OnigCaseFoldType case_fold_flag,UChar * s,UChar * end)4862 unravel_cf_string_alt_or_cc_add(Node** rlist, int n,
4863 OnigCaseFoldCodeItem items[], OnigEncoding enc,
4864 OnigCaseFoldType case_fold_flag, UChar* s, UChar* end)
4865 {
4866 int r, i;
4867 Node* node;
4868
4869 if (is_all_code_len_1_items(n, items)) {
4870 OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */
4871
4872 codes[0] = ONIGENC_MBC_TO_CODE(enc, s, end);
4873 for (i = 0; i < n; i++) {
4874 OnigCaseFoldCodeItem* item = items + i;
4875 codes[i+1] = item->code[0];
4876 }
4877 r = onig_new_cclass_with_code_list(&node, enc, n + 1, codes);
4878 if (r != 0) return r;
4879 }
4880 else {
4881 Node *snode, *alt, *curr;
4882
4883 snode = onig_node_new_str(s, end);
4884 CHECK_NULL_RETURN_MEMERR(snode);
4885 node = curr = onig_node_new_alt(snode, NULL_NODE);
4886 if (IS_NULL(curr)) {
4887 onig_node_free(snode);
4888 return ONIGERR_MEMORY;
4889 }
4890
4891 r = 0;
4892 for (i = 0; i < n; i++) {
4893 OnigCaseFoldCodeItem* item = items + i;
4894 r = make_code_list_to_string(&snode, enc, item->code_len, item->code);
4895 if (r != 0) {
4896 onig_node_free(node);
4897 return r;
4898 }
4899
4900 alt = onig_node_new_alt(snode, NULL_NODE);
4901 if (IS_NULL(alt)) {
4902 onig_node_free(snode);
4903 onig_node_free(node);
4904 return ONIGERR_MEMORY;
4905 }
4906
4907 NODE_CDR(curr) = alt;
4908 curr = alt;
4909 }
4910 }
4911
4912 r = unravel_cf_node_add(rlist, node);
4913 if (r != 0) onig_node_free(node);
4914 return r;
4915 }
4916
4917 static int
unravel_cf_look_behind_add(Node ** rlist,Node ** rsn,int n,OnigCaseFoldCodeItem items[],OnigEncoding enc,UChar * s,OnigLen one_len)4918 unravel_cf_look_behind_add(Node** rlist, Node** rsn,
4919 int n, OnigCaseFoldCodeItem items[], OnigEncoding enc,
4920 UChar* s, OnigLen one_len)
4921 {
4922 int r, i, found;
4923
4924 found = FALSE;
4925 for (i = 0; i < n; i++) {
4926 OnigCaseFoldCodeItem* item = items + i;
4927 if (item->byte_len == one_len) {
4928 if (item->code_len == 1) {
4929 found = TRUE;
4930 break;
4931 }
4932 }
4933 }
4934
4935 if (found == FALSE) {
4936 r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */);
4937 }
4938 else {
4939 Node* node;
4940 OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */
4941
4942 found = 0;
4943 codes[found++] = ONIGENC_MBC_TO_CODE(enc, s, s + one_len);
4944 for (i = 0; i < n; i++) {
4945 OnigCaseFoldCodeItem* item = items + i;
4946 if (item->byte_len == one_len) {
4947 if (item->code_len == 1) {
4948 codes[found++] = item->code[0];
4949 }
4950 }
4951 }
4952 r = onig_new_cclass_with_code_list(&node, enc, found, codes);
4953 if (r != 0) return r;
4954
4955 r = unravel_cf_node_add(rlist, node);
4956 if (r != 0) onig_node_free(node);
4957
4958 *rsn = NULL_NODE;
4959 }
4960
4961 return r;
4962 }
4963
/* Expand a case-insensitive string node into explicit nodes.
 *
 * The string is scanned one position at a time.  Positions with no
 * case-fold items are gathered into plain string nodes; positions with
 * items become a character class or an alternation (or, inside a
 * look-behind, a character class restricted to same-length single-code
 * items).  The pieces are collected in a list which finally replaces the
 * contents of `node`.
 *
 * Does nothing if the node is already flagged as case-expanded.  The
 * IGNORECASE status is removed from the node in every other case, even
 * when the string is empty.
 *
 * Returns 0 on success or a non-zero error code; partial results are freed
 * on failure.
 */
static int
unravel_case_fold_string(Node* node, regex_t* reg, int state)
{
  int r, n, in_look_behind;
  OnigLen min_len, max_len, one_len;
  UChar *start, *end, *p, *q;
  StrNode* snode;
  Node *sn, *list;
  OnigEncoding enc;
  OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];

  if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0;

  NODE_STATUS_REMOVE(node, IGNORECASE);
  snode = STR_(node);
  start = snode->s;
  end   = snode->end;
  if (start >= end) return 0;

  in_look_behind = (state & IN_LOOK_BEHIND) != 0;
  enc = reg->enc;

  /* list: pieces built so far; sn: string node that plain text may still
     be appended to, or null */
  list = sn = NULL_NODE;
  p = start;
  while (p < end) {
    n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end,
                                           items);
    if (n < 0) {
      r = n;
      goto err;
    }

    one_len = (OnigLen )enclen(enc, p);
    if (n == 0) {
      /* no case variants: copy one character, clamped to the string end */
      q = p + one_len;
      if (q > end) q = end;
      r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */);
      if (r != 0) goto err;
    }
    else {
      if (in_look_behind != 0) {
        q = p + one_len;
        /* the first item spans a different byte length than one
           character: query again with the range limited to [p, q) */
        if (items[0].byte_len != one_len) {
          r = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, q,
                                                 items);
          if (r < 0) goto err;
          n = r;
        }
        r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len);
        if (r != 0) goto err;
      }
      else {
        /* all items are expected to cover the same number of bytes */
        get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len);
        if (min_len != max_len) {
          r = ONIGERR_PARSER_BUG;
          goto err;
        }

        q = p + max_len;
        r = unravel_cf_string_alt_or_cc_add(&list, n, items, enc,
                                            reg->case_fold_flag, p, q);
        if (r != 0) goto err;
        /* the next plain text must start a new string node */
        sn = NULL_NODE;
      }
    }

    p = q;
  }

  if (IS_NOT_NULL(list)) {
    /* a one-element list is unwrapped; the swapped-out old contents of
       `node` are released together with the list */
    if (node_list_len(list) == 1) {
      node_swap(node, NODE_CAR(list));
    }
    else {
      node_swap(node, list);
    }
    onig_node_free(list);
  }
  else {
    node_swap(node, sn);
    onig_node_free(sn);
  }
  return 0;

 err:
  if (IS_NOT_NULL(list))
    onig_node_free(list);
  else if (IS_NOT_NULL(sn))
    onig_node_free(sn);

  return r;
}
5056
#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT
/* Classify the body of a quantifier that may match empty.
 *
 * Returns BODY_MAY_BE_EMPTY by default, BODY_MAY_BE_EMPTY_MEM when a
 * memory group is found, and BODY_MAY_BE_EMPTY_REC when a recursive group
 * (or, with USE_CALL, a recursive call) is found.  For lists, alternations
 * and if-else bags the largest value among the parts is returned.
 */
static enum BodyEmptyType
quantifiers_memory_node_info(Node* node)
{
  int info = BODY_MAY_BE_EMPTY;
  int sub;

  switch (NODE_TYPE(node)) {
  case NODE_LIST:
  case NODE_ALT:
    for ( ; IS_NOT_NULL(node); node = NODE_CDR(node)) {
      sub = quantifiers_memory_node_info(NODE_CAR(node));
      if (sub > info) info = sub;
    }
    break;

#ifdef USE_CALL
  case NODE_CALL:
    if (NODE_IS_RECURSION(node))
      return BODY_MAY_BE_EMPTY_REC; /* tiny version */

    info = quantifiers_memory_node_info(NODE_BODY(node));
    break;
#endif

  case NODE_QUANT:
    /* a quantifier with upper == 0 keeps the default */
    if (QUANT_(node)->upper != 0)
      info = quantifiers_memory_node_info(NODE_BODY(node));
    break;

  case NODE_BAG:
    {
      BagNode* bag = BAG_(node);

      if (bag->type == BAG_MEMORY) {
        return NODE_IS_RECURSION(node) ? BODY_MAY_BE_EMPTY_REC
                                       : BODY_MAY_BE_EMPTY_MEM;
      }
      else if (bag->type == BAG_OPTION || bag->type == BAG_STOP_BACKTRACK) {
        info = quantifiers_memory_node_info(NODE_BODY(node));
      }
      else if (bag->type == BAG_IF_ELSE) {
        info = quantifiers_memory_node_info(NODE_BODY(node));
        if (IS_NOT_NULL(bag->te.Then)) {
          sub = quantifiers_memory_node_info(bag->te.Then);
          if (sub > info) info = sub;
        }
        if (IS_NOT_NULL(bag->te.Else)) {
          sub = quantifiers_memory_node_info(bag->te.Else);
          if (sub > info) info = sub;
        }
      }
    }
    break;

  default:
    /* back-references, strings, ctypes, char classes, anchors and
       gimmicks keep the default */
    break;
  }

  return info;
}
#endif /* USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT */
5140
5141
5142 #ifdef USE_CALL
5143
5144 #ifdef __GNUC__
5145 __inline
5146 #endif
5147 static int
check_call_reference(CallNode * cn,ParseEnv * env,int state)5148 check_call_reference(CallNode* cn, ParseEnv* env, int state)
5149 {
5150 MemEnv* mem_env = PARSEENV_MEMENV(env);
5151
5152 if (cn->by_number != 0) {
5153 int gnum = cn->called_gnum;
5154
5155 if (env->num_named > 0 &&
5156 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5157 ! OPTON_CAPTURE_GROUP(env->options)) {
5158 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
5159 }
5160
5161 if (gnum > env->num_mem) {
5162 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_GROUP_REFERENCE,
5163 cn->name, cn->name_end);
5164 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
5165 }
5166
5167 set_call_attr:
5168 NODE_CALL_BODY(cn) = mem_env[cn->called_gnum].mem_node;
5169 if (IS_NULL(NODE_CALL_BODY(cn))) {
5170 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
5171 cn->name, cn->name_end);
5172 return ONIGERR_UNDEFINED_NAME_REFERENCE;
5173 }
5174
5175 NODE_STATUS_ADD(NODE_CALL_BODY(cn), REFERENCED);
5176 }
5177 else {
5178 int *refs;
5179
5180 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs);
5181 if (n <= 0) {
5182 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
5183 cn->name, cn->name_end);
5184 return ONIGERR_UNDEFINED_NAME_REFERENCE;
5185 }
5186 else if (n > 1) {
5187 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL,
5188 cn->name, cn->name_end);
5189 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
5190 }
5191 else {
5192 cn->called_gnum = refs[0];
5193 goto set_call_attr;
5194 }
5195 }
5196
5197 return 0;
5198 }
5199
5200 static void
tune_call2_call(Node * node)5201 tune_call2_call(Node* node)
5202 {
5203 switch (NODE_TYPE(node)) {
5204 case NODE_LIST:
5205 case NODE_ALT:
5206 do {
5207 tune_call2_call(NODE_CAR(node));
5208 } while (IS_NOT_NULL(node = NODE_CDR(node)));
5209 break;
5210
5211 case NODE_QUANT:
5212 tune_call2_call(NODE_BODY(node));
5213 break;
5214
5215 case NODE_ANCHOR:
5216 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
5217 tune_call2_call(NODE_BODY(node));
5218 break;
5219
5220 case NODE_BAG:
5221 {
5222 BagNode* en = BAG_(node);
5223
5224 if (en->type == BAG_MEMORY) {
5225 if (! NODE_IS_MARK1(node)) {
5226 NODE_STATUS_ADD(node, MARK1);
5227 tune_call2_call(NODE_BODY(node));
5228 NODE_STATUS_REMOVE(node, MARK1);
5229 }
5230 }
5231 else if (en->type == BAG_IF_ELSE) {
5232 tune_call2_call(NODE_BODY(node));
5233 if (IS_NOT_NULL(en->te.Then))
5234 tune_call2_call(en->te.Then);
5235 if (IS_NOT_NULL(en->te.Else))
5236 tune_call2_call(en->te.Else);
5237 }
5238 else {
5239 tune_call2_call(NODE_BODY(node));
5240 }
5241 }
5242 break;
5243
5244 case NODE_CALL:
5245 if (! NODE_IS_MARK1(node)) {
5246 NODE_STATUS_ADD(node, MARK1);
5247 {
5248 CallNode* cn = CALL_(node);
5249 Node* called = NODE_CALL_BODY(cn);
5250
5251 cn->entry_count++;
5252
5253 NODE_STATUS_ADD(called, CALLED);
5254 BAG_(called)->m.entry_count++;
5255 tune_call2_call(called);
5256 }
5257 NODE_STATUS_REMOVE(node, MARK1);
5258 }
5259 break;
5260
5261 default:
5262 break;
5263 }
5264 }
5265
5266 static int
tune_call(Node * node,ParseEnv * env,int state)5267 tune_call(Node* node, ParseEnv* env, int state)
5268 {
5269 int r;
5270
5271 switch (NODE_TYPE(node)) {
5272 case NODE_LIST:
5273 case NODE_ALT:
5274 do {
5275 r = tune_call(NODE_CAR(node), env, state);
5276 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5277 break;
5278
5279 case NODE_QUANT:
5280 if (QUANT_(node)->upper == 0)
5281 state |= IN_ZERO_REPEAT;
5282
5283 r = tune_call(NODE_BODY(node), env, state);
5284 break;
5285
5286 case NODE_ANCHOR:
5287 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
5288 r = tune_call(NODE_BODY(node), env, state);
5289 else
5290 r = 0;
5291 break;
5292
5293 case NODE_BAG:
5294 {
5295 BagNode* en = BAG_(node);
5296
5297 if (en->type == BAG_MEMORY) {
5298 if ((state & IN_ZERO_REPEAT) != 0) {
5299 NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
5300 BAG_(node)->m.entry_count--;
5301 }
5302 r = tune_call(NODE_BODY(node), env, state);
5303 }
5304 else if (en->type == BAG_IF_ELSE) {
5305 r = tune_call(NODE_BODY(node), env, state);
5306 if (r != 0) return r;
5307 if (IS_NOT_NULL(en->te.Then)) {
5308 r = tune_call(en->te.Then, env, state);
5309 if (r != 0) return r;
5310 }
5311 if (IS_NOT_NULL(en->te.Else))
5312 r = tune_call(en->te.Else, env, state);
5313 }
5314 else
5315 r = tune_call(NODE_BODY(node), env, state);
5316 }
5317 break;
5318
5319 case NODE_CALL:
5320 if ((state & IN_ZERO_REPEAT) != 0) {
5321 NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
5322 CALL_(node)->entry_count--;
5323 }
5324
5325 r = check_call_reference(CALL_(node), env, state);
5326 break;
5327
5328 default:
5329 r = 0;
5330 break;
5331 }
5332
5333 return r;
5334 }
5335
5336 static int
tune_call2(Node * node)5337 tune_call2(Node* node)
5338 {
5339 int r = 0;
5340
5341 switch (NODE_TYPE(node)) {
5342 case NODE_LIST:
5343 case NODE_ALT:
5344 do {
5345 r = tune_call2(NODE_CAR(node));
5346 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5347 break;
5348
5349 case NODE_QUANT:
5350 if (QUANT_(node)->upper != 0)
5351 r = tune_call2(NODE_BODY(node));
5352 break;
5353
5354 case NODE_ANCHOR:
5355 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
5356 r = tune_call2(NODE_BODY(node));
5357 break;
5358
5359 case NODE_BAG:
5360 if (! NODE_IS_IN_ZERO_REPEAT(node))
5361 r = tune_call2(NODE_BODY(node));
5362
5363 {
5364 BagNode* en = BAG_(node);
5365
5366 if (r != 0) return r;
5367 if (en->type == BAG_IF_ELSE) {
5368 if (IS_NOT_NULL(en->te.Then)) {
5369 r = tune_call2(en->te.Then);
5370 if (r != 0) return r;
5371 }
5372 if (IS_NOT_NULL(en->te.Else))
5373 r = tune_call2(en->te.Else);
5374 }
5375 }
5376 break;
5377
5378 case NODE_CALL:
5379 if (! NODE_IS_IN_ZERO_REPEAT(node)) {
5380 tune_call2_call(node);
5381 }
5382 break;
5383
5384 default:
5385 break;
5386 }
5387
5388 return r;
5389 }
5390
5391
5392 static void
tune_called_state_call(Node * node,int state)5393 tune_called_state_call(Node* node, int state)
5394 {
5395 switch (NODE_TYPE(node)) {
5396 case NODE_ALT:
5397 state |= IN_ALT;
5398 /* fall */
5399 case NODE_LIST:
5400 do {
5401 tune_called_state_call(NODE_CAR(node), state);
5402 } while (IS_NOT_NULL(node = NODE_CDR(node)));
5403 break;
5404
5405 case NODE_QUANT:
5406 {
5407 QuantNode* qn = QUANT_(node);
5408
5409 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5410 state |= IN_REAL_REPEAT;
5411 if (qn->lower != qn->upper)
5412 state |= IN_VAR_REPEAT;
5413 if ((state & IN_PEEK) != 0)
5414 NODE_STATUS_ADD(node, INPEEK);
5415
5416 tune_called_state_call(NODE_QUANT_BODY(qn), state);
5417 }
5418 break;
5419
5420 case NODE_ANCHOR:
5421 {
5422 AnchorNode* an = ANCHOR_(node);
5423
5424 switch (an->type) {
5425 case ANCR_PREC_READ_NOT:
5426 case ANCR_LOOK_BEHIND_NOT:
5427 state |= (IN_NOT | IN_PEEK);
5428 tune_called_state_call(NODE_ANCHOR_BODY(an), state);
5429 break;
5430 case ANCR_PREC_READ:
5431 case ANCR_LOOK_BEHIND:
5432 state |= IN_PEEK;
5433 tune_called_state_call(NODE_ANCHOR_BODY(an), state);
5434 break;
5435 default:
5436 break;
5437 }
5438 }
5439 break;
5440
5441 case NODE_BAG:
5442 {
5443 BagNode* en = BAG_(node);
5444
5445 if (en->type == BAG_MEMORY) {
5446 if (NODE_IS_MARK1(node)) {
5447 if ((~en->m.called_state & state) != 0) {
5448 en->m.called_state |= state;
5449 tune_called_state_call(NODE_BODY(node), state);
5450 }
5451 }
5452 else {
5453 NODE_STATUS_ADD(node, MARK1);
5454 en->m.called_state |= state;
5455 tune_called_state_call(NODE_BODY(node), state);
5456 NODE_STATUS_REMOVE(node, MARK1);
5457 }
5458 }
5459 else if (en->type == BAG_IF_ELSE) {
5460 state |= IN_ALT;
5461 tune_called_state_call(NODE_BODY(node), state);
5462 if (IS_NOT_NULL(en->te.Then)) {
5463 tune_called_state_call(en->te.Then, state);
5464 }
5465 if (IS_NOT_NULL(en->te.Else))
5466 tune_called_state_call(en->te.Else, state);
5467 }
5468 else {
5469 tune_called_state_call(NODE_BODY(node), state);
5470 }
5471 }
5472 break;
5473
5474 case NODE_CALL:
5475 if ((state & IN_PEEK) != 0)
5476 NODE_STATUS_ADD(node, INPEEK);
5477 if ((state & IN_REAL_REPEAT) != 0)
5478 NODE_STATUS_ADD(node, IN_REAL_REPEAT);
5479
5480 tune_called_state_call(NODE_BODY(node), state);
5481 break;
5482
5483 default:
5484 break;
5485 }
5486 }
5487
5488 static void
tune_called_state(Node * node,int state)5489 tune_called_state(Node* node, int state)
5490 {
5491 switch (NODE_TYPE(node)) {
5492 case NODE_ALT:
5493 state |= IN_ALT;
5494 /* fall */
5495 case NODE_LIST:
5496 do {
5497 tune_called_state(NODE_CAR(node), state);
5498 } while (IS_NOT_NULL(node = NODE_CDR(node)));
5499 break;
5500
5501 #ifdef USE_CALL
5502 case NODE_CALL:
5503 if ((state & IN_PEEK) != 0)
5504 NODE_STATUS_ADD(node, INPEEK);
5505 if ((state & IN_REAL_REPEAT) != 0)
5506 NODE_STATUS_ADD(node, IN_REAL_REPEAT);
5507
5508 tune_called_state_call(node, state);
5509 break;
5510 #endif
5511
5512 case NODE_BAG:
5513 {
5514 BagNode* en = BAG_(node);
5515
5516 switch (en->type) {
5517 case BAG_MEMORY:
5518 if (en->m.entry_count > 1)
5519 state |= IN_MULTI_ENTRY;
5520
5521 en->m.called_state |= state;
5522 /* fall */
5523 case BAG_OPTION:
5524 case BAG_STOP_BACKTRACK:
5525 tune_called_state(NODE_BODY(node), state);
5526 break;
5527 case BAG_IF_ELSE:
5528 state |= IN_ALT;
5529 tune_called_state(NODE_BODY(node), state);
5530 if (IS_NOT_NULL(en->te.Then))
5531 tune_called_state(en->te.Then, state);
5532 if (IS_NOT_NULL(en->te.Else))
5533 tune_called_state(en->te.Else, state);
5534 break;
5535 }
5536 }
5537 break;
5538
5539 case NODE_QUANT:
5540 {
5541 QuantNode* qn = QUANT_(node);
5542
5543 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5544 state |= IN_REAL_REPEAT;
5545 if (qn->lower != qn->upper)
5546 state |= IN_VAR_REPEAT;
5547 if ((state & IN_PEEK) != 0)
5548 NODE_STATUS_ADD(node, INPEEK);
5549
5550 tune_called_state(NODE_QUANT_BODY(qn), state);
5551 }
5552 break;
5553
5554 case NODE_ANCHOR:
5555 {
5556 AnchorNode* an = ANCHOR_(node);
5557
5558 switch (an->type) {
5559 case ANCR_PREC_READ_NOT:
5560 case ANCR_LOOK_BEHIND_NOT:
5561 state |= (IN_NOT | IN_PEEK);
5562 tune_called_state(NODE_ANCHOR_BODY(an), state);
5563 break;
5564 case ANCR_PREC_READ:
5565 case ANCR_LOOK_BEHIND:
5566 state |= IN_PEEK;
5567 tune_called_state(NODE_ANCHOR_BODY(an), state);
5568 break;
5569 default:
5570 break;
5571 }
5572 }
5573 break;
5574
5575 case NODE_BACKREF:
5576 case NODE_STRING:
5577 case NODE_CTYPE:
5578 case NODE_CCLASS:
5579 case NODE_GIMMICK:
5580 default:
5581 break;
5582 }
5583 }
5584
5585 #endif /* USE_CALL */
5586
5587
5588 #ifdef __GNUC__
5589 __inline
5590 #endif
5591 static int
tune_anchor(Node * node,regex_t * reg,int state,ParseEnv * env)5592 tune_anchor(Node* node, regex_t* reg, int state, ParseEnv* env)
5593 {
5594 int r;
5595 AnchorNode* an = ANCHOR_(node);
5596
5597 switch (an->type) {
5598 case ANCR_PREC_READ:
5599 r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_PREC_READ), env);
5600 break;
5601 case ANCR_PREC_READ_NOT:
5602 r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_PREC_READ | IN_NOT),
5603 env);
5604 break;
5605
5606 case ANCR_LOOK_BEHIND:
5607 case ANCR_LOOK_BEHIND_NOT:
5608 r = tune_look_behind(node, reg, state, env);
5609 break;
5610
5611 default:
5612 r = 0;
5613 break;
5614 }
5615
5616 return r;
5617 }
5618
5619 #ifdef __GNUC__
5620 __inline
5621 #endif
5622 static int
tune_quant(Node * node,regex_t * reg,int state,ParseEnv * env)5623 tune_quant(Node* node, regex_t* reg, int state, ParseEnv* env)
5624 {
5625 int r;
5626 QuantNode* qn = QUANT_(node);
5627 Node* body = NODE_BODY(node);
5628
5629 if ((state & IN_REAL_REPEAT) != 0) {
5630 NODE_STATUS_ADD(node, IN_REAL_REPEAT);
5631 }
5632 if ((state & IN_MULTI_ENTRY) != 0) {
5633 NODE_STATUS_ADD(node, IN_MULTI_ENTRY);
5634 }
5635
5636 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) {
5637 OnigLen d = node_min_byte_len(body, env);
5638 if (d == 0) {
5639 #ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT
5640 qn->emptiness = quantifiers_memory_node_info(body);
5641 #else
5642 qn->emptiness = BODY_MAY_BE_EMPTY;
5643 #endif
5644 }
5645 }
5646
5647 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5648 state |= IN_REAL_REPEAT;
5649 if (qn->lower != qn->upper)
5650 state |= IN_VAR_REPEAT;
5651
5652 r = tune_tree(body, reg, state, env);
5653 if (r != 0) return r;
5654
5655 /* expand string */
5656 #define EXPAND_STRING_MAX_LENGTH 100
5657 if (NODE_TYPE(body) == NODE_STRING) {
5658 if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper &&
5659 qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
5660 int len = NODE_STRING_LEN(body);
5661
5662 if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
5663 int i, n = qn->lower;
5664 node_conv_to_str_node(node, body);
5665 for (i = 0; i < n; i++) {
5666 r = node_str_node_cat(node, body);
5667 if (r != 0) return r;
5668 }
5669 onig_node_free(body);
5670 return r;
5671 }
5672 }
5673 }
5674
5675 if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) {
5676 if (NODE_TYPE(body) == NODE_QUANT) {
5677 QuantNode* tqn = QUANT_(body);
5678 if (IS_NOT_NULL(tqn->head_exact)) {
5679 qn->head_exact = tqn->head_exact;
5680 tqn->head_exact = NULL;
5681 }
5682 }
5683 else {
5684 qn->head_exact = get_tree_head_literal(NODE_BODY(node), 1, reg);
5685 }
5686 }
5687
5688 return r;
5689 }
5690
5691 /* tune_tree does the following work.
5692 1. check empty loop. (set qn->emptiness)
5693 2. expand ignore-case in char class.
5694 3. set memory status bit flags. (reg->mem_stats)
5695 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
5696 5. find invalid patterns in look-behind.
5697 6. expand repeated string.
5698 */
5699 static int
tune_tree(Node * node,regex_t * reg,int state,ParseEnv * env)5700 tune_tree(Node* node, regex_t* reg, int state, ParseEnv* env)
5701 {
5702 int r = 0;
5703
5704 switch (NODE_TYPE(node)) {
5705 case NODE_LIST:
5706 {
5707 Node* prev = NULL_NODE;
5708 do {
5709 r = tune_tree(NODE_CAR(node), reg, state, env);
5710 if (IS_NOT_NULL(prev) && r == 0) {
5711 r = tune_next(prev, NODE_CAR(node), reg);
5712 }
5713 prev = NODE_CAR(node);
5714 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5715 }
5716 break;
5717
5718 case NODE_ALT:
5719 do {
5720 r = tune_tree(NODE_CAR(node), reg, (state | IN_ALT), env);
5721 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5722 break;
5723
5724 case NODE_STRING:
5725 if (NODE_IS_REAL_IGNORECASE(node)) {
5726 r = unravel_case_fold_string(node, reg, state);
5727 }
5728 break;
5729
5730 case NODE_BACKREF:
5731 {
5732 int i;
5733 int* p;
5734 BackRefNode* br = BACKREF_(node);
5735 p = BACKREFS_P(br);
5736 for (i = 0; i < br->back_num; i++) {
5737 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
5738 MEM_STATUS_ON(env->backrefed_mem, p[i]);
5739 #if 0
5740 #ifdef USE_BACKREF_WITH_LEVEL
5741 if (NODE_IS_NEST_LEVEL(node)) {
5742 MEM_STATUS_ON(env->backtrack_mem, p[i]);
5743 }
5744 #endif
5745 #else
5746 /* More precisely, it should be checked whether alt/repeat exists before
5747 the subject capture node, and then this backreference position
5748 exists before (or in) the capture node. */
5749 MEM_STATUS_ON(env->backtrack_mem, p[i]);
5750 #endif
5751 }
5752 }
5753 break;
5754
5755 case NODE_BAG:
5756 {
5757 BagNode* en = BAG_(node);
5758
5759 switch (en->type) {
5760 case BAG_OPTION:
5761 {
5762 OnigOptionType options = reg->options;
5763 reg->options = BAG_(node)->o.options;
5764 r = tune_tree(NODE_BODY(node), reg, state, env);
5765 reg->options = options;
5766 }
5767 break;
5768
5769 case BAG_MEMORY:
5770 #ifdef USE_CALL
5771 state |= en->m.called_state;
5772 #endif
5773
5774 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_MULTI_ENTRY)) != 0
5775 || NODE_IS_RECURSION(node)) {
5776 MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
5777 }
5778 r = tune_tree(NODE_BODY(node), reg, state, env);
5779 break;
5780
5781 case BAG_STOP_BACKTRACK:
5782 {
5783 Node* target = NODE_BODY(node);
5784 r = tune_tree(target, reg, state, env);
5785 if (NODE_TYPE(target) == NODE_QUANT) {
5786 QuantNode* tqn = QUANT_(target);
5787 if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 &&
5788 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
5789 if (is_strict_real_node(NODE_BODY(target)))
5790 NODE_STATUS_ADD(node, STRICT_REAL_REPEAT);
5791 }
5792 }
5793 }
5794 break;
5795
5796 case BAG_IF_ELSE:
5797 r = tune_tree(NODE_BODY(node), reg, (state | IN_ALT), env);
5798 if (r != 0) return r;
5799 if (IS_NOT_NULL(en->te.Then)) {
5800 r = tune_tree(en->te.Then, reg, (state | IN_ALT), env);
5801 if (r != 0) return r;
5802 }
5803 if (IS_NOT_NULL(en->te.Else))
5804 r = tune_tree(en->te.Else, reg, (state | IN_ALT), env);
5805 break;
5806 }
5807 }
5808 break;
5809
5810 case NODE_QUANT:
5811 if ((state & (IN_PREC_READ | IN_LOOK_BEHIND)) != 0)
5812 NODE_STATUS_ADD(node, INPEEK);
5813
5814 r = tune_quant(node, reg, state, env);
5815 break;
5816
5817 case NODE_ANCHOR:
5818 r = tune_anchor(node, reg, state, env);
5819 break;
5820
5821 #ifdef USE_CALL
5822 case NODE_CALL:
5823 #endif
5824 case NODE_CTYPE:
5825 case NODE_CCLASS:
5826 case NODE_GIMMICK:
5827 default:
5828 break;
5829 }
5830
5831 return r;
5832 }
5833
5834 #ifndef ONIG_DONT_OPTIMIZE
5835 static int
set_sunday_quick_search_or_bmh_skip_table(regex_t * reg,int case_expand,UChar * s,UChar * end,UChar skip[],int * roffset)5836 set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand,
5837 UChar* s, UChar* end,
5838 UChar skip[], int* roffset)
5839 {
5840 int i, j, k, len, offset;
5841 int n, clen;
5842 UChar* p;
5843 OnigEncoding enc;
5844 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
5845 UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
5846
5847 enc = reg->enc;
5848 offset = ENC_GET_SKIP_OFFSET(enc);
5849 if (offset == ENC_SKIP_OFFSET_1_OR_0) {
5850 UChar* p = s;
5851 while (1) {
5852 len = enclen(enc, p);
5853 if (p + len >= end) {
5854 if (len == 1) offset = 1;
5855 else offset = 0;
5856 break;
5857 }
5858 p += len;
5859 }
5860 }
5861
5862 len = (int )(end - s);
5863 if (len + offset >= UCHAR_MAX)
5864 return ONIGERR_PARSER_BUG;
5865
5866 *roffset = offset;
5867
5868 for (i = 0; i < CHAR_MAP_SIZE; i++) {
5869 skip[i] = (UChar )(len + offset);
5870 }
5871
5872 for (p = s; p < end; ) {
5873 int z;
5874
5875 clen = enclen(enc, p);
5876 if (p + clen > end) clen = (int )(end - p);
5877
5878 len = (int )(end - p);
5879 for (j = 0; j < clen; j++) {
5880 z = len - j + (offset - 1);
5881 if (z <= 0) break;
5882 skip[p[j]] = z;
5883 }
5884
5885 if (case_expand != 0) {
5886 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
5887 p, end, items);
5888 for (k = 0; k < n; k++) {
5889 ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf);
5890 for (j = 0; j < clen; j++) {
5891 z = len - j + (offset - 1);
5892 if (z <= 0) break;
5893 if (skip[buf[j]] > z)
5894 skip[buf[j]] = z;
5895 }
5896 }
5897 }
5898
5899 p += clen;
5900 }
5901
5902 return 0;
5903 }
5904 #endif
5905
5906
/* Capacity in bytes of the OptStr.s buffer that holds an exact-string
   candidate. */
#define OPT_EXACT_MAXLEN 24

/* Compile-time guard: the capacity must stay below UCHAR_MAX. */
#if OPT_EXACT_MAXLEN >= UCHAR_MAX
#error Too big OPT_EXACT_MAXLEN
#endif
5912
/* Environment passed to the optimizer helpers. */
typedef struct {
  MinMaxLen mm;                     /* min/max length pair */
  OnigEncoding enc;                 /* encoding; alt_merge_opt_exact() passes it to enclen() */
  OnigCaseFoldType case_fold_flag;
  ParseEnv* scan_env;
} OptEnv;
5919
/* Anchor flag sets, kept separately per side; add_opt_anc_info() uses
   is_left() to decide which member an anchor flag goes into. */
typedef struct {
  int left;   /* anchor flags for which is_left() returns 1 */
  int right;  /* anchor flags for which is_left() returns 0 */
} OptAnc;
5924
/* Exact-string candidate collected by the optimizer. */
typedef struct {
  MinMaxLen mm; /* position */
  OptAnc anc;                 /* anchor flags attached to the string */
  int reach_end;              /* set to 1 by concat_opt_exact_str() when the whole source string was copied */
  int len;                    /* number of bytes stored in s */
  UChar s[OPT_EXACT_MAXLEN];  /* the bytes themselves */
} OptStr;
5932
/* Byte-map candidate collected by the optimizer. */
typedef struct {
  MinMaxLen mm; /* position */
  OptAnc anc;                /* anchor flags attached to the map */
  int value; /* weighted value */
  UChar map[CHAR_MAP_SIZE];  /* map[c] is set to 1 for every byte c added */
} OptMap;
5939
/* Optimization information gathered for one node: its length range, its
   anchors, three exact-string candidates and one byte map. */
typedef struct {
  MinMaxLen len;
  OptAnc anc;
  OptStr sb; /* boundary */
  OptStr sm; /* middle */
  OptStr spr; /* prec read (?=...) */
  OptMap map; /* boundary */
} OptNode;
5948
5949
5950 #ifndef ONIG_DONT_OPTIMIZE
5951
5952 static int
map_position_value(OnigEncoding enc,int i)5953 map_position_value(OnigEncoding enc, int i)
5954 {
5955 static const short int Vals[] = {
5956 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
5957 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5958 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
5959 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
5960 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
5961 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
5962 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
5963 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
5964 };
5965
5966 if (i < (int )(sizeof(Vals)/sizeof(Vals[0]))) {
5967 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
5968 return 20;
5969 else
5970 return (int )Vals[i];
5971 }
5972 else
5973 return 4; /* Take it easy. */
5974 }
5975
5976 static int
distance_value(MinMaxLen * mm)5977 distance_value(MinMaxLen* mm)
5978 {
5979 /* 1000 / (min-max-dist + 1) */
5980 static const short int dist_vals[] = {
5981 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
5982 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
5983 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
5984 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
5985 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
5986 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
5987 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
5988 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
5989 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
5990 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
5991 };
5992
5993 OnigLen d;
5994
5995 if (mm->max == INFINITE_LEN) return 0;
5996
5997 d = mm->max - mm->min;
5998 if (d < (OnigLen )(sizeof(dist_vals)/sizeof(dist_vals[0])))
5999 /* return dist_vals[d] * 16 / (mm->min + 12); */
6000 return (int )dist_vals[d];
6001 else
6002 return 1;
6003 }
6004
6005 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)6006 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
6007 {
6008 if (v2 <= 0) return -1;
6009 if (v1 <= 0) return 1;
6010
6011 v1 *= distance_value(d1);
6012 v2 *= distance_value(d2);
6013
6014 if (v2 > v1) return 1;
6015 if (v2 < v1) return -1;
6016
6017 if (d2->min < d1->min) return 1;
6018 if (d2->min > d1->min) return -1;
6019 return 0;
6020 }
6021
6022 static void
copy_opt_env(OptEnv * to,OptEnv * from)6023 copy_opt_env(OptEnv* to, OptEnv* from)
6024 {
6025 *to = *from;
6026 }
6027
6028 static void
clear_opt_anc_info(OptAnc * a)6029 clear_opt_anc_info(OptAnc* a)
6030 {
6031 a->left = 0;
6032 a->right = 0;
6033 }
6034
6035 static void
copy_opt_anc_info(OptAnc * to,OptAnc * from)6036 copy_opt_anc_info(OptAnc* to, OptAnc* from)
6037 {
6038 *to = *from;
6039 }
6040
6041 static void
concat_opt_anc_info(OptAnc * to,OptAnc * left,OptAnc * right,OnigLen left_len,OnigLen right_len)6042 concat_opt_anc_info(OptAnc* to, OptAnc* left, OptAnc* right,
6043 OnigLen left_len, OnigLen right_len)
6044 {
6045 clear_opt_anc_info(to);
6046
6047 to->left = left->left;
6048 if (left_len == 0) {
6049 to->left |= right->left;
6050 }
6051
6052 to->right = right->right;
6053 if (right_len == 0) {
6054 to->right |= left->right;
6055 }
6056 else {
6057 to->right |= (left->right & ANCR_PREC_READ_NOT);
6058 }
6059 }
6060
6061 static int
is_left(int a)6062 is_left(int a)
6063 {
6064 if (a == ANCR_END_BUF || a == ANCR_SEMI_END_BUF ||
6065 a == ANCR_END_LINE || a == ANCR_PREC_READ || a == ANCR_PREC_READ_NOT)
6066 return 0;
6067
6068 return 1;
6069 }
6070
6071 static int
is_set_opt_anc_info(OptAnc * to,int anc)6072 is_set_opt_anc_info(OptAnc* to, int anc)
6073 {
6074 if ((to->left & anc) != 0) return 1;
6075
6076 return ((to->right & anc) != 0 ? 1 : 0);
6077 }
6078
6079 static void
add_opt_anc_info(OptAnc * to,int anc)6080 add_opt_anc_info(OptAnc* to, int anc)
6081 {
6082 if (is_left(anc))
6083 to->left |= anc;
6084 else
6085 to->right |= anc;
6086 }
6087
6088 static void
remove_opt_anc_info(OptAnc * to,int anc)6089 remove_opt_anc_info(OptAnc* to, int anc)
6090 {
6091 if (is_left(anc))
6092 to->left &= ~anc;
6093 else
6094 to->right &= ~anc;
6095 }
6096
6097 static void
alt_merge_opt_anc_info(OptAnc * to,OptAnc * add)6098 alt_merge_opt_anc_info(OptAnc* to, OptAnc* add)
6099 {
6100 to->left &= add->left;
6101 to->right &= add->right;
6102 }
6103
6104 static int
is_full_opt_exact(OptStr * e)6105 is_full_opt_exact(OptStr* e)
6106 {
6107 return e->len >= OPT_EXACT_MAXLEN;
6108 }
6109
6110 static void
clear_opt_exact(OptStr * e)6111 clear_opt_exact(OptStr* e)
6112 {
6113 mml_clear(&e->mm);
6114 clear_opt_anc_info(&e->anc);
6115 e->reach_end = 0;
6116 e->len = 0;
6117 e->s[0] = '\0';
6118 }
6119
6120 static void
copy_opt_exact(OptStr * to,OptStr * from)6121 copy_opt_exact(OptStr* to, OptStr* from)
6122 {
6123 *to = *from;
6124 }
6125
6126 static int
concat_opt_exact(OptStr * to,OptStr * add,OnigEncoding enc)6127 concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc)
6128 {
6129 int i, j, len, r;
6130 UChar *p, *end;
6131 OptAnc tanc;
6132
6133 r = 0;
6134 p = add->s;
6135 end = p + add->len;
6136 for (i = to->len; p < end; ) {
6137 len = enclen(enc, p);
6138 if (i + len > OPT_EXACT_MAXLEN) {
6139 r = 1; /* 1:full */
6140 break;
6141 }
6142 for (j = 0; j < len && p < end; j++) {
6143 /* coverity[overrun-local] */
6144 to->s[i++] = *p++;
6145 }
6146 }
6147
6148 to->len = i;
6149 to->reach_end = (p == end ? add->reach_end : 0);
6150
6151 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
6152 if (! to->reach_end) tanc.right = 0;
6153 copy_opt_anc_info(&to->anc, &tanc);
6154
6155 return r;
6156 }
6157
6158 static void
concat_opt_exact_str(OptStr * to,UChar * s,UChar * end,OnigEncoding enc)6159 concat_opt_exact_str(OptStr* to, UChar* s, UChar* end, OnigEncoding enc)
6160 {
6161 int i, j, len;
6162 UChar *p;
6163
6164 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
6165 len = enclen(enc, p);
6166 if (i + len > OPT_EXACT_MAXLEN) break;
6167 for (j = 0; j < len && p < end; j++) {
6168 /* coverity[overrun-local] */
6169 to->s[i++] = *p++;
6170 }
6171 }
6172
6173 to->len = i;
6174
6175 if (p >= end)
6176 to->reach_end = 1;
6177 }
6178
6179 static void
alt_merge_opt_exact(OptStr * to,OptStr * add,OptEnv * env)6180 alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env)
6181 {
6182 int i, j, len;
6183
6184 if (add->len == 0 || to->len == 0) {
6185 clear_opt_exact(to);
6186 return ;
6187 }
6188
6189 if (! mml_is_equal(&to->mm, &add->mm)) {
6190 clear_opt_exact(to);
6191 return ;
6192 }
6193
6194 for (i = 0; i < to->len && i < add->len; ) {
6195 if (to->s[i] != add->s[i]) break;
6196 len = enclen(env->enc, to->s + i);
6197
6198 for (j = 1; j < len; j++) {
6199 if (to->s[i+j] != add->s[i+j]) break;
6200 }
6201 if (j < len) break;
6202 i += len;
6203 }
6204
6205 if (! add->reach_end || i < add->len || i < to->len) {
6206 to->reach_end = 0;
6207 }
6208 to->len = i;
6209
6210 alt_merge_opt_anc_info(&to->anc, &add->anc);
6211 if (! to->reach_end) to->anc.right = 0;
6212 }
6213
6214 static void
select_opt_exact(OnigEncoding enc,OptStr * now,OptStr * alt)6215 select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt)
6216 {
6217 int vn, va;
6218
6219 vn = now->len;
6220 va = alt->len;
6221
6222 if (va == 0) {
6223 return ;
6224 }
6225 else if (vn == 0) {
6226 copy_opt_exact(now, alt);
6227 return ;
6228 }
6229 else if (vn <= 2 && va <= 2) {
6230 /* ByteValTable[x] is big value --> low price */
6231 va = map_position_value(enc, now->s[0]);
6232 vn = map_position_value(enc, alt->s[0]);
6233
6234 if (now->len > 1) vn += 5;
6235 if (alt->len > 1) va += 5;
6236 }
6237
6238 vn *= 2;
6239 va *= 2;
6240
6241 if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0)
6242 copy_opt_exact(now, alt);
6243 }
6244
6245 static void
clear_opt_map(OptMap * map)6246 clear_opt_map(OptMap* map)
6247 {
6248 static const OptMap clean_info = {
6249 {0, 0}, {0, 0}, 0,
6250 {
6251 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6252 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6253 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6254 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6255 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6256 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6257 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6258 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6259 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6260 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6261 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6262 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6263 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6264 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6265 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6267 }
6268 };
6269
6270 xmemcpy(map, &clean_info, sizeof(OptMap));
6271 }
6272
6273 static void
copy_opt_map(OptMap * to,OptMap * from)6274 copy_opt_map(OptMap* to, OptMap* from)
6275 {
6276 *to = *from;
6277 }
6278
6279 static void
add_char_opt_map(OptMap * m,UChar c,OnigEncoding enc)6280 add_char_opt_map(OptMap* m, UChar c, OnigEncoding enc)
6281 {
6282 if (m->map[c] == 0) {
6283 m->map[c] = 1;
6284 m->value += map_position_value(enc, c);
6285 }
6286 }
6287
6288 static void
select_opt_map(OptMap * now,OptMap * alt)6289 select_opt_map(OptMap* now, OptMap* alt)
6290 {
6291 static int z = 1<<15; /* 32768: something big value */
6292
6293 int vn, va;
6294
6295 if (alt->value == 0) return ;
6296 if (now->value == 0) {
6297 copy_opt_map(now, alt);
6298 return ;
6299 }
6300
6301 vn = z / now->value;
6302 va = z / alt->value;
6303 if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0)
6304 copy_opt_map(now, alt);
6305 }
6306
6307 static int
comp_opt_exact_or_map(OptStr * e,OptMap * m)6308 comp_opt_exact_or_map(OptStr* e, OptMap* m)
6309 {
6310 #define COMP_EM_BASE 20
6311 int ae, am;
6312 int case_value;
6313
6314 if (m->value <= 0) return -1;
6315
6316 case_value = 3;
6317 ae = COMP_EM_BASE * e->len * case_value;
6318 am = COMP_EM_BASE * 5 * 2 / m->value;
6319 return comp_distance_value(&e->mm, &m->mm, ae, am);
6320 }
6321
6322 static void
alt_merge_opt_map(OnigEncoding enc,OptMap * to,OptMap * add)6323 alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add)
6324 {
6325 int i, val;
6326
6327 /* if (! mml_is_equal(&to->mm, &add->mm)) return ; */
6328 if (to->value == 0) return ;
6329 if (add->value == 0 || to->mm.max < add->mm.min) {
6330 clear_opt_map(to);
6331 return ;
6332 }
6333
6334 mml_alt_merge(&to->mm, &add->mm);
6335
6336 val = 0;
6337 for (i = 0; i < CHAR_MAP_SIZE; i++) {
6338 if (add->map[i])
6339 to->map[i] = 1;
6340
6341 if (to->map[i])
6342 val += map_position_value(enc, i);
6343 }
6344 to->value = val;
6345
6346 alt_merge_opt_anc_info(&to->anc, &add->anc);
6347 }
6348
6349 static void
set_bound_node_opt_info(OptNode * opt,MinMaxLen * plen)6350 set_bound_node_opt_info(OptNode* opt, MinMaxLen* plen)
6351 {
6352 mml_copy(&(opt->sb.mm), plen);
6353 mml_copy(&(opt->spr.mm), plen);
6354 mml_copy(&(opt->map.mm), plen);
6355 }
6356
6357 static void
clear_node_opt_info(OptNode * opt)6358 clear_node_opt_info(OptNode* opt)
6359 {
6360 mml_clear(&opt->len);
6361 clear_opt_anc_info(&opt->anc);
6362 clear_opt_exact(&opt->sb);
6363 clear_opt_exact(&opt->sm);
6364 clear_opt_exact(&opt->spr);
6365 clear_opt_map(&opt->map);
6366 }
6367
6368 static void
copy_node_opt_info(OptNode * to,OptNode * from)6369 copy_node_opt_info(OptNode* to, OptNode* from)
6370 {
6371 *to = *from;
6372 }
6373
/*
  Concatenate the optimization info of `add` onto the right side of `to`
  (i.e. `to` describes what precedes `add` in a sequence).

  Both arguments are modified: `to` receives the combined result, while
  `add` has its sb/map anchor info updated and its sb candidate cleared
  when it is absorbed into one of `to`'s exact strings.
  The statement order matters because later steps read fields that earlier
  steps have just rewritten.
*/
static void
concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add)
{
  int sb_reach, sm_reach;
  OptAnc tanc;

  /* combine the anchor info of both sides into `to` */
  concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
  copy_opt_anc_info(&to->anc, &tanc);

  /* when `to` cannot consume any byte (max length 0), its anchors are
     also combined into the anchors of add's sb candidate */
  if (add->sb.len > 0 && to->len.max == 0) {
    concat_opt_anc_info(&tanc, &to->anc, &add->sb.anc, to->len.max, add->len.max);
    copy_opt_anc_info(&add->sb.anc, &tanc);
  }

  /* same idea for the map candidate: only the left anchors are inherited,
     and only if the map itself starts at distance 0 */
  if (add->map.value > 0 && to->len.max == 0) {
    if (add->map.mm.max == 0)
      add->map.anc.left |= to->anc.left;
  }

  /* remember the reach_end flags before they are reset below */
  sb_reach = to->sb.reach_end;
  sm_reach = to->sm.reach_end;

  /* once `add` may consume bytes, to's strings no longer reach the end */
  if (add->len.max != 0)
    to->sb.reach_end = to->sm.reach_end = 0;

  /* if one of to's strings reached the end, extend it with add's sb
     (sb is tried first) and drop add's sb so it is not selected again */
  if (add->sb.len > 0) {
    if (sb_reach) {
      concat_opt_exact(&to->sb, &add->sb, enc);
      clear_opt_exact(&add->sb);
    }
    else if (sm_reach) {
      concat_opt_exact(&to->sm, &add->sb, enc);
      clear_opt_exact(&add->sb);
    }
  }
  /* add's remaining candidates compete for to->sm */
  select_opt_exact(enc, &to->sm, &add->sb);
  select_opt_exact(enc, &to->sm, &add->sm);

  /* the spr candidate of `to` is offered to sb (when it starts at distance
     0) or to sm, but only if `add` can consume bytes; if `to` has no spr,
     add's spr is taken over as is */
  if (to->spr.len > 0) {
    if (add->len.max > 0) {
      if (to->spr.mm.max == 0)
        select_opt_exact(enc, &to->sb, &to->spr);
      else
        select_opt_exact(enc, &to->sm, &to->spr);
    }
  }
  else if (add->spr.len > 0) {
    copy_opt_exact(&to->spr, &add->spr);
  }

  /* keep the preferred map and accumulate the total length range */
  select_opt_map(&to->map, &add->map);
  mml_add(&to->len, &add->len);
}
6427
6428 static void
alt_merge_node_opt_info(OptNode * to,OptNode * add,OptEnv * env)6429 alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env)
6430 {
6431 alt_merge_opt_anc_info(&to->anc, &add->anc);
6432 alt_merge_opt_exact(&to->sb, &add->sb, env);
6433 alt_merge_opt_exact(&to->sm, &add->sm, env);
6434 alt_merge_opt_exact(&to->spr, &add->spr, env);
6435 alt_merge_opt_map(env->enc, &to->map, &add->map);
6436
6437 mml_alt_merge(&to->len, &add->len);
6438 }
6439
/*
  Compute the maximum number of bytes that `node` can match.
  Returns INFINITE_LEN where the code below cannot give a finite bound
  (infinite repeat of a non-empty body, recursive call, nest-level
  recursive back reference, group re-entered while being measured).
  Anchors, gimmicks and unknown node types contribute 0.

  Side effect: for a BAG_MEMORY node the computed value is cached in
  en->max_len and the node is flagged FIXED_MAX.
*/
static OnigLen
node_max_byte_len(Node* node, ParseEnv* env)
{
  OnigLen len;
  OnigLen tmax;

  len = 0;
  switch (NODE_TYPE(node)) {
  case NODE_LIST:
    /* sequence: accumulate the maxima of all elements */
    do {
      tmax = node_max_byte_len(NODE_CAR(node), env);
      len = distance_add(len, tmax);
    } while (IS_NOT_NULL(node = NODE_CDR(node)));
    break;

  case NODE_ALT:
    /* alternation: the largest maximum among the branches */
    do {
      tmax = node_max_byte_len(NODE_CAR(node), env);
      if (len < tmax) len = tmax;
    } while (IS_NOT_NULL(node = NODE_CDR(node)));
    break;

  case NODE_STRING:
    {
      /* literal: its byte length */
      StrNode* sn = STR_(node);
      len = (OnigLen )(sn->end - sn->s);
    }
    break;

  case NODE_CTYPE:
  case NODE_CCLASS:
    /* single character: bounded by the encoding's max char length */
    len = ONIGENC_MBC_MAXLEN_DIST(env->enc);
    break;

  case NODE_BACKREF:
    /* checker back references keep len == 0 */
    if (! NODE_IS_CHECKER(node)) {
      int i;
      int* backs;
      MemEnv* mem_env = PARSEENV_MEMENV(env);
      BackRefNode* br = BACKREF_(node);
      if (NODE_IS_RECURSION(node)) {
        /* recursive reference: unbounded only for nest-level references,
           otherwise len stays 0 */
#ifdef USE_BACKREF_WITH_LEVEL
        if (NODE_IS_NEST_LEVEL(node)) {
          len = INFINITE_LEN;
        }
#endif
        break;
      }
      /* largest maximum among all referenced groups */
      backs = BACKREFS_P(br);
      for (i = 0; i < br->back_num; i++) {
        tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env);
        if (len < tmax) len = tmax;
      }
    }
    break;

#ifdef USE_CALL
  case NODE_CALL:
    /* a recursive call has no finite bound */
    if (! NODE_IS_RECURSION(node))
      len = node_max_byte_len(NODE_BODY(node), env);
    else
      len = INFINITE_LEN;
    break;
#endif

  case NODE_QUANT:
    {
      QuantNode* qn = QUANT_(node);

      /* upper == 0 means the body never matches: len stays 0 */
      if (qn->upper != 0) {
        len = node_max_byte_len(NODE_BODY(node), env);
        /* an empty body stays 0 even under an infinite repeat */
        if (len != 0) {
          if (! IS_INFINITE_REPEAT(qn->upper))
            len = distance_multiply(len, qn->upper);
          else
            len = INFINITE_LEN;
        }
      }
    }
    break;

  case NODE_BAG:
    {
      BagNode* en = BAG_(node);
      switch (en->type) {
      case BAG_MEMORY:
        /* use the cached value when it has already been computed */
        if (NODE_IS_FIXED_MAX(node))
          len = en->max_len;
        else {
          /* MARK1 flags a group that is currently being measured;
             meeting it again means the group contains itself */
          if (NODE_IS_MARK1(node))
            len = INFINITE_LEN;
          else {
            NODE_STATUS_ADD(node, MARK1);
            len = node_max_byte_len(NODE_BODY(node), env);
            NODE_STATUS_REMOVE(node, MARK1);

            /* cache the result for later queries */
            en->max_len = len;
            NODE_STATUS_ADD(node, FIXED_MAX);
          }
        }
        break;

      case BAG_OPTION:
      case BAG_STOP_BACKTRACK:
        len = node_max_byte_len(NODE_BODY(node), env);
        break;
      case BAG_IF_ELSE:
        {
          OnigLen tlen, elen;

          /* max(condition + then, else) */
          len = node_max_byte_len(NODE_BODY(node), env);
          if (IS_NOT_NULL(en->te.Then)) {
            tlen = node_max_byte_len(en->te.Then, env);
            len = distance_add(len, tlen);
          }
          if (IS_NOT_NULL(en->te.Else))
            elen = node_max_byte_len(en->te.Else, env);
          else elen = 0;

          if (elen > len) len = elen;
        }
        break;
      }
    }
    break;

  case NODE_ANCHOR:
  case NODE_GIMMICK:
  default:
    /* len stays 0 */
    break;
  }

  return len;
}
6574
/* Upper limit on how many times optimize_nodes() descends into the body of
   the same BAG_MEMORY node (counted in en->opt_count, USE_CALL builds only).
   Beyond this limit only the length range of the group is reported. */
#define MAX_NODE_OPT_INFO_REF_COUNT   5
6576
6577 static int
optimize_nodes(Node * node,OptNode * opt,OptEnv * env)6578 optimize_nodes(Node* node, OptNode* opt, OptEnv* env)
6579 {
6580 int i;
6581 int r;
6582 OptNode xo;
6583 OnigEncoding enc;
6584
6585 r = 0;
6586 enc = env->enc;
6587 clear_node_opt_info(opt);
6588 set_bound_node_opt_info(opt, &env->mm);
6589
6590 switch (NODE_TYPE(node)) {
6591 case NODE_LIST:
6592 {
6593 OptEnv nenv;
6594 Node* nd = node;
6595
6596 copy_opt_env(&nenv, env);
6597 do {
6598 r = optimize_nodes(NODE_CAR(nd), &xo, &nenv);
6599 if (r == 0) {
6600 mml_add(&nenv.mm, &xo.len);
6601 concat_left_node_opt_info(enc, opt, &xo);
6602 }
6603 } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd)));
6604 }
6605 break;
6606
6607 case NODE_ALT:
6608 {
6609 Node* nd = node;
6610
6611 do {
6612 r = optimize_nodes(NODE_CAR(nd), &xo, env);
6613 if (r == 0) {
6614 if (nd == node) copy_node_opt_info(opt, &xo);
6615 else alt_merge_node_opt_info(opt, &xo, env);
6616 }
6617 } while ((r == 0) && IS_NOT_NULL(nd = NODE_CDR(nd)));
6618 }
6619 break;
6620
6621 case NODE_STRING:
6622 {
6623 StrNode* sn = STR_(node);
6624 int slen = (int )(sn->end - sn->s);
6625
6626 concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc);
6627 if (slen > 0) {
6628 add_char_opt_map(&opt->map, *(sn->s), enc);
6629 }
6630 mml_set_min_max(&opt->len, slen, slen);
6631 }
6632 break;
6633
6634 case NODE_CCLASS:
6635 {
6636 int z;
6637 CClassNode* cc = CCLASS_(node);
6638
6639 /* no need to check ignore case. (set in tune_tree()) */
6640
6641 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
6642 OnigLen min = ONIGENC_MBC_MINLEN(enc);
6643 OnigLen max = ONIGENC_MBC_MAXLEN_DIST(enc);
6644
6645 mml_set_min_max(&opt->len, min, max);
6646 }
6647 else {
6648 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
6649 z = BITSET_AT(cc->bs, i);
6650 if ((z && ! IS_NCCLASS_NOT(cc)) || (! z && IS_NCCLASS_NOT(cc))) {
6651 add_char_opt_map(&opt->map, (UChar )i, enc);
6652 }
6653 }
6654 mml_set_min_max(&opt->len, 1, 1);
6655 }
6656 }
6657 break;
6658
6659 case NODE_CTYPE:
6660 {
6661 int min, max;
6662 int range;
6663
6664 max = ONIGENC_MBC_MAXLEN_DIST(enc);
6665
6666 if (max == 1) {
6667 min = 1;
6668
6669 switch (CTYPE_(node)->ctype) {
6670 case CTYPE_ANYCHAR:
6671 break;
6672
6673 case ONIGENC_CTYPE_WORD:
6674 range = CTYPE_(node)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
6675 if (CTYPE_(node)->not != 0) {
6676 for (i = 0; i < range; i++) {
6677 if (! ONIGENC_IS_CODE_WORD(enc, i)) {
6678 add_char_opt_map(&opt->map, (UChar )i, enc);
6679 }
6680 }
6681 for (i = range; i < SINGLE_BYTE_SIZE; i++) {
6682 add_char_opt_map(&opt->map, (UChar )i, enc);
6683 }
6684 }
6685 else {
6686 for (i = 0; i < range; i++) {
6687 if (ONIGENC_IS_CODE_WORD(enc, i)) {
6688 add_char_opt_map(&opt->map, (UChar )i, enc);
6689 }
6690 }
6691 }
6692 break;
6693 }
6694 }
6695 else {
6696 min = ONIGENC_MBC_MINLEN(enc);
6697 }
6698 mml_set_min_max(&opt->len, min, max);
6699 }
6700 break;
6701
6702 case NODE_ANCHOR:
6703 switch (ANCHOR_(node)->type) {
6704 case ANCR_BEGIN_BUF:
6705 case ANCR_BEGIN_POSITION:
6706 case ANCR_BEGIN_LINE:
6707 case ANCR_END_BUF:
6708 case ANCR_SEMI_END_BUF:
6709 case ANCR_END_LINE:
6710 case ANCR_PREC_READ_NOT:
6711 case ANCR_LOOK_BEHIND:
6712 add_opt_anc_info(&opt->anc, ANCHOR_(node)->type);
6713 break;
6714
6715 case ANCR_PREC_READ:
6716 {
6717 r = optimize_nodes(NODE_BODY(node), &xo, env);
6718 if (r == 0) {
6719 if (xo.sb.len > 0)
6720 copy_opt_exact(&opt->spr, &xo.sb);
6721 else if (xo.sm.len > 0)
6722 copy_opt_exact(&opt->spr, &xo.sm);
6723
6724 opt->spr.reach_end = 0;
6725
6726 if (xo.map.value > 0)
6727 copy_opt_map(&opt->map, &xo.map);
6728 }
6729 }
6730 break;
6731
6732 case ANCR_LOOK_BEHIND_NOT:
6733 break;
6734 }
6735 break;
6736
6737 case NODE_BACKREF:
6738 if (! NODE_IS_CHECKER(node)) {
6739 OnigLen min, max;
6740
6741 min = node_min_byte_len(node, env->scan_env);
6742 max = node_max_byte_len(node, env->scan_env);
6743 mml_set_min_max(&opt->len, min, max);
6744 }
6745 break;
6746
6747 #ifdef USE_CALL
6748 case NODE_CALL:
6749 if (NODE_IS_RECURSION(node))
6750 mml_set_min_max(&opt->len, 0, INFINITE_LEN);
6751 else {
6752 r = optimize_nodes(NODE_BODY(node), opt, env);
6753 }
6754 break;
6755 #endif
6756
6757 case NODE_QUANT:
6758 {
6759 OnigLen min, max;
6760 QuantNode* qn = QUANT_(node);
6761
6762 /* Issue #175
6763 ex. /\g<1>{0}(?<=|())/
6764
6765 Empty and unused nodes in look-behind is removed in
6766 tune_look_behind().
6767 Called group nodes are assigned to be not called if the caller side is
6768 inside of zero-repetition.
6769 As a result, the nodes are considered unused.
6770 */
6771 if (qn->upper == 0) {
6772 mml_set_min_max(&opt->len, 0, 0);
6773 break;
6774 }
6775
6776 r = optimize_nodes(NODE_BODY(node), &xo, env);
6777 if (r != 0) break;
6778
6779 if (qn->lower > 0) {
6780 copy_node_opt_info(opt, &xo);
6781 if (xo.sb.len > 0) {
6782 if (xo.sb.reach_end) {
6783 for (i = 2; i <= qn->lower && ! is_full_opt_exact(&opt->sb); i++) {
6784 int rc = concat_opt_exact(&opt->sb, &xo.sb, enc);
6785 if (rc > 0) break;
6786 }
6787 if (i < qn->lower) opt->sb.reach_end = 0;
6788 }
6789 }
6790
6791 if (qn->lower != qn->upper) {
6792 opt->sb.reach_end = 0;
6793 opt->sm.reach_end = 0;
6794 }
6795 if (qn->lower > 1)
6796 opt->sm.reach_end = 0;
6797 }
6798
6799 if (IS_INFINITE_REPEAT(qn->upper)) {
6800 if (env->mm.max == 0 &&
6801 NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) {
6802 if (NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)))
6803 add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML);
6804 else
6805 add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF);
6806 }
6807
6808 max = (xo.len.max > 0 ? INFINITE_LEN : 0);
6809 }
6810 else {
6811 max = distance_multiply(xo.len.max, qn->upper);
6812 }
6813
6814 min = distance_multiply(xo.len.min, qn->lower);
6815 mml_set_min_max(&opt->len, min, max);
6816 }
6817 break;
6818
6819 case NODE_BAG:
6820 {
6821 BagNode* en = BAG_(node);
6822
6823 switch (en->type) {
6824 case BAG_STOP_BACKTRACK:
6825 case BAG_OPTION:
6826 r = optimize_nodes(NODE_BODY(node), opt, env);
6827 break;
6828
6829 case BAG_MEMORY:
6830 #ifdef USE_CALL
6831 en->opt_count++;
6832 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
6833 OnigLen min, max;
6834
6835 min = 0;
6836 max = INFINITE_LEN;
6837 if (NODE_IS_FIXED_MIN(node)) min = en->min_len;
6838 if (NODE_IS_FIXED_MAX(node)) max = en->max_len;
6839 mml_set_min_max(&opt->len, min, max);
6840 }
6841 else
6842 #endif
6843 {
6844 r = optimize_nodes(NODE_BODY(node), opt, env);
6845 if (is_set_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK)) {
6846 if (MEM_STATUS_AT0(env->scan_env->backrefed_mem, en->m.regnum))
6847 remove_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK);
6848 }
6849 }
6850 break;
6851
6852 case BAG_IF_ELSE:
6853 {
6854 OptEnv nenv;
6855
6856 if (IS_NOT_NULL(en->te.Else)) {
6857 copy_opt_env(&nenv, env);
6858 r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv);
6859 if (r == 0) {
6860 mml_add(&nenv.mm, &xo.len);
6861 concat_left_node_opt_info(enc, opt, &xo);
6862 if (IS_NOT_NULL(en->te.Then)) {
6863 r = optimize_nodes(en->te.Then, &xo, &nenv);
6864 if (r == 0) {
6865 concat_left_node_opt_info(enc, opt, &xo);
6866 }
6867 }
6868
6869 r = optimize_nodes(en->te.Else, &xo, env);
6870 if (r == 0)
6871 alt_merge_node_opt_info(opt, &xo, env);
6872 }
6873 }
6874 }
6875 break;
6876 }
6877 }
6878 break;
6879
6880 case NODE_GIMMICK:
6881 break;
6882
6883 default:
6884 #ifdef ONIG_DEBUG
6885 fprintf(DBGFP, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node));
6886 #endif
6887 r = ONIGERR_TYPE_BUG;
6888 break;
6889 }
6890
6891 return r;
6892 }
6893
6894 static int
set_optimize_exact(regex_t * reg,OptStr * e)6895 set_optimize_exact(regex_t* reg, OptStr* e)
6896 {
6897 int r;
6898 int allow_reverse;
6899
6900 if (e->len == 0) return 0;
6901
6902 reg->exact = (UChar* )xmalloc(e->len);
6903 CHECK_NULL_RETURN_MEMERR(reg->exact);
6904 xmemcpy(reg->exact, e->s, e->len);
6905 reg->exact_end = reg->exact + e->len;
6906
6907 allow_reverse =
6908 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
6909
6910 if (e->len >= 2 || (e->len >= 1 && allow_reverse)) {
6911 r = set_sunday_quick_search_or_bmh_skip_table(reg, 0,
6912 reg->exact, reg->exact_end,
6913 reg->map, &(reg->map_offset));
6914 if (r != 0) return r;
6915
6916 reg->optimize = (allow_reverse != 0
6917 ? OPTIMIZE_STR_FAST
6918 : OPTIMIZE_STR_FAST_STEP_FORWARD);
6919 }
6920 else {
6921 reg->optimize = OPTIMIZE_STR;
6922 }
6923
6924 reg->dist_min = e->mm.min;
6925 reg->dist_max = e->mm.max;
6926
6927 if (reg->dist_min != INFINITE_LEN) {
6928 int n = (int )(reg->exact_end - reg->exact);
6929 reg->threshold_len = reg->dist_min + n;
6930 }
6931
6932 return 0;
6933 }
6934
6935 static void
set_optimize_map(regex_t * reg,OptMap * m)6936 set_optimize_map(regex_t* reg, OptMap* m)
6937 {
6938 int i;
6939
6940 for (i = 0; i < CHAR_MAP_SIZE; i++)
6941 reg->map[i] = m->map[i];
6942
6943 reg->optimize = OPTIMIZE_MAP;
6944 reg->dist_min = m->mm.min;
6945 reg->dist_max = m->mm.max;
6946
6947 if (reg->dist_min != INFINITE_LEN) {
6948 reg->threshold_len = reg->dist_min + ONIGENC_MBC_MINLEN(reg->enc);
6949 }
6950 }
6951
6952 static void
set_sub_anchor(regex_t * reg,OptAnc * anc)6953 set_sub_anchor(regex_t* reg, OptAnc* anc)
6954 {
6955 reg->sub_anchor |= anc->left & ANCR_BEGIN_LINE;
6956 reg->sub_anchor |= anc->right & ANCR_END_LINE;
6957 }
6958
6959 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6960 static void print_optimize_info(FILE* f, regex_t* reg);
6961 #endif
6962
6963 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ParseEnv * scan_env)6964 set_optimize_info_from_tree(Node* node, regex_t* reg, ParseEnv* scan_env)
6965 {
6966 int r;
6967 OptNode opt;
6968 OptEnv env;
6969
6970 env.enc = reg->enc;
6971 env.case_fold_flag = reg->case_fold_flag;
6972 env.scan_env = scan_env;
6973 mml_clear(&env.mm);
6974
6975 r = optimize_nodes(node, &opt, &env);
6976 if (r != 0) return r;
6977
6978 reg->anchor = opt.anc.left & (ANCR_BEGIN_BUF |
6979 ANCR_BEGIN_POSITION | ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML |
6980 ANCR_LOOK_BEHIND);
6981
6982 if ((opt.anc.left & (ANCR_LOOK_BEHIND | ANCR_PREC_READ_NOT)) != 0)
6983 reg->anchor &= ~ANCR_ANYCHAR_INF_ML;
6984
6985 reg->anchor |= opt.anc.right & (ANCR_END_BUF | ANCR_SEMI_END_BUF |
6986 ANCR_PREC_READ_NOT);
6987
6988 if (reg->anchor & (ANCR_END_BUF | ANCR_SEMI_END_BUF)) {
6989 reg->anc_dist_min = opt.len.min;
6990 reg->anc_dist_max = opt.len.max;
6991 }
6992
6993 if (opt.sb.len > 0 || opt.sm.len > 0) {
6994 select_opt_exact(reg->enc, &opt.sb, &opt.sm);
6995 if (opt.map.value > 0 && comp_opt_exact_or_map(&opt.sb, &opt.map) > 0) {
6996 goto set_map;
6997 }
6998 else {
6999 r = set_optimize_exact(reg, &opt.sb);
7000 set_sub_anchor(reg, &opt.sb.anc);
7001 }
7002 }
7003 else if (opt.map.value > 0) {
7004 set_map:
7005 set_optimize_map(reg, &opt.map);
7006 set_sub_anchor(reg, &opt.map.anc);
7007 }
7008 else {
7009 reg->sub_anchor |= opt.anc.left & ANCR_BEGIN_LINE;
7010 if (opt.len.max == 0)
7011 reg->sub_anchor |= opt.anc.right & ANCR_END_LINE;
7012 }
7013
7014 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
7015 print_optimize_info(DBGFP, reg);
7016 #endif
7017 return r;
7018 }
7019 #endif /* ONIG_DONT_OPTIMIZE */
7020
7021 static void
clear_optimize_info(regex_t * reg)7022 clear_optimize_info(regex_t* reg)
7023 {
7024 reg->optimize = OPTIMIZE_NONE;
7025 reg->anchor = 0;
7026 reg->anc_dist_min = 0;
7027 reg->anc_dist_max = 0;
7028 reg->sub_anchor = 0;
7029 reg->exact_end = (UChar* )NULL;
7030 reg->map_offset = 0;
7031 reg->threshold_len = 0;
7032 if (IS_NOT_NULL(reg->exact)) {
7033 xfree(reg->exact);
7034 reg->exact = (UChar* )NULL;
7035 }
7036 }
7037
7038 #ifdef ONIG_DEBUG
7039
/*
  Debug helper: write the encoded string [s, end) to `fp`.
  For encodings whose minimum character length is greater than one byte,
  each character is decoded; codes >= 0x80 are printed as " 0x%04x " and
  smaller codes as a single character.  For other encodings the bytes are
  written unchanged.
*/
static void print_enc_string(FILE* fp, OnigEncoding enc,
                             const UChar *s, const UChar *end)
{
  const UChar *cur;

  if (ONIGENC_MBC_MINLEN(enc) <= 1) {
    for (cur = s; cur < end; cur++)
      fputc((int )*cur, fp);
    return ;
  }

  for (cur = s; cur < end; cur += enclen(enc, cur)) {
    OnigCodePoint code = ONIGENC_MBC_TO_CODE(enc, cur, end);

    if (code < 0x80)
      fputc((int )code, fp);
    else
      fprintf(fp, " 0x%04x ", (int )code);
  }
}
7067
7068 static void
print_options(FILE * fp,OnigOptionType o)7069 print_options(FILE* fp, OnigOptionType o)
7070 {
7071 if ((o & ONIG_OPTION_IGNORECASE) != 0) fprintf(fp, " IGNORECASE");
7072 if ((o & ONIG_OPTION_EXTEND) != 0) fprintf(fp, " EXTEND");
7073 if ((o & ONIG_OPTION_MULTILINE) != 0) fprintf(fp, " MULTILINE");
7074 if ((o & ONIG_OPTION_SINGLELINE) != 0) fprintf(fp, " SINGLELINE");
7075 if ((o & ONIG_OPTION_FIND_LONGEST) != 0) fprintf(fp, " FIND_LONGEST");
7076 if ((o & ONIG_OPTION_FIND_NOT_EMPTY) != 0) fprintf(fp, " FIND_NOT_EMPTY");
7077 if ((o & ONIG_OPTION_NEGATE_SINGLELINE) != 0) fprintf(fp, " NEGATE_SINGLELINE");
7078 if ((o & ONIG_OPTION_DONT_CAPTURE_GROUP) != 0) fprintf(fp, " DONT_CAPTURE_GROUP");
7079 if ((o & ONIG_OPTION_CAPTURE_GROUP) != 0) fprintf(fp, " CAPTURE_GROUP");
7080 if ((o & ONIG_OPTION_NOTBOL) != 0) fprintf(fp, " NOTBOL");
7081 if ((o & ONIG_OPTION_NOTEOL) != 0) fprintf(fp, " NOTEOL");
7082 if ((o & ONIG_OPTION_POSIX_REGION) != 0) fprintf(fp, " POSIX_REGION");
7083 if ((o & ONIG_OPTION_CHECK_VALIDITY_OF_STRING) != 0) fprintf(fp, " CHECK_VALIDITY_OF_STRING");
7084 if ((o & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) fprintf(fp, " IGNORECASE_IS_ASCII");
7085 if ((o & ONIG_OPTION_WORD_IS_ASCII) != 0) fprintf(fp, " WORD_IS_ASCII");
7086 if ((o & ONIG_OPTION_DIGIT_IS_ASCII) != 0) fprintf(fp, " DIGIT_IS_ASCII");
7087 if ((o & ONIG_OPTION_SPACE_IS_ASCII) != 0) fprintf(fp, " SPACE_IS_ASCII");
7088 if ((o & ONIG_OPTION_POSIX_IS_ASCII) != 0) fprintf(fp, " POSIX_IS_ASCII");
7089 if ((o & ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER) != 0) fprintf(fp, " TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER");
7090 if ((o & ONIG_OPTION_TEXT_SEGMENT_WORD) != 0) fprintf(fp, " TEXT_SEGMENT_WORD");
7091 if ((o & ONIG_OPTION_NOT_BEGIN_STRING) != 0) fprintf(fp, " NOT_BIGIN_STRING");
7092 if ((o & ONIG_OPTION_NOT_END_STRING) != 0) fprintf(fp, " NOT_END_STRING");
7093 if ((o & ONIG_OPTION_NOT_BEGIN_POSITION) != 0) fprintf(fp, " NOT_BEGIN_POSITION");
7094 if ((o & ONIG_OPTION_CALLBACK_EACH_MATCH) != 0) fprintf(fp, " CALLBACK_EACH_MATCH");
7095 }
7096
7097 #endif /* ONIG_DEBUG */
7098
7099 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
7100
7101 #ifndef ONIG_DONT_OPTIMIZE
7102
7103 static void
print_distance_range(FILE * f,OnigLen a,OnigLen b)7104 print_distance_range(FILE* f, OnigLen a, OnigLen b)
7105 {
7106 if (a == INFINITE_LEN)
7107 fputs("inf", f);
7108 else
7109 fprintf(f, "(%u)", a);
7110
7111 fputs("-", f);
7112
7113 if (b == INFINITE_LEN)
7114 fputs("inf", f);
7115 else
7116 fprintf(f, "(%u)", b);
7117 }
7118
7119 static void
print_anchor(FILE * f,int anchor)7120 print_anchor(FILE* f, int anchor)
7121 {
7122 int q = 0;
7123
7124 fprintf(f, "[");
7125
7126 if (anchor & ANCR_BEGIN_BUF) {
7127 fprintf(f, "begin-buf");
7128 q = 1;
7129 }
7130 if (anchor & ANCR_BEGIN_LINE) {
7131 if (q) fprintf(f, ", ");
7132 q = 1;
7133 fprintf(f, "begin-line");
7134 }
7135 if (anchor & ANCR_BEGIN_POSITION) {
7136 if (q) fprintf(f, ", ");
7137 q = 1;
7138 fprintf(f, "begin-pos");
7139 }
7140 if (anchor & ANCR_END_BUF) {
7141 if (q) fprintf(f, ", ");
7142 q = 1;
7143 fprintf(f, "end-buf");
7144 }
7145 if (anchor & ANCR_SEMI_END_BUF) {
7146 if (q) fprintf(f, ", ");
7147 q = 1;
7148 fprintf(f, "semi-end-buf");
7149 }
7150 if (anchor & ANCR_END_LINE) {
7151 if (q) fprintf(f, ", ");
7152 q = 1;
7153 fprintf(f, "end-line");
7154 }
7155 if (anchor & ANCR_ANYCHAR_INF) {
7156 if (q) fprintf(f, ", ");
7157 q = 1;
7158 fprintf(f, "anychar-inf");
7159 }
7160 if (anchor & ANCR_ANYCHAR_INF_ML) {
7161 if (q) fprintf(f, ", ");
7162 fprintf(f, "anychar-inf-ml");
7163 }
7164
7165 fprintf(f, "]");
7166 }
7167
7168 static void
print_optimize_info(FILE * f,regex_t * reg)7169 print_optimize_info(FILE* f, regex_t* reg)
7170 {
7171 static const char* on[] =
7172 { "NONE", "STR", "STR_FAST", "STR_FAST_STEP_FORWARD", "MAP" };
7173
7174 fprintf(f, "optimize: %s\n", on[reg->optimize]);
7175 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
7176 if ((reg->anchor & ANCR_END_BUF_MASK) != 0)
7177 print_distance_range(f, reg->anc_dist_min, reg->anc_dist_max);
7178 fprintf(f, "\n");
7179
7180 if (reg->optimize) {
7181 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
7182 fprintf(f, "\n");
7183 }
7184 fprintf(f, "\n");
7185
7186 if (reg->exact) {
7187 UChar *p;
7188 fprintf(f, "exact: [");
7189 for (p = reg->exact; p < reg->exact_end; p++) {
7190 fputc(*p, f);
7191 }
7192 fprintf(f, "]: length: %ld, dmin: %u, ",
7193 (reg->exact_end - reg->exact), reg->dist_min);
7194 if (reg->dist_max == INFINITE_LEN)
7195 fprintf(f, "dmax: inf.\n");
7196 else
7197 fprintf(f, "dmax: %u\n", reg->dist_max);
7198 }
7199 else if (reg->optimize & OPTIMIZE_MAP) {
7200 int c, i, n = 0;
7201
7202 for (i = 0; i < CHAR_MAP_SIZE; i++)
7203 if (reg->map[i]) n++;
7204
7205 fprintf(f, "map: n=%d, dmin: %u, dmax: %u\n",
7206 n, reg->dist_min, reg->dist_max);
7207 if (n > 0) {
7208 c = 0;
7209 fputc('[', f);
7210 for (i = 0; i < CHAR_MAP_SIZE; i++) {
7211 if (reg->map[i] != 0) {
7212 if (c > 0) fputs(", ", f);
7213 c++;
7214 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
7215 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
7216 fputc(i, f);
7217 else
7218 fprintf(f, "%d", i);
7219 }
7220 }
7221 fprintf(f, "]\n");
7222 }
7223 }
7224 }
7225 #endif /* ONIG_DONT_OPTIMIZE */
7226 #endif /* defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) */
7227
7228
7229 extern RegexExt*
onig_get_regex_ext(regex_t * reg)7230 onig_get_regex_ext(regex_t* reg)
7231 {
7232 if (IS_NULL(reg->extp)) {
7233 RegexExt* ext = (RegexExt* )xmalloc(sizeof(*ext));
7234 if (IS_NULL(ext)) return 0;
7235
7236 ext->pattern = 0;
7237 ext->pattern_end = 0;
7238 #ifdef USE_CALLOUT
7239 ext->tag_table = 0;
7240 ext->callout_num = 0;
7241 ext->callout_list_alloc = 0;
7242 ext->callout_list = 0;
7243 #endif
7244
7245 reg->extp = ext;
7246 }
7247
7248 return reg->extp;
7249 }
7250
7251 static void
free_regex_ext(RegexExt * ext)7252 free_regex_ext(RegexExt* ext)
7253 {
7254 if (IS_NOT_NULL(ext)) {
7255 if (IS_NOT_NULL(ext->pattern))
7256 xfree((void* )ext->pattern);
7257
7258 #ifdef USE_CALLOUT
7259 if (IS_NOT_NULL(ext->tag_table))
7260 onig_callout_tag_table_free(ext->tag_table);
7261
7262 if (IS_NOT_NULL(ext->callout_list))
7263 onig_free_reg_callout_list(ext->callout_num, ext->callout_list);
7264 #endif
7265
7266 xfree(ext);
7267 }
7268 }
7269
7270 extern int
onig_ext_set_pattern(regex_t * reg,const UChar * pattern,const UChar * pattern_end)7271 onig_ext_set_pattern(regex_t* reg, const UChar* pattern, const UChar* pattern_end)
7272 {
7273 RegexExt* ext;
7274 UChar* s;
7275
7276 ext = onig_get_regex_ext(reg);
7277 CHECK_NULL_RETURN_MEMERR(ext);
7278
7279 s = onigenc_strdup(reg->enc, pattern, pattern_end);
7280 CHECK_NULL_RETURN_MEMERR(s);
7281
7282 ext->pattern = s;
7283 ext->pattern_end = s + (pattern_end - pattern);
7284
7285 return ONIG_NORMAL;
7286 }
7287
7288 extern void
onig_free_body(regex_t * reg)7289 onig_free_body(regex_t* reg)
7290 {
7291 if (IS_NOT_NULL(reg)) {
7292 ops_free(reg);
7293 if (IS_NOT_NULL(reg->string_pool)) {
7294 xfree(reg->string_pool);
7295 reg->string_pool_end = reg->string_pool = 0;
7296 }
7297 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
7298 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
7299 if (IS_NOT_NULL(reg->extp)) {
7300 free_regex_ext(reg->extp);
7301 reg->extp = 0;
7302 }
7303
7304 onig_names_free(reg);
7305 }
7306 }
7307
7308 extern void
onig_free(regex_t * reg)7309 onig_free(regex_t* reg)
7310 {
7311 if (IS_NOT_NULL(reg)) {
7312 onig_free_body(reg);
7313 xfree(reg);
7314 }
7315 }
7316
7317
7318 #ifdef ONIG_DEBUG_PARSE
7319 static void print_tree P_((FILE* f, Node* node));
7320 #endif
7321
7322 extern int onig_init_for_match_at(regex_t* reg);
7323
/*
  Parse the pattern [pattern, pattern_end) into a node tree and run the
  tree-level passes that precede code generation (string reduction, group
  numbering checks, back reference checks, call resolution, tune_tree and
  the empty-repeat analysis).

  reg:      regex being compiled.
  scan_env: parse environment, filled in by onig_parse_tree().
  rroot:    receives the tree on success, NULL_NODE on failure.
  einfo:    optional; receives the encoding and, on failure, the error
            parameter range recorded in scan_env.
  uslist:   (USE_CALL only) unset-address list; it is initialized here when
            the pattern contains calls.

  Returns 0 on success.  In that case the caller owns the tree, the list
  in `uslist` (if scan_env->num_call > 0) and scan_env->mem_env_dynamic.
  On failure a non-zero error code is returned and the tree, the unset
  address list and scan_env->mem_env_dynamic have been released here.
*/
static int parse_and_tune(regex_t* reg, const UChar* pattern,
  const UChar* pattern_end, ParseEnv *scan_env, Node** rroot,
  OnigErrorInfo* einfo
#ifdef USE_CALL
  , UnsetAddrList* uslist
#endif
)
{
  int r;
  Node* root;

  root = NULL_NODE;
  if (IS_NOT_NULL(einfo)) {
    einfo->enc = reg->enc;
    einfo->par = (UChar* )NULL;
  }

  r = onig_parse_tree(&root, pattern, pattern_end, reg, scan_env);
  if (r != 0) goto err;

  r = reduce_string_list(root, reg->enc);
  if (r != 0) goto err;

  /* mixed use named group and no-named group */
  if (scan_env->num_named > 0 &&
      IS_SYNTAX_BV(scan_env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
      ! OPTON_CAPTURE_GROUP(reg->options)) {
    if (scan_env->num_named != scan_env->num_mem)
      r = disable_noname_group_capture(&root, reg, scan_env);
    else
      r = numbered_ref_check(root);

    if (r != 0) goto err;
  }

  r = check_backrefs(root, scan_env);
  if (r != 0) goto err;

#ifdef USE_CALL
  if (scan_env->num_call > 0) {
    /* the list is not allocated yet if init fails, hence `err` and not
       `err_unset` for that one case */
    r = unset_addr_list_init(uslist, scan_env->num_call);
    if (r != 0) goto err;
    scan_env->unset_addr_list = uslist;
    r = tune_call(root, scan_env, 0);
    if (r != 0) goto err_unset;
    r = tune_call2(root);
    if (r != 0) goto err_unset;
    /* only negative results of this check are treated as errors */
    r = recursive_call_check_trav(root, scan_env, 0);
    if (r  < 0) goto err_unset;
    r = infinite_recursive_call_check_trav(root, scan_env);
    if (r != 0) goto err_unset;

    tune_called_state(root, 0);
  }

  reg->num_call = scan_env->num_call;
#endif

#ifdef ONIG_DEBUG_PARSE
  fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env->max_parse_depth);
#endif

  r = tune_tree(root, reg, 0, scan_env);
  if (r != 0) {
#ifdef ONIG_DEBUG_PARSE
    fprintf(DBGFP, "TREE (error in tune)\n");
    print_tree(DBGFP, root);
    fprintf(DBGFP, "\n");
#endif
    goto err_unset;
  }

  /* these passes run only when the pattern contains back references */
  if (scan_env->backref_num != 0) {
    set_parent_node_trav(root, NULL_NODE);
    r = set_empty_repeat_node_trav(root, NULL_NODE, scan_env);
    if (r != 0) goto err_unset;
    set_empty_status_check_trav(root, scan_env);
  }

  *rroot = root;
  return r;

 err_unset:
  /* release the unset-address list, then fall through to the common
     error handling */
#ifdef USE_CALL
  if (scan_env->num_call > 0) {
    unset_addr_list_end(uslist);
  }
#endif
 err:
  /* report the error parameter recorded by the parser, if any */
  if (IS_NOT_NULL(scan_env->error)) {
    if (IS_NOT_NULL(einfo)) {
      einfo->par     = scan_env->error;
      einfo->par_end = scan_env->error_end;
    }
  }

  onig_node_free(root);
  if (IS_NOT_NULL(scan_env->mem_env_dynamic))
    xfree(scan_env->mem_env_dynamic);

  *rroot = NULL_NODE;
  return r;
}
7427
7428 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)7429 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
7430 OnigErrorInfo* einfo)
7431 {
7432 int r;
7433 Node* root;
7434 ParseEnv scan_env;
7435 #ifdef USE_CALL
7436 UnsetAddrList uslist = {0};
7437 #endif
7438
7439 #ifdef ONIG_DEBUG
7440 fprintf(DBGFP, "\nPATTERN: /");
7441 print_enc_string(DBGFP, reg->enc, pattern, pattern_end);
7442 fprintf(DBGFP, "/\n");
7443 fprintf(DBGFP, "OPTIONS:");
7444 print_options(DBGFP, reg->options);
7445 fprintf(DBGFP, "\n");
7446 #endif
7447
7448 if (reg->ops_alloc == 0) {
7449 r = ops_init(reg, OPS_INIT_SIZE);
7450 if (r != 0) {
7451 if (IS_NOT_NULL(einfo)) {
7452 einfo->enc = reg->enc;
7453 einfo->par = (UChar* )NULL;
7454 }
7455 return r;
7456 }
7457 }
7458 else
7459 reg->ops_used = 0;
7460
7461 r = parse_and_tune(reg, pattern, pattern_end, &scan_env, &root, einfo
7462 #ifdef USE_CALL
7463 , &uslist
7464 #endif
7465 );
7466 if (r != 0) return r;
7467
7468 #ifdef ONIG_DEBUG_PARSE
7469 fprintf(DBGFP, "TREE (after tune)\n");
7470 print_tree(DBGFP, root);
7471 fprintf(DBGFP, "\n");
7472 #endif
7473
7474 reg->capture_history = scan_env.cap_history;
7475 reg->push_mem_start = scan_env.backtrack_mem | scan_env.cap_history;
7476
7477 #ifdef USE_CALLOUT
7478 if (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) {
7479 reg->push_mem_end = reg->push_mem_start;
7480 }
7481 else {
7482 if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start))
7483 reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history;
7484 else
7485 reg->push_mem_end = reg->push_mem_start &
7486 (scan_env.backrefed_mem | scan_env.cap_history);
7487 }
7488 #else
7489 if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start))
7490 reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history;
7491 else
7492 reg->push_mem_end = reg->push_mem_start &
7493 (scan_env.backrefed_mem | scan_env.cap_history);
7494 #endif
7495
7496 clear_optimize_info(reg);
7497 #ifndef ONIG_DONT_OPTIMIZE
7498 r = set_optimize_info_from_tree(root, reg, &scan_env);
7499 if (r != 0) {
7500 #ifdef USE_CALL
7501 if (scan_env.num_call > 0) {
7502 unset_addr_list_end(&uslist);
7503 }
7504 #endif
7505 goto err;
7506 }
7507 #endif
7508
7509 if (IS_NOT_NULL(scan_env.mem_env_dynamic)) {
7510 xfree(scan_env.mem_env_dynamic);
7511 scan_env.mem_env_dynamic = (MemEnv* )NULL;
7512 }
7513
7514 r = compile_tree(root, reg, &scan_env);
7515 if (r == 0) {
7516 if (scan_env.keep_num > 0) {
7517 r = add_op(reg, OP_UPDATE_VAR);
7518 if (r != 0) goto err;
7519
7520 COP(reg)->update_var.type = UPDATE_VAR_KEEP_FROM_STACK_LAST;
7521 COP(reg)->update_var.id = 0; /* not used */
7522 COP(reg)->update_var.clear = FALSE;
7523 }
7524
7525 r = add_op(reg, OP_END);
7526 if (r != 0) goto err;
7527
7528 #ifdef USE_CALL
7529 if (scan_env.num_call > 0) {
7530 r = fix_unset_addr_list(&uslist, reg);
7531 unset_addr_list_end(&uslist);
7532 if (r != 0) goto err;
7533 }
7534 #endif
7535
7536 r = ops_resize(reg, reg->ops_used);
7537 if (r != ONIG_NORMAL) goto err;
7538
7539 set_addr_in_repeat_range(reg);
7540
7541 if ((reg->push_mem_end != 0)
7542 #ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR
7543 || (reg->num_repeat != 0)
7544 || (reg->num_empty_check != 0)
7545 #endif
7546 #ifdef USE_CALLOUT
7547 || (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0)
7548 #endif
7549 #ifdef USE_CALL
7550 || scan_env.num_call > 0
7551 #endif
7552 )
7553 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
7554 else {
7555 if (reg->push_mem_start != 0)
7556 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
7557 else
7558 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
7559 }
7560
7561 r = ops_make_string_pool(reg);
7562 if (r != 0) goto err;
7563 }
7564 #ifdef USE_CALL
7565 else if (scan_env.num_call > 0) {
7566 unset_addr_list_end(&uslist);
7567 }
7568 #endif
7569 onig_node_free(root);
7570
7571 #ifdef ONIG_DEBUG_COMPILE
7572 onig_print_names(DBGFP, reg);
7573 onig_print_compiled_byte_code_list(DBGFP, reg);
7574 #endif
7575
7576 #ifdef USE_DIRECT_THREADED_CODE
7577 /* opcode -> opaddr */
7578 onig_init_for_match_at(reg);
7579 #endif
7580
7581 return r;
7582
7583 err:
7584 if (IS_NOT_NULL(scan_env.error)) {
7585 if (IS_NOT_NULL(einfo)) {
7586 einfo->par = scan_env.error;
7587 einfo->par_end = scan_env.error_end;
7588 }
7589 }
7590
7591 onig_node_free(root);
7592 if (IS_NOT_NULL(scan_env.mem_env_dynamic))
7593 xfree(scan_env.mem_env_dynamic);
7594 return r;
7595 }
7596
7597
/* Library-wide init flag: set to 1 by onig_initialize(), reset to 0 by
   onig_end(); onig_reg_init() triggers an implicit initialization while it
   is still 0. */
static int onig_inited = 0;
7599
7600 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)7601 onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_flag,
7602 OnigEncoding enc, OnigSyntaxType* syntax)
7603 {
7604 int r;
7605
7606 xmemset(reg, 0, sizeof(*reg));
7607
7608 if (onig_inited == 0) {
7609 #if 0
7610 return ONIGERR_LIBRARY_IS_NOT_INITIALIZED;
7611 #else
7612 r = onig_initialize(&enc, 1);
7613 if (r != 0)
7614 return ONIGERR_FAIL_TO_INITIALIZE;
7615
7616 onig_warning("You didn't call onig_initialize() explicitly");
7617 #endif
7618 }
7619
7620 if (IS_NULL(reg))
7621 return ONIGERR_INVALID_ARGUMENT;
7622
7623 if (ONIGENC_IS_UNDEF(enc))
7624 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
7625
7626 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
7627 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
7628 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
7629 }
7630
7631 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
7632 option |= syntax->options;
7633 option &= ~ONIG_OPTION_SINGLELINE;
7634 }
7635 else
7636 option |= syntax->options;
7637
7638 if ((option & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) {
7639 case_fold_flag &= ~(INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR |
7640 ONIGENC_CASE_FOLD_TURKISH_AZERI);
7641 case_fold_flag |= ONIGENC_CASE_FOLD_ASCII_ONLY;
7642 }
7643
7644 (reg)->enc = enc;
7645 (reg)->options = option;
7646 (reg)->syntax = syntax;
7647 (reg)->optimize = 0;
7648 (reg)->exact = (UChar* )NULL;
7649 (reg)->extp = (RegexExt* )NULL;
7650 (reg)->ops = (Operation* )NULL;
7651 (reg)->ops_curr = (Operation* )NULL;
7652 (reg)->ops_used = 0;
7653 (reg)->ops_alloc = 0;
7654 (reg)->name_table = (void* )NULL;
7655 (reg)->case_fold_flag = case_fold_flag;
7656 return 0;
7657 }
7658
7659 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)7660 onig_new_without_alloc(regex_t* reg,
7661 const UChar* pattern, const UChar* pattern_end,
7662 OnigOptionType option, OnigEncoding enc,
7663 OnigSyntaxType* syntax, OnigErrorInfo* einfo)
7664 {
7665 int r;
7666
7667 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
7668 if (r != 0) return r;
7669
7670 r = onig_compile(reg, pattern, pattern_end, einfo);
7671 return r;
7672 }
7673
7674 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)7675 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
7676 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
7677 OnigErrorInfo* einfo)
7678 {
7679 int r;
7680
7681 *reg = (regex_t* )xmalloc(sizeof(regex_t));
7682 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
7683
7684 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
7685 if (r != 0) {
7686 xfree(*reg);
7687 *reg = NULL;
7688 return r;
7689 }
7690
7691 r = onig_compile(*reg, pattern, pattern_end, einfo);
7692 if (r != 0) {
7693 onig_free(*reg);
7694 *reg = NULL;
7695 }
7696 return r;
7697 }
7698
7699 extern int
onig_initialize(OnigEncoding encodings[],int n)7700 onig_initialize(OnigEncoding encodings[], int n)
7701 {
7702 int i;
7703 int r;
7704
7705 if (onig_inited != 0)
7706 return 0;
7707
7708 onigenc_init();
7709
7710 onig_inited = 1;
7711
7712 for (i = 0; i < n; i++) {
7713 OnigEncoding enc = encodings[i];
7714 r = onig_initialize_encoding(enc);
7715 if (r != 0)
7716 return r;
7717 }
7718
7719 return ONIG_NORMAL;
7720 }
7721
/* One entry of the list of callbacks that onig_end() runs. */
typedef struct EndCallListItem {
  struct EndCallListItem* next; /* next (older) entry, NULL at the end of the list */
  void (*func)(void);           /* callback invoked by exec_end_call_list() */
} EndCallListItemType;

/* Head of the end-call list; onig_add_end_call() pushes new entries here. */
static EndCallListItemType* EndCallTop;
7728
onig_add_end_call(void (* func)(void))7729 extern void onig_add_end_call(void (*func)(void))
7730 {
7731 EndCallListItemType* item;
7732
7733 item = (EndCallListItemType* )xmalloc(sizeof(*item));
7734 if (item == 0) return ;
7735
7736 item->next = EndCallTop;
7737 item->func = func;
7738
7739 EndCallTop = item;
7740 }
7741
7742 static void
exec_end_call_list(void)7743 exec_end_call_list(void)
7744 {
7745 EndCallListItemType* prev;
7746 void (*func)(void);
7747
7748 while (EndCallTop != 0) {
7749 func = EndCallTop->func;
7750 (*func)();
7751
7752 prev = EndCallTop;
7753 EndCallTop = EndCallTop->next;
7754 xfree(prev);
7755 }
7756 }
7757
/*
 * Shut the library down.
 *
 * Runs and frees the registered end-call callbacks, releases the global
 * callout names (only when built with USE_CALLOUT), shuts down the encoding
 * subsystem, and clears the initialized flag so that a later
 * onig_initialize() call performs a full initialization again.
 * Always returns 0.
 */
extern int
onig_end(void)
{
  /* User callbacks run first, before any library state is torn down. */
  exec_end_call_list();

#ifdef USE_CALLOUT
  onig_global_callout_names_free();
#endif

  onigenc_end();

  onig_inited = 0;

  return 0;
}
7773
7774 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)7775 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
7776 {
7777 OnigCodePoint n, *data;
7778 OnigCodePoint low, high, x;
7779
7780 GET_CODE_POINT(n, p);
7781 data = (OnigCodePoint* )p;
7782 data++;
7783
7784 for (low = 0, high = n; low < high; ) {
7785 x = (low + high) >> 1;
7786 if (code > data[x * 2 + 1])
7787 low = x + 1;
7788 else
7789 high = x;
7790 }
7791
7792 return ((low < n && code >= data[low * 2]) ? 1 : 0);
7793 }
7794
7795 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,void * cc_arg)7796 onig_is_code_in_cc_len(int elen, OnigCodePoint code, /* CClassNode* */ void* cc_arg)
7797 {
7798 int found;
7799 CClassNode* cc = (CClassNode* )cc_arg;
7800
7801 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
7802 if (IS_NULL(cc->mbuf)) {
7803 found = 0;
7804 }
7805 else {
7806 found = onig_is_in_code_range(cc->mbuf->p, code) != 0;
7807 }
7808 }
7809 else {
7810 found = BITSET_AT(cc->bs, code) != 0;
7811 }
7812
7813 if (IS_NCCLASS_NOT(cc))
7814 return !found;
7815 else
7816 return found;
7817 }
7818
7819 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)7820 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
7821 {
7822 int len;
7823
7824 if (ONIGENC_MBC_MINLEN(enc) > 1) {
7825 len = 2;
7826 }
7827 else {
7828 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
7829 if (len < 0) return 0;
7830 }
7831 return onig_is_code_in_cc_len(len, code, cc);
7832 }
7833
7834
/* Repeat-count threshold of the slow-pattern heuristics: a quantifier whose
   upper bound is infinite or greater than this value is examined with
   mostly_just_anychar() by detect_can_be_slow(). */
#define MANY_REPEAT_OF_ANYCHAR 20

/* Verdict returned by mostly_just_anychar() for a subtree. */
typedef enum {
  MJ_NO = 0,     /* subtree contains something other than any-char (e.g. a non-empty string or a char class) */
  MJ_YES = 1,    /* an any-char element was found and nothing contradicted it */
  MJ_IGNORE = 2, /* subtree does not influence the verdict (e.g. empty string, look-around, backref) */
} MJ_RESULT;
7842
7843 static MJ_RESULT
mostly_just_anychar(Node * node,int in_reluctant)7844 mostly_just_anychar(Node* node, int in_reluctant)
7845 {
7846 MJ_RESULT r;
7847
7848 r = MJ_NO;
7849 switch (NODE_TYPE(node)) {
7850 case NODE_LIST:
7851 {
7852 int found = FALSE;
7853 do {
7854 r = mostly_just_anychar(NODE_CAR(node), in_reluctant);
7855 if (r == MJ_NO) break;
7856 if (r == MJ_YES) found = TRUE;
7857 } while (IS_NOT_NULL(node = NODE_CDR(node)));
7858 if (r == MJ_IGNORE) {
7859 if (found == TRUE) r = MJ_YES;
7860 }
7861 }
7862 break;
7863
7864 case NODE_ALT:
7865 r = MJ_IGNORE;
7866 do {
7867 r = mostly_just_anychar(NODE_CAR(node), in_reluctant);
7868 if (r == MJ_YES) break;
7869 } while (IS_NOT_NULL(node = NODE_CDR(node)));
7870 break;
7871
7872 case NODE_QUANT:
7873 {
7874 QuantNode* qn = QUANT_(node);
7875
7876 if (qn->upper == 0)
7877 r = MJ_IGNORE;
7878 else {
7879 if (in_reluctant == FALSE) {
7880 if (qn->greedy != 0 &&
7881 (! IS_INFINITE_REPEAT(qn->upper) &&
7882 qn->upper <= MANY_REPEAT_OF_ANYCHAR)) {
7883 in_reluctant = TRUE;
7884 }
7885 }
7886 r = mostly_just_anychar(NODE_BODY(node), in_reluctant);
7887 }
7888 }
7889 break;
7890
7891 case NODE_ANCHOR:
7892 switch (ANCHOR_(node)->type) {
7893 case ANCR_PREC_READ:
7894 case ANCR_PREC_READ_NOT:
7895 case ANCR_LOOK_BEHIND:
7896 case ANCR_LOOK_BEHIND_NOT:
7897 case ANCR_TEXT_SEGMENT_BOUNDARY: /* \y */
7898 r = MJ_IGNORE;
7899 break;
7900 default:
7901 break;
7902 }
7903 break;
7904
7905 case NODE_BAG:
7906 {
7907 BagNode* en = BAG_(node);
7908
7909 if (en->type == BAG_IF_ELSE) {
7910 if (IS_NOT_NULL(en->te.Then)) {
7911 r = mostly_just_anychar(en->te.Then, in_reluctant);
7912 if (r == MJ_YES) break;
7913 }
7914 if (IS_NOT_NULL(en->te.Else)) {
7915 r = mostly_just_anychar(en->te.Else, in_reluctant);
7916 }
7917 }
7918 else {
7919 r = mostly_just_anychar(NODE_BODY(node), in_reluctant);
7920 }
7921 }
7922 break;
7923
7924 case NODE_CTYPE:
7925 if (CTYPE_(node)->ctype == CTYPE_ANYCHAR)
7926 r = MJ_YES;
7927 else
7928 r = MJ_NO;
7929 break;
7930
7931 case NODE_STRING:
7932 if (NODE_STRING_LEN(node) == 0) {
7933 r = MJ_IGNORE;
7934 break;
7935 }
7936 /* fall */
7937 case NODE_CCLASS:
7938 r = MJ_NO;
7939 break;
7940
7941 #ifdef USE_CALL
7942 case NODE_CALL:
7943 /* ignore call */
7944 #endif
7945 case NODE_BACKREF:
7946 case NODE_GIMMICK:
7947 r = MJ_IGNORE;
7948 break;
7949
7950 default:
7951 break;
7952 }
7953
7954 return r;
7955 }
7956
/* Size of the `calls` array used by detect_can_be_slow(); also bounds how
   many distinct called groups are expanded along one path. */
#define MAX_CALLS_IN_DETECT 10

/* Counters filled in by detect_can_be_slow() while walking a pattern tree. */
typedef struct {
  int prec_read;                  /* look-ahead anchors (positive and negative) */
  int look_behind;                /* look-behind anchors (positive and negative) */
  int backref;                    /* back references without a nest level */
  int backref_with_level;         /* back references with a nest level */
  int call;                       /* call nodes visited */
  int anychar_reluctant_many;     /* many-repeat quantifiers whose body is mostly any-char */
  int empty_check_nest_level;     /* current nesting depth of quantifiers whose body may be empty */
  int max_empty_check_nest_level; /* deepest value reached by empty_check_nest_level */
  int heavy_element;              /* heavily weighted findings (multiplied by 10 in the final score) */
} SlowElementCount;
7970
7971 static int
detect_can_be_slow(Node * node,SlowElementCount * ct,int ncall,int calls[])7972 detect_can_be_slow(Node* node, SlowElementCount* ct, int ncall, int calls[])
7973 {
7974 int r;
7975
7976 r = 0;
7977 switch (NODE_TYPE(node)) {
7978 case NODE_LIST:
7979 case NODE_ALT:
7980 do {
7981 r = detect_can_be_slow(NODE_CAR(node), ct, ncall, calls);
7982 if (r != 0) return r;
7983 } while (IS_NOT_NULL(node = NODE_CDR(node)));
7984 break;
7985
7986 case NODE_QUANT:
7987 {
7988 int prev_heavy_element;
7989 QuantNode* qn;
7990 Node* body;
7991
7992 qn = QUANT_(node);
7993 body = NODE_BODY(node);
7994
7995 if (qn->emptiness != BODY_IS_NOT_EMPTY) {
7996 prev_heavy_element = ct->heavy_element;
7997 ct->empty_check_nest_level++;
7998 if (ct->empty_check_nest_level > ct->max_empty_check_nest_level)
7999 ct->max_empty_check_nest_level = ct->empty_check_nest_level;
8000 }
8001 else if (IS_INFINITE_REPEAT(qn->upper) ||
8002 qn->upper > MANY_REPEAT_OF_ANYCHAR) {
8003 MJ_RESULT mr = mostly_just_anychar(body, (qn->greedy == 0));
8004 if (mr == MJ_YES)
8005 ct->anychar_reluctant_many++;
8006 }
8007
8008 r = detect_can_be_slow(body, ct, ncall, calls);
8009
8010 if (qn->emptiness != BODY_IS_NOT_EMPTY) {
8011 if (NODE_IS_INPEEK(node)) {
8012 if (ct->empty_check_nest_level > 2) {
8013 if (prev_heavy_element == ct->heavy_element)
8014 ct->heavy_element++;
8015 }
8016 }
8017 ct->empty_check_nest_level--;
8018 }
8019 }
8020 break;
8021
8022 case NODE_ANCHOR:
8023 switch (ANCHOR_(node)->type) {
8024 case ANCR_PREC_READ:
8025 case ANCR_PREC_READ_NOT:
8026 ct->prec_read++;
8027 break;
8028 case ANCR_LOOK_BEHIND:
8029 case ANCR_LOOK_BEHIND_NOT:
8030 ct->look_behind++;
8031 break;
8032 default:
8033 break;
8034 }
8035
8036 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
8037 r = detect_can_be_slow(NODE_BODY(node), ct, ncall, calls);
8038 break;
8039
8040 case NODE_BAG:
8041 {
8042 BagNode* en = BAG_(node);
8043
8044 r = detect_can_be_slow(NODE_BODY(node), ct, ncall, calls);
8045 if (r != 0) return r;
8046
8047 if (en->type == BAG_IF_ELSE) {
8048 if (IS_NOT_NULL(en->te.Then)) {
8049 r = detect_can_be_slow(en->te.Then, ct, ncall, calls);
8050 if (r != 0) return r;
8051 }
8052 if (IS_NOT_NULL(en->te.Else)) {
8053 r = detect_can_be_slow(en->te.Else, ct, ncall, calls);
8054 if (r != 0) return r;
8055 }
8056 }
8057 }
8058 break;
8059
8060 #ifdef USE_BACKREF_WITH_LEVEL
8061 case NODE_BACKREF:
8062 if (NODE_IS_NEST_LEVEL(node))
8063 ct->backref_with_level++;
8064 else
8065 ct->backref++;
8066 break;
8067 #endif
8068
8069 #ifdef USE_CALL
8070 case NODE_CALL:
8071 {
8072 int i;
8073 int found;
8074 int gnum;
8075
8076 gnum = CALL_(node)->called_gnum;
8077 ct->call++;
8078
8079 if (NODE_IS_RECURSION(node) && NODE_IS_INPEEK(node) &&
8080 NODE_IS_IN_REAL_REPEAT(node)) {
8081 ct->heavy_element += 10;
8082 }
8083
8084 found = FALSE;
8085 for (i = 0; i < ncall; i++) {
8086 if (gnum == calls[i]) {
8087 found = TRUE;
8088 break;
8089 }
8090 }
8091
8092 if (! found) {
8093 if (ncall + 1 < MAX_CALLS_IN_DETECT) {
8094 calls[ncall] = gnum;
8095 r = detect_can_be_slow(NODE_BODY(node), ct, ncall + 1, calls);
8096 }
8097 else {
8098 ct->heavy_element++;
8099 }
8100 }
8101 }
8102 break;
8103 #endif
8104
8105 default:
8106 break;
8107 }
8108
8109 return r;
8110 }
8111
8112 extern int
onig_detect_can_be_slow_pattern(const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax)8113 onig_detect_can_be_slow_pattern(const UChar* pattern,
8114 const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
8115 OnigSyntaxType* syntax)
8116 {
8117 int r;
8118 regex_t* reg;
8119 Node* root;
8120 ParseEnv scan_env;
8121 SlowElementCount count;
8122 int calls[MAX_CALLS_IN_DETECT];
8123 #ifdef USE_CALL
8124 UnsetAddrList uslist = {0};
8125 #endif
8126
8127 reg = (regex_t* )xmalloc(sizeof(regex_t));
8128 if (IS_NULL(reg)) return ONIGERR_MEMORY;
8129
8130 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
8131 if (r != 0) {
8132 xfree(reg);
8133 return r;
8134 }
8135
8136 r = parse_and_tune(reg, pattern, pattern_end, &scan_env, &root, NULL
8137 #ifdef USE_CALL
8138 , &uslist
8139 #endif
8140 );
8141 if (r != 0) goto err;
8142
8143 #ifdef USE_CALL
8144 if (scan_env.num_call > 0) {
8145 unset_addr_list_end(&uslist);
8146 }
8147 #endif
8148
8149 count.prec_read = 0;
8150 count.look_behind = 0;
8151 count.backref = 0;
8152 count.backref_with_level = 0;
8153 count.call = 0;
8154 count.anychar_reluctant_many = 0;
8155 count.empty_check_nest_level = 0;
8156 count.max_empty_check_nest_level = 0;
8157 count.heavy_element = 0;
8158
8159 r = detect_can_be_slow(root, &count, 0, calls);
8160 if (r == 0) {
8161 int n = count.prec_read + count.look_behind
8162 + count.backref + count.backref_with_level + count.call
8163 + count.anychar_reluctant_many;
8164 if (count.heavy_element != 0)
8165 n += count.heavy_element * 10;
8166
8167 r = n;
8168 }
8169
8170 if (IS_NOT_NULL(scan_env.mem_env_dynamic))
8171 xfree(scan_env.mem_env_dynamic);
8172
8173 err:
8174 onig_node_free(root);
8175 onig_free(reg);
8176 return r;
8177 }
8178
8179
8180 #ifdef ONIG_DEBUG_PARSE
8181
8182 #ifdef USE_CALL
8183 static void
p_string(FILE * f,int len,UChar * s)8184 p_string(FILE* f, int len, UChar* s)
8185 {
8186 fputs(":", f);
8187 while (len-- > 0) { fputc(*s++, f); }
8188 }
8189 #endif
8190
/* Write `indent` space characters to `f` (nothing when indent <= 0). */
static void
Indent(FILE* f, int indent)
{
  int remaining = indent;

  while (remaining > 0) {
    putc(' ', f);
    remaining--;
  }
}
8197
8198 static void
print_indent_tree(FILE * f,Node * node,int indent)8199 print_indent_tree(FILE* f, Node* node, int indent)
8200 {
8201 static char* emptiness_name[] = { "", " empty", " empty_mem", " empty_rec" };
8202
8203 int i;
8204 NodeType type;
8205 UChar* p;
8206 int add = 3;
8207
8208 Indent(f, indent);
8209 if (IS_NULL(node)) {
8210 fprintf(f, "ERROR: null node!!!\n");
8211 exit(0);
8212 }
8213
8214 type = NODE_TYPE(node);
8215 switch (type) {
8216 case NODE_LIST:
8217 case NODE_ALT:
8218 if (type == NODE_LIST)
8219 fprintf(f, "<list:%p>\n", node);
8220 else
8221 fprintf(f, "<alt:%p>\n", node);
8222
8223 print_indent_tree(f, NODE_CAR(node), indent + add);
8224 while (IS_NOT_NULL(node = NODE_CDR(node))) {
8225 if (NODE_TYPE(node) != type) {
8226 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NODE_TYPE(node));
8227 exit(0);
8228 }
8229 print_indent_tree(f, NODE_CAR(node), indent + add);
8230 }
8231 break;
8232
8233 case NODE_STRING:
8234 {
8235 char* str;
8236 char* mode;
8237
8238 if (NODE_STRING_IS_CRUDE(node))
8239 mode = "-crude";
8240 else if (NODE_IS_IGNORECASE(node))
8241 mode = "-ignorecase";
8242 else
8243 mode = "";
8244
8245 if (STR_(node)->s == STR_(node)->end)
8246 str = "empty-string";
8247 else
8248 str = "string";
8249
8250 fprintf(f, "<%s%s:%p>", str, mode, node);
8251 for (p = STR_(node)->s; p < STR_(node)->end; p++) {
8252 if (*p >= 0x20 && *p < 0x7f)
8253 fputc(*p, f);
8254 else {
8255 fprintf(f, " 0x%02x", *p);
8256 }
8257 }
8258 }
8259 break;
8260
8261 case NODE_CCLASS:
8262 #define CCLASS_MBUF_MAX_OUTPUT_NUM 10
8263
8264 fprintf(f, "<cclass:%p>", node);
8265 if (IS_NCCLASS_NOT(CCLASS_(node))) fputs(" not", f);
8266 if (CCLASS_(node)->mbuf) {
8267 BBuf* bbuf = CCLASS_(node)->mbuf;
8268 fprintf(f, " mbuf(%u) ", bbuf->used);
8269 for (i = 0; i < bbuf->used && i < CCLASS_MBUF_MAX_OUTPUT_NUM; i++) {
8270 if (i > 0) fprintf(f, ",");
8271 fprintf(f, "%0x", bbuf->p[i]);
8272 }
8273 if (i < bbuf->used) fprintf(f, "...");
8274 }
8275 break;
8276
8277 case NODE_CTYPE:
8278 fprintf(f, "<ctype:%p> ", node);
8279 switch (CTYPE_(node)->ctype) {
8280 case CTYPE_ANYCHAR:
8281 fprintf(f, "anychar");
8282 break;
8283
8284 case ONIGENC_CTYPE_WORD:
8285 if (CTYPE_(node)->not != 0)
8286 fputs("not word", f);
8287 else
8288 fputs("word", f);
8289
8290 if (CTYPE_(node)->ascii_mode != 0)
8291 fputs(" (ascii)", f);
8292
8293 break;
8294
8295 default:
8296 fprintf(f, "ERROR: undefined ctype.\n");
8297 exit(0);
8298 }
8299 break;
8300
8301 case NODE_ANCHOR:
8302 fprintf(f, "<anchor:%p> ", node);
8303 switch (ANCHOR_(node)->type) {
8304 case ANCR_BEGIN_BUF: fputs("begin buf", f); break;
8305 case ANCR_END_BUF: fputs("end buf", f); break;
8306 case ANCR_BEGIN_LINE: fputs("begin line", f); break;
8307 case ANCR_END_LINE: fputs("end line", f); break;
8308 case ANCR_SEMI_END_BUF: fputs("semi end buf", f); break;
8309 case ANCR_BEGIN_POSITION: fputs("begin position", f); break;
8310
8311 case ANCR_WORD_BOUNDARY: fputs("word boundary", f); break;
8312 case ANCR_NO_WORD_BOUNDARY: fputs("not word boundary", f); break;
8313 #ifdef USE_WORD_BEGIN_END
8314 case ANCR_WORD_BEGIN: fputs("word begin", f); break;
8315 case ANCR_WORD_END: fputs("word end", f); break;
8316 #endif
8317 case ANCR_TEXT_SEGMENT_BOUNDARY:
8318 fputs("text-segment boundary", f); break;
8319 case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
8320 fputs("no text-segment boundary", f); break;
8321 case ANCR_PREC_READ:
8322 fprintf(f, "prec read\n");
8323 print_indent_tree(f, NODE_BODY(node), indent + add);
8324 break;
8325 case ANCR_PREC_READ_NOT:
8326 fprintf(f, "prec read not\n");
8327 print_indent_tree(f, NODE_BODY(node), indent + add);
8328 break;
8329 case ANCR_LOOK_BEHIND:
8330 fprintf(f, "look behind\n");
8331 print_indent_tree(f, NODE_BODY(node), indent + add);
8332 break;
8333 case ANCR_LOOK_BEHIND_NOT:
8334 fprintf(f, "look behind not\n");
8335 print_indent_tree(f, NODE_BODY(node), indent + add);
8336 break;
8337
8338 default:
8339 fprintf(f, "ERROR: undefined anchor type.\n");
8340 break;
8341 }
8342 break;
8343
8344 case NODE_BACKREF:
8345 {
8346 int* p;
8347 BackRefNode* br = BACKREF_(node);
8348 p = BACKREFS_P(br);
8349 fprintf(f, "<backref%s:%p>", NODE_IS_CHECKER(node) ? "-checker" : "", node);
8350 for (i = 0; i < br->back_num; i++) {
8351 if (i > 0) fputs(", ", f);
8352 fprintf(f, "%d", p[i]);
8353 }
8354 #ifdef USE_BACKREF_WITH_LEVEL
8355 if (NODE_IS_NEST_LEVEL(node)) {
8356 fprintf(f, ", level: %d", br->nest_level);
8357 }
8358 #endif
8359 }
8360 break;
8361
8362 #ifdef USE_CALL
8363 case NODE_CALL:
8364 {
8365 CallNode* cn = CALL_(node);
8366 fprintf(f, "<call:%p>", node);
8367 fprintf(f, " num: %d, name", cn->called_gnum);
8368 p_string(f, cn->name_end - cn->name, cn->name);
8369 if (NODE_IS_RECURSION(node)) fprintf(f, ", recursion");
8370 if (NODE_IS_INPEEK(node)) fprintf(f, ", in-peek");
8371 if (NODE_IS_IN_REAL_REPEAT(node)) fprintf(f, ", in-real-repeat");
8372 }
8373 break;
8374 #endif
8375
8376 case NODE_QUANT:
8377 {
8378 fprintf(f, "<quantifier:%p>{%d,%d}%s%s%s", node,
8379 QUANT_(node)->lower, QUANT_(node)->upper,
8380 (QUANT_(node)->greedy ? "" : "?"),
8381 QUANT_(node)->include_referred == 0 ? "" : " referred",
8382 emptiness_name[QUANT_(node)->emptiness]);
8383 if (NODE_IS_INPEEK(node)) fprintf(f, ", in-peek");
8384 fprintf(f, "\n");
8385 print_indent_tree(f, NODE_BODY(node), indent + add);
8386 }
8387 break;
8388
8389 case NODE_BAG:
8390 {
8391 BagNode* bn = BAG_(node);
8392 fprintf(f, "<bag:%p> ", node);
8393 if (bn->type == BAG_IF_ELSE) {
8394 Node* Then;
8395 Node* Else;
8396
8397 fprintf(f, "if-else\n");
8398 print_indent_tree(f, NODE_BODY(node), indent + add);
8399
8400 Then = bn->te.Then;
8401 Else = bn->te.Else;
8402 if (IS_NULL(Then)) {
8403 Indent(f, indent + add);
8404 fprintf(f, "THEN empty\n");
8405 }
8406 else
8407 print_indent_tree(f, Then, indent + add);
8408
8409 if (IS_NULL(Else)) {
8410 Indent(f, indent + add);
8411 fprintf(f, "ELSE empty\n");
8412 }
8413 else
8414 print_indent_tree(f, Else, indent + add);
8415 }
8416 else {
8417 switch (bn->type) {
8418 case BAG_OPTION:
8419 fprintf(f, "option:%d", bn->o.options);
8420 break;
8421 case BAG_MEMORY:
8422 fprintf(f, "memory:%d", bn->m.regnum);
8423 if (NODE_IS_CALLED(node)) {
8424 fprintf(f, ", called");
8425 if (NODE_IS_RECURSION(node))
8426 fprintf(f, ", recursion");
8427 }
8428 else if (NODE_IS_REFERENCED(node))
8429 fprintf(f, ", referenced");
8430
8431 if (NODE_IS_FIXED_ADDR(node))
8432 fprintf(f, ", fixed-addr");
8433 if ((bn->m.called_state & IN_PEEK) != 0)
8434 fprintf(f, ", in-peek");
8435 break;
8436 case BAG_STOP_BACKTRACK:
8437 fprintf(f, "stop-bt");
8438 break;
8439 default:
8440 break;
8441 }
8442 fprintf(f, "\n");
8443 print_indent_tree(f, NODE_BODY(node), indent + add);
8444 }
8445 }
8446 break;
8447
8448 case NODE_GIMMICK:
8449 fprintf(f, "<gimmick:%p> ", node);
8450 switch (GIMMICK_(node)->type) {
8451 case GIMMICK_FAIL:
8452 fprintf(f, "fail");
8453 break;
8454 case GIMMICK_SAVE:
8455 fprintf(f, "save:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
8456 break;
8457 case GIMMICK_UPDATE_VAR:
8458 fprintf(f, "update_var:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
8459 break;
8460 #ifdef USE_CALLOUT
8461 case GIMMICK_CALLOUT:
8462 switch (GIMMICK_(node)->detail_type) {
8463 case ONIG_CALLOUT_OF_CONTENTS:
8464 fprintf(f, "callout:contents:%d", GIMMICK_(node)->num);
8465 break;
8466 case ONIG_CALLOUT_OF_NAME:
8467 fprintf(f, "callout:name:%d:%d", GIMMICK_(node)->id, GIMMICK_(node)->num);
8468 break;
8469 }
8470 #endif
8471 }
8472 break;
8473
8474 default:
8475 fprintf(f, "print_indent_tree: undefined node type %d\n", NODE_TYPE(node));
8476 break;
8477 }
8478
8479 if (type != NODE_LIST && type != NODE_ALT && type != NODE_QUANT &&
8480 type != NODE_BAG)
8481 fprintf(f, "\n");
8482 fflush(f);
8483 }
8484
/* Debug helper: print the whole parse tree rooted at `node` to `f`,
   starting at indentation level 0. */
static void
print_tree(FILE* f, Node* node)
{
  print_indent_tree(f, node, 0);
}
8490 #endif
8491