/**********************************************************************
  regcomp.c -  Oniguruma (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2002-2021  K.Kosako
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
29 
30 #include "regparse.h"
31 
/* Not referenced in this part of the file; presumably the initial
   capacity handed to ops_init() -- confirm at the call site. */
#define OPS_INIT_SIZE  8

/* True when the node carries the ignore-case status and is not a
   "crude" string node. */
#define NODE_IS_REAL_IGNORECASE(node) \
  (NODE_IS_IGNORECASE(node) && !NODE_STRING_IS_CRUDE(node))
36 
37 typedef struct {
38   OnigLen min;
39   OnigLen max;
40 } MinMaxLen;
41 
42 typedef struct {
43   OnigLen min;
44   OnigLen max;
45   int min_is_sure;
46 } MinMaxCharLen;
47 
48 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
49 
50 static OnigLen node_min_byte_len(Node* node, ParseEnv* env);
51 
52 #if 0
/* Growable stack of ints (this code is compiled out by the enclosing #if 0). */
typedef struct {
  int  n;      /* number of stored elements */
  int  alloc;  /* capacity of v[], in elements */
  int* v;      /* element storage */
} int_stack;
58 
59 static int
60 make_int_stack(int_stack** rs, int init_size)
61 {
62   int_stack* s;
63   int* v;
64 
65   *rs = 0;
66 
67   s = xmalloc(sizeof(*s));
68   if (IS_NULL(s)) return ONIGERR_MEMORY;
69 
70   v = (int* )xmalloc(sizeof(int) * init_size);
71   if (IS_NULL(v)) {
72     xfree(s);
73     return ONIGERR_MEMORY;
74   }
75 
76   s->n = 0;
77   s->alloc = init_size;
78   s->v = v;
79 
80   *rs = s;
81   return ONIG_NORMAL;
82 }
83 
84 static void
85 free_int_stack(int_stack* s)
86 {
87   if (IS_NOT_NULL(s)) {
88     if (IS_NOT_NULL(s->v))
89       xfree(s->v);
90     xfree(s);
91   }
92 }
93 
94 static int
95 int_stack_push(int_stack* s, int v)
96 {
97   if (s->n >= s->alloc) {
98     int new_size = s->alloc * 2;
99     int* nv = (int* )xrealloc(s->v, sizeof(int) * new_size);
100     if (IS_NULL(nv)) return ONIGERR_MEMORY;
101 
102     s->alloc = new_size;
103     s->v = nv;
104   }
105 
106   s->v[s->n] = v;
107   s->n++;
108   return ONIG_NORMAL;
109 }
110 
111 static int
112 int_stack_pop(int_stack* s)
113 {
114   int v;
115 
116 #ifdef ONIG_DEBUG
117   if (s->n <= 0) {
118     fprintf(DBGFP, "int_stack_pop: fail empty. %p\n", s);
119     return 0;
120   }
121 #endif
122 
123   v = s->v[s->n];
124   s->n--;
125   return v;
126 }
127 #endif
128 
129 static int
ops_init(regex_t * reg,int init_alloc_size)130 ops_init(regex_t* reg, int init_alloc_size)
131 {
132   Operation* p;
133   size_t size;
134 
135   if (init_alloc_size <= 0)
136     return ONIGERR_PARSER_BUG;
137 
138   size = sizeof(Operation) * init_alloc_size;
139   p = (Operation* )xrealloc(reg->ops, size);
140   CHECK_NULL_RETURN_MEMERR(p);
141   reg->ops = p;
142 #ifdef USE_DIRECT_THREADED_CODE
143   {
144     enum OpCode* cp;
145     size = sizeof(enum OpCode) * init_alloc_size;
146     cp = (enum OpCode* )xrealloc(reg->ocs, size);
147     CHECK_NULL_RETURN_MEMERR(cp);
148     reg->ocs = cp;
149   }
150 #endif
151 
152   reg->ops_curr  = 0; /* !!! not yet done ops_new() */
153   reg->ops_alloc = init_alloc_size;
154   reg->ops_used  = 0;
155 
156   return ONIG_NORMAL;
157 }
158 
159 static int
ops_resize(regex_t * reg,int n)160 ops_resize(regex_t* reg, int n)
161 {
162 #ifdef USE_DIRECT_THREADED_CODE
163   enum OpCode* cp;
164 #endif
165   Operation* p;
166   size_t size;
167 
168   if (n == reg->ops_alloc) return ONIG_NORMAL;
169   if (n <= 0) return ONIGERR_PARSER_BUG;
170 
171   size = sizeof(Operation) * n;
172   p = (Operation* )xrealloc(reg->ops, size);
173   CHECK_NULL_RETURN_MEMERR(p);
174   reg->ops = p;
175 
176 #ifdef USE_DIRECT_THREADED_CODE
177   size = sizeof(enum OpCode) * n;
178   cp = (enum OpCode* )xrealloc(reg->ocs, size);
179   CHECK_NULL_RETURN_MEMERR(cp);
180   reg->ocs = cp;
181 #endif
182 
183   reg->ops_alloc = n;
184   if (reg->ops_used == 0)
185     reg->ops_curr = 0;
186   else
187     reg->ops_curr = reg->ops + (reg->ops_used - 1);
188 
189   return ONIG_NORMAL;
190 }
191 
192 static int
ops_new(regex_t * reg)193 ops_new(regex_t* reg)
194 {
195   if (reg->ops_used >= reg->ops_alloc) {
196     int r = ops_resize(reg, reg->ops_alloc << 1);
197     if (r != ONIG_NORMAL) return r;
198   }
199 
200   reg->ops_curr = reg->ops + reg->ops_used;
201   reg->ops_used++;
202 
203   xmemset(reg->ops_curr, 0, sizeof(Operation));
204   return ONIG_NORMAL;
205 }
206 
207 static int
is_in_string_pool(regex_t * reg,UChar * s)208 is_in_string_pool(regex_t* reg, UChar* s)
209 {
210   return (s >= reg->string_pool && s < reg->string_pool_end);
211 }
212 
213 static void
ops_free(regex_t * reg)214 ops_free(regex_t* reg)
215 {
216   int i;
217 
218   if (IS_NULL(reg->ops)) return ;
219 
220   for (i = 0; i < (int )reg->ops_used; i++) {
221     enum OpCode opcode;
222     Operation* op;
223 
224     op = reg->ops + i;
225 
226 #ifdef USE_DIRECT_THREADED_CODE
227     opcode = *(reg->ocs + i);
228 #else
229     opcode = op->opcode;
230 #endif
231 
232     switch (opcode) {
233     case OP_STR_MBN:
234       if (! is_in_string_pool(reg, op->exact_len_n.s))
235         xfree(op->exact_len_n.s);
236       break;
237     case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N:
238       if (! is_in_string_pool(reg, op->exact_n.s))
239         xfree(op->exact_n.s);
240       break;
241     case OP_STR_1: case OP_STR_2: case OP_STR_3: case OP_STR_4:
242     case OP_STR_5: case OP_STR_MB2N1: case OP_STR_MB2N2:
243     case OP_STR_MB2N3:
244       break;
245 
246     case OP_CCLASS_NOT: case OP_CCLASS:
247       xfree(op->cclass.bsp);
248       break;
249 
250     case OP_CCLASS_MB_NOT: case OP_CCLASS_MB:
251       xfree(op->cclass_mb.mb);
252       break;
253     case OP_CCLASS_MIX_NOT: case OP_CCLASS_MIX:
254       xfree(op->cclass_mix.mb);
255       xfree(op->cclass_mix.bsp);
256       break;
257 
258     case OP_BACKREF1: case OP_BACKREF2: case OP_BACKREF_N: case OP_BACKREF_N_IC:
259       break;
260     case OP_BACKREF_MULTI:      case OP_BACKREF_MULTI_IC:
261     case OP_BACKREF_CHECK:
262 #ifdef USE_BACKREF_WITH_LEVEL
263     case OP_BACKREF_WITH_LEVEL:
264     case OP_BACKREF_WITH_LEVEL_IC:
265     case OP_BACKREF_CHECK_WITH_LEVEL:
266 #endif
267       if (op->backref_general.num != 1)
268         xfree(op->backref_general.ns);
269       break;
270 
271     default:
272       break;
273     }
274   }
275 
276   xfree(reg->ops);
277 #ifdef USE_DIRECT_THREADED_CODE
278   xfree(reg->ocs);
279   reg->ocs = 0;
280 #endif
281 
282   reg->ops = 0;
283   reg->ops_curr  = 0;
284   reg->ops_alloc = 0;
285   reg->ops_used  = 0;
286 }
287 
288 static int
ops_calc_size_of_string_pool(regex_t * reg)289 ops_calc_size_of_string_pool(regex_t* reg)
290 {
291   int i;
292   int total;
293 
294   if (IS_NULL(reg->ops)) return 0;
295 
296   total = 0;
297   for (i = 0; i < (int )reg->ops_used; i++) {
298     enum OpCode opcode;
299     Operation* op;
300 
301     op = reg->ops + i;
302 #ifdef USE_DIRECT_THREADED_CODE
303     opcode = *(reg->ocs + i);
304 #else
305     opcode = op->opcode;
306 #endif
307 
308     switch (opcode) {
309     case OP_STR_MBN:
310       total += op->exact_len_n.len * op->exact_len_n.n;
311       break;
312     case OP_STR_N:
313     case OP_STR_MB2N:
314       total += op->exact_n.n * 2;
315       break;
316     case OP_STR_MB3N:
317       total += op->exact_n.n * 3;
318       break;
319 
320     default:
321       break;
322     }
323   }
324 
325   return total;
326 }
327 
328 static int
ops_make_string_pool(regex_t * reg)329 ops_make_string_pool(regex_t* reg)
330 {
331   int i;
332   int len;
333   int size;
334   UChar* pool;
335   UChar* curr;
336 
337   size = ops_calc_size_of_string_pool(reg);
338   if (size <= 0) {
339     return 0;
340   }
341 
342   curr = pool = (UChar* )xmalloc((size_t )size);
343   CHECK_NULL_RETURN_MEMERR(pool);
344 
345   for (i = 0; i < (int )reg->ops_used; i++) {
346     enum OpCode opcode;
347     Operation* op;
348 
349     op = reg->ops + i;
350 #ifdef USE_DIRECT_THREADED_CODE
351     opcode = *(reg->ocs + i);
352 #else
353     opcode = op->opcode;
354 #endif
355 
356     switch (opcode) {
357     case OP_STR_MBN:
358       len = op->exact_len_n.len * op->exact_len_n.n;
359       xmemcpy(curr, op->exact_len_n.s, len);
360       xfree(op->exact_len_n.s);
361       op->exact_len_n.s = curr;
362       curr += len;
363       break;
364     case OP_STR_N:
365       len = op->exact_n.n;
366     copy:
367       xmemcpy(curr, op->exact_n.s, len);
368       xfree(op->exact_n.s);
369       op->exact_n.s = curr;
370       curr += len;
371       break;
372     case OP_STR_MB2N:
373       len = op->exact_n.n * 2;
374       goto copy;
375       break;
376     case OP_STR_MB3N:
377       len = op->exact_n.n * 3;
378       goto copy;
379       break;
380 
381     default:
382       break;
383     }
384   }
385 
386   reg->string_pool     = pool;
387   reg->string_pool_end = pool + size;
388   return 0;
389 }
390 
391 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)392 onig_get_default_case_fold_flag(void)
393 {
394   return OnigDefaultCaseFoldFlag;
395 }
396 
397 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)398 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
399 {
400   OnigDefaultCaseFoldFlag = case_fold_flag;
401   return 0;
402 }
403 
404 static int
len_multiply_cmp(OnigLen x,int y,OnigLen v)405 len_multiply_cmp(OnigLen x, int y, OnigLen v)
406 {
407   if (x == 0 || y == 0) return -1;
408 
409   if (x < INFINITE_LEN / y) {
410     OnigLen xy = x * (OnigLen )y;
411     if (xy > v) return 1;
412     else {
413       if (xy == v) return 0;
414       else return -1;
415     }
416   }
417   else
418     return v == INFINITE_LEN ? 0 : 1;
419 }
420 
421 extern int
onig_positive_int_multiply(int x,int y)422 onig_positive_int_multiply(int x, int y)
423 {
424   if (x == 0 || y == 0) return 0;
425 
426   if (x < ONIG_INT_MAX / y)
427     return x * y;
428   else
429     return -1;
430 }
431 
432 
433 static void
node_swap(Node * a,Node * b)434 node_swap(Node* a, Node* b)
435 {
436   Node c;
437 
438   c = *a; *a = *b; *b = c;
439 
440   if (NODE_TYPE(a) == NODE_STRING) {
441     StrNode* sn = STR_(a);
442     if (sn->capacity == 0) {
443       int len = (int )(sn->end - sn->s);
444       sn->s   = sn->buf;
445       sn->end = sn->s + len;
446     }
447   }
448 
449   if (NODE_TYPE(b) == NODE_STRING) {
450     StrNode* sn = STR_(b);
451     if (sn->capacity == 0) {
452       int len = (int )(sn->end - sn->s);
453       sn->s   = sn->buf;
454       sn->end = sn->s + len;
455     }
456   }
457 }
458 
459 static int
node_list_len(Node * list)460 node_list_len(Node* list)
461 {
462   int len;
463 
464   len = 1;
465   while (IS_NOT_NULL(NODE_CDR(list))) {
466     list = NODE_CDR(list);
467     len++;
468   }
469 
470   return len;
471 }
472 
473 static Node*
node_list_add(Node * list,Node * x)474 node_list_add(Node* list, Node* x)
475 {
476   Node *n;
477 
478   n = onig_node_new_list(x, NULL);
479   if (IS_NULL(n)) return NULL_NODE;
480 
481   if (IS_NOT_NULL(list)) {
482     while (IS_NOT_NULL(NODE_CDR(list)))
483       list = NODE_CDR(list);
484 
485     NODE_CDR(list) = n;
486   }
487 
488   return n;
489 }
490 
491 static int
node_str_node_cat(Node * node,Node * add)492 node_str_node_cat(Node* node, Node* add)
493 {
494   int r;
495 
496   if (NODE_STATUS(node) != NODE_STATUS(add))
497     return ONIGERR_TYPE_BUG;
498 
499   if (STR_(node)->flag != STR_(add)->flag)
500     return ONIGERR_TYPE_BUG;
501 
502   r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end);
503   if (r != 0) return r;
504 
505   return 0;
506 }
507 
508 static void
node_conv_to_str_node(Node * node,Node * ref_node)509 node_conv_to_str_node(Node* node, Node* ref_node)
510 {
511   xmemset(node, 0, sizeof(*node));
512   NODE_SET_TYPE(node, NODE_STRING);
513   NODE_STATUS(node) = NODE_STATUS(ref_node);
514 
515   STR_(node)->flag     = STR_(ref_node)->flag;
516   STR_(node)->s        = STR_(node)->buf;
517   STR_(node)->end      = STR_(node)->buf;
518   STR_(node)->capacity = 0;
519 }
520 
521 static OnigLen
distance_add(OnigLen d1,OnigLen d2)522 distance_add(OnigLen d1, OnigLen d2)
523 {
524   if (d1 == INFINITE_LEN || d2 == INFINITE_LEN)
525     return INFINITE_LEN;
526   else {
527     if (d1 <= INFINITE_LEN - d2) return d1 + d2;
528     else return INFINITE_LEN;
529   }
530 }
531 
532 static OnigLen
distance_multiply(OnigLen d,int m)533 distance_multiply(OnigLen d, int m)
534 {
535   if (m == 0) return 0;
536 
537   if (d < INFINITE_LEN / m)
538     return d * m;
539   else
540     return INFINITE_LEN;
541 }
542 
543 static int
bitset_is_empty(BitSetRef bs)544 bitset_is_empty(BitSetRef bs)
545 {
546   int i;
547 
548   for (i = 0; i < (int )BITSET_REAL_SIZE; i++) {
549     if (bs[i] != 0) return 0;
550   }
551   return 1;
552 }
553 
554 #ifdef USE_CALL
555 
556 static int
unset_addr_list_init(UnsetAddrList * list,int size)557 unset_addr_list_init(UnsetAddrList* list, int size)
558 {
559   UnsetAddr* p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
560   CHECK_NULL_RETURN_MEMERR(p);
561 
562   list->num   = 0;
563   list->alloc = size;
564   list->us    = p;
565   return 0;
566 }
567 
568 static void
unset_addr_list_end(UnsetAddrList * list)569 unset_addr_list_end(UnsetAddrList* list)
570 {
571   if (IS_NOT_NULL(list->us))
572     xfree(list->us);
573 }
574 
575 static int
unset_addr_list_add(UnsetAddrList * list,int offset,struct _Node * node)576 unset_addr_list_add(UnsetAddrList* list, int offset, struct _Node* node)
577 {
578   UnsetAddr* p;
579   int size;
580 
581   if (list->num >= list->alloc) {
582     size = list->alloc * 2;
583     p = (UnsetAddr* )xrealloc(list->us, sizeof(UnsetAddr) * size);
584     CHECK_NULL_RETURN_MEMERR(p);
585     list->alloc = size;
586     list->us    = p;
587   }
588 
589   list->us[list->num].offset = offset;
590   list->us[list->num].target = node;
591   list->num++;
592   return 0;
593 }
594 #endif /* USE_CALL */
595 
/* Non-error results of node_char_len1(). */
enum CharLenReturnType {
  CHAR_LEN_NORMAL        = 0,  /* fixed or variable */
  CHAR_LEN_TOP_ALT_FIXED = 1   /* set for an alternation at level 1 whose
                                  later branches are each fixed while the
                                  merged length is not */
};
600 
601 static int
mmcl_fixed(MinMaxCharLen * c)602 mmcl_fixed(MinMaxCharLen* c)
603 {
604   return (c->min == c->max && c->min != INFINITE_LEN);
605 }
606 
607 static void
mmcl_set(MinMaxCharLen * l,OnigLen len)608 mmcl_set(MinMaxCharLen* l, OnigLen len)
609 {
610   l->min = len;
611   l->max = len;
612   l->min_is_sure = TRUE;
613 }
614 
615 static void
mmcl_set_min_max(MinMaxCharLen * l,OnigLen min,OnigLen max,int min_is_sure)616 mmcl_set_min_max(MinMaxCharLen* l, OnigLen min, OnigLen max, int min_is_sure)
617 {
618   l->min = min;
619   l->max = max;
620   l->min_is_sure = min_is_sure;
621 }
622 
623 static void
mmcl_add(MinMaxCharLen * to,MinMaxCharLen * add)624 mmcl_add(MinMaxCharLen* to, MinMaxCharLen* add)
625 {
626   to->min = distance_add(to->min, add->min);
627   to->max = distance_add(to->max, add->max);
628 
629   to->min_is_sure = add->min_is_sure != FALSE && to->min_is_sure != FALSE;
630 }
631 
632 static void
mmcl_multiply(MinMaxCharLen * to,int m)633 mmcl_multiply(MinMaxCharLen* to, int m)
634 {
635   to->min = distance_multiply(to->min, m);
636   to->max = distance_multiply(to->max, m);
637 }
638 
639 static void
mmcl_repeat_range_multiply(MinMaxCharLen * to,int mlow,int mhigh)640 mmcl_repeat_range_multiply(MinMaxCharLen* to, int mlow, int mhigh)
641 {
642   to->min = distance_multiply(to->min, mlow);
643 
644   if (IS_INFINITE_REPEAT(mhigh))
645     to->max = INFINITE_LEN;
646   else
647     to->max = distance_multiply(to->max, mhigh);
648 }
649 
650 static void
mmcl_alt_merge(MinMaxCharLen * to,MinMaxCharLen * alt)651 mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt)
652 {
653   if (to->min > alt->min) {
654     to->min         = alt->min;
655     to->min_is_sure = alt->min_is_sure;
656   }
657   else if (to->min == alt->min) {
658     if (alt->min_is_sure != FALSE)
659       to->min_is_sure = TRUE;
660   }
661 
662   if (to->max < alt->max) to->max = alt->max;
663 }
664 
665 #ifndef ONIG_DONT_OPTIMIZE
666 
667 static int
mml_is_equal(MinMaxLen * a,MinMaxLen * b)668 mml_is_equal(MinMaxLen* a, MinMaxLen* b)
669 {
670   return a->min == b->min && a->max == b->max;
671 }
672 
673 static void
mml_set_min_max(MinMaxLen * l,OnigLen min,OnigLen max)674 mml_set_min_max(MinMaxLen* l, OnigLen min, OnigLen max)
675 {
676   l->min = min;
677   l->max = max;
678 }
679 
680 static void
mml_clear(MinMaxLen * l)681 mml_clear(MinMaxLen* l)
682 {
683   l->min = l->max = 0;
684 }
685 
686 static void
mml_copy(MinMaxLen * to,MinMaxLen * from)687 mml_copy(MinMaxLen* to, MinMaxLen* from)
688 {
689   to->min = from->min;
690   to->max = from->max;
691 }
692 
693 static void
mml_add(MinMaxLen * to,MinMaxLen * add)694 mml_add(MinMaxLen* to, MinMaxLen* add)
695 {
696   to->min = distance_add(to->min, add->min);
697   to->max = distance_add(to->max, add->max);
698 }
699 
700 static void
mml_alt_merge(MinMaxLen * to,MinMaxLen * alt)701 mml_alt_merge(MinMaxLen* to, MinMaxLen* alt)
702 {
703   if (to->min > alt->min) to->min = alt->min;
704   if (to->max < alt->max) to->max = alt->max;
705 }
706 
707 #endif
708 
709 /* fixed size pattern node only */
710 static int
node_char_len1(Node * node,regex_t * reg,MinMaxCharLen * ci,ParseEnv * env,int level)711 node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ParseEnv* env,
712                int level)
713 {
714   MinMaxCharLen tci;
715   int r = CHAR_LEN_NORMAL;
716 
717   level++;
718 
719   switch (NODE_TYPE(node)) {
720   case NODE_LIST:
721     {
722       int first = TRUE;
723       do {
724         r = node_char_len1(NODE_CAR(node), reg, &tci, env, level);
725         if (r < 0) break;
726         if (first == TRUE) {
727           *ci = tci;
728           first = FALSE;
729         }
730         else
731           mmcl_add(ci, &tci);
732       } while (IS_NOT_NULL(node = NODE_CDR(node)));
733     }
734     break;
735 
736   case NODE_ALT:
737     {
738       int fixed;
739 
740       r = node_char_len1(NODE_CAR(node), reg, ci, env, level);
741       if (r < 0) break;
742 
743       fixed = TRUE;
744       while (IS_NOT_NULL(node = NODE_CDR(node))) {
745         r = node_char_len1(NODE_CAR(node), reg, &tci, env, level);
746         if (r < 0) break;
747         if (! mmcl_fixed(&tci))
748           fixed = FALSE;
749         mmcl_alt_merge(ci, &tci);
750       }
751       if (r < 0) break;
752 
753       r = CHAR_LEN_NORMAL;
754       if (mmcl_fixed(ci)) break;
755 
756       if (fixed == TRUE && level == 1) {
757         r = CHAR_LEN_TOP_ALT_FIXED;
758       }
759     }
760     break;
761 
762   case NODE_STRING:
763     {
764       OnigLen clen;
765       StrNode* sn = STR_(node);
766       UChar *s = sn->s;
767 
768       if (NODE_IS_REAL_IGNORECASE(node) &&
769           CASE_FOLD_IS_NOT_ASCII_ONLY(env->case_fold_flag)) {
770         /* Such a case is possible.
771            ex. /(?i)(?<=\1)(a)/
772            Backref node refer to capture group, but it doesn't tune yet.
773          */
774         r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
775         break;
776       }
777 
778       clen = 0;
779       while (s < sn->end) {
780         s += enclen(reg->enc, s);
781         clen = distance_add(clen, 1);
782       }
783       mmcl_set(ci, clen);
784     }
785     break;
786 
787   case NODE_QUANT:
788     {
789       QuantNode* qn = QUANT_(node);
790 
791       if (qn->lower == qn->upper) {
792         if (qn->upper == 0) {
793           mmcl_set(ci, 0);
794         }
795         else {
796           r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
797           if (r < 0) break;
798           mmcl_multiply(ci, qn->lower);
799         }
800       }
801       else {
802         r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
803         if (r < 0) break;
804         mmcl_repeat_range_multiply(ci, qn->lower, qn->upper);
805       }
806     }
807     break;
808 
809 #ifdef USE_CALL
810   case NODE_CALL:
811     if (NODE_IS_RECURSION(node))
812       mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE);
813     else
814       r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
815     break;
816 #endif
817 
818   case NODE_CTYPE:
819   case NODE_CCLASS:
820     mmcl_set(ci, 1);
821     break;
822 
823   case NODE_BAG:
824     {
825       BagNode* en = BAG_(node);
826 
827       switch (en->type) {
828       case BAG_MEMORY:
829         if (NODE_IS_FIXED_CLEN(node)) {
830           mmcl_set_min_max(ci, en->min_char_len, en->max_char_len,
831                            NODE_IS_FIXED_CLEN_MIN_SURE(node));
832         }
833         else {
834           if (NODE_IS_MARK1(node)) {
835             mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE);
836           }
837           else {
838             NODE_STATUS_ADD(node, MARK1);
839             r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
840             NODE_STATUS_REMOVE(node, MARK1);
841             if (r < 0) break;
842 
843             en->min_char_len = ci->min;
844             en->max_char_len = ci->max;
845             NODE_STATUS_ADD(node, FIXED_CLEN);
846             if (ci->min_is_sure != FALSE)
847               NODE_STATUS_ADD(node, FIXED_CLEN_MIN_SURE);
848           }
849         }
850         /* can't optimize look-behind if capture exists. */
851         ci->min_is_sure = FALSE;
852         break;
853       case BAG_OPTION:
854       case BAG_STOP_BACKTRACK:
855         r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
856         break;
857       case BAG_IF_ELSE:
858         {
859           MinMaxCharLen eci;
860 
861           r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
862           if (r < 0) break;
863 
864           if (IS_NOT_NULL(en->te.Then)) {
865             r = node_char_len1(en->te.Then, reg, &tci, env, level);
866             if (r < 0) break;
867             mmcl_add(ci, &tci);
868           }
869 
870           if (IS_NOT_NULL(en->te.Else)) {
871             r = node_char_len1(en->te.Else, reg, &eci, env, level);
872             if (r < 0) break;
873           }
874           else {
875             mmcl_set(&eci, 0);
876           }
877 
878           mmcl_alt_merge(ci, &eci);
879         }
880         break;
881       default: /* never come here */
882         r = ONIGERR_PARSER_BUG;
883         break;
884       }
885     }
886     break;
887 
888   case NODE_GIMMICK:
889     mmcl_set(ci, 0);
890     break;
891 
892   case NODE_ANCHOR:
893   zero:
894     mmcl_set(ci, 0);
895     /* can't optimize look-behind if anchor exists. */
896     ci->min_is_sure = FALSE;
897     break;
898 
899   case NODE_BACKREF:
900     if (NODE_IS_CHECKER(node))
901       goto zero;
902 
903     if (NODE_IS_RECURSION(node)) {
904 #ifdef USE_BACKREF_WITH_LEVEL
905       if (NODE_IS_NEST_LEVEL(node)) {
906         mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE);
907         break;
908       }
909 #endif
910 
911       mmcl_set_min_max(ci, 0, 0, FALSE);
912       break;
913     }
914 
915     {
916       int i;
917       int* backs;
918       MemEnv* mem_env = PARSEENV_MEMENV(env);
919       BackRefNode* br = BACKREF_(node);
920 
921       backs = BACKREFS_P(br);
922       r = node_char_len1(mem_env[backs[0]].mem_node, reg, ci, env, level);
923       if (r < 0) break;
924       if (! mmcl_fixed(ci)) ci->min_is_sure = FALSE;
925 
926       for (i = 1; i < br->back_num; i++) {
927         r = node_char_len1(mem_env[backs[i]].mem_node, reg, &tci, env, level);
928         if (r < 0) break;
929         if (! mmcl_fixed(&tci)) tci.min_is_sure = FALSE;
930         mmcl_alt_merge(ci, &tci);
931       }
932     }
933     break;
934 
935   default: /* never come here */
936     r = ONIGERR_PARSER_BUG;
937     break;
938   }
939 
940   return r;
941 }
942 
943 static int
node_char_len(Node * node,regex_t * reg,MinMaxCharLen * ci,ParseEnv * env)944 node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ParseEnv* env)
945 {
946   return node_char_len1(node, reg, ci, env, 0);
947 }
948 
949 
950 static int
add_op(regex_t * reg,int opcode)951 add_op(regex_t* reg, int opcode)
952 {
953   int r;
954 
955   r = ops_new(reg);
956   if (r != ONIG_NORMAL) return r;
957 
958 #ifdef USE_DIRECT_THREADED_CODE
959   *(reg->ocs + (reg->ops_curr - reg->ops)) = opcode;
960 #else
961   reg->ops_curr->opcode = opcode;
962 #endif
963 
964   return 0;
965 }
966 
967 static int compile_length_tree(Node* node, regex_t* reg);
968 static int compile_tree(Node* node, regex_t* reg, ParseEnv* env);
969 
970 
971 #define IS_NEED_STR_LEN_OP(op) \
972    ((op) == OP_STR_N    || (op) == OP_STR_MB2N ||\
973     (op) == OP_STR_MB3N || (op) == OP_STR_MBN)
974 
975 static int
select_str_opcode(int mb_len,int str_len)976 select_str_opcode(int mb_len, int str_len)
977 {
978   int op;
979 
980   switch (mb_len) {
981   case 1:
982     switch (str_len) {
983     case 1:  op = OP_STR_1; break;
984     case 2:  op = OP_STR_2; break;
985     case 3:  op = OP_STR_3; break;
986     case 4:  op = OP_STR_4; break;
987     case 5:  op = OP_STR_5; break;
988     default: op = OP_STR_N; break;
989     }
990     break;
991 
992   case 2:
993     switch (str_len) {
994     case 1:  op = OP_STR_MB2N1; break;
995     case 2:  op = OP_STR_MB2N2; break;
996     case 3:  op = OP_STR_MB2N3; break;
997     default: op = OP_STR_MB2N;  break;
998     }
999     break;
1000 
1001   case 3:
1002     op = OP_STR_MB3N;
1003     break;
1004 
1005   default:
1006     op = OP_STR_MBN;
1007     break;
1008   }
1009 
1010   return op;
1011 }
1012 
1013 static int
is_strict_real_node(Node * node)1014 is_strict_real_node(Node* node)
1015 {
1016   switch (NODE_TYPE(node)) {
1017   case NODE_STRING:
1018     {
1019       StrNode* sn = STR_(node);
1020       return (sn->end != sn->s);
1021     }
1022     break;
1023 
1024   case NODE_CCLASS:
1025   case NODE_CTYPE:
1026     return 1;
1027     break;
1028 
1029   default:
1030     return 0;
1031     break;
1032   }
1033 }
1034 
1035 static int
compile_quant_body_with_empty_check(QuantNode * qn,regex_t * reg,ParseEnv * env)1036 compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ParseEnv* env)
1037 {
1038   int r;
1039   int saved_num_empty_check;
1040   int emptiness;
1041   Node* body;
1042 
1043   body = NODE_BODY((Node* )qn);
1044   emptiness = qn->emptiness;
1045   saved_num_empty_check = reg->num_empty_check;
1046 
1047   if (emptiness != BODY_IS_NOT_EMPTY) {
1048     r = add_op(reg, OP_EMPTY_CHECK_START);
1049     if (r != 0) return r;
1050     COP(reg)->empty_check_start.mem = reg->num_empty_check; /* NULL CHECK ID */
1051     reg->num_empty_check++;
1052   }
1053 
1054   r = compile_tree(body, reg, env);
1055   if (r != 0) return r;
1056 
1057   if (emptiness != BODY_IS_NOT_EMPTY) {
1058     if (emptiness == BODY_MAY_BE_EMPTY)
1059       r = add_op(reg, OP_EMPTY_CHECK_END);
1060     else if (emptiness == BODY_MAY_BE_EMPTY_MEM) {
1061       if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0 && qn->empty_status_mem != 0) {
1062         r = add_op(reg, OP_EMPTY_CHECK_END_MEMST);
1063         if (r != 0) return r;
1064         COP(reg)->empty_check_end.empty_status_mem = qn->empty_status_mem;
1065       }
1066       else
1067         r = add_op(reg, OP_EMPTY_CHECK_END);
1068     }
1069 #ifdef USE_CALL
1070     else if (emptiness == BODY_MAY_BE_EMPTY_REC) {
1071       r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH);
1072       if (r != 0) return r;
1073       COP(reg)->empty_check_end.empty_status_mem = qn->empty_status_mem;
1074     }
1075 #endif
1076 
1077     if (r != 0) return r;
1078     COP(reg)->empty_check_end.mem = saved_num_empty_check; /* NULL CHECK ID */
1079   }
1080   return r;
1081 }
1082 
1083 #ifdef USE_CALL
1084 static int
compile_call(CallNode * node,regex_t * reg,ParseEnv * env)1085 compile_call(CallNode* node, regex_t* reg, ParseEnv* env)
1086 {
1087   int r;
1088   int offset;
1089 
1090   r = add_op(reg, OP_CALL);
1091   if (r != 0) return r;
1092 
1093   COP(reg)->call.addr = 0; /* dummy addr. */
1094 #ifdef ONIG_DEBUG_MATCH_COUNTER
1095   COP(reg)->call.called_mem = node->called_gnum;
1096 #endif
1097 
1098   offset = COP_CURR_OFFSET_BYTES(reg, call.addr);
1099   r = unset_addr_list_add(env->unset_addr_list, offset, NODE_CALL_BODY(node));
1100   return r;
1101 }
1102 #endif
1103 
1104 static int
compile_tree_n_times(Node * node,int n,regex_t * reg,ParseEnv * env)1105 compile_tree_n_times(Node* node, int n, regex_t* reg, ParseEnv* env)
1106 {
1107   int i, r;
1108 
1109   for (i = 0; i < n; i++) {
1110     r = compile_tree(node, reg, env);
1111     if (r != 0) return r;
1112   }
1113   return 0;
1114 }
1115 
1116 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED)1117 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
1118                           regex_t* reg ARG_UNUSED)
1119 {
1120   return 1;
1121 }
1122 
1123 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg)1124 add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg)
1125 {
1126   int op;
1127   int r;
1128   int byte_len;
1129   UChar* p;
1130   UChar* end;
1131 
1132   op = select_str_opcode(mb_len, str_len);
1133   r = add_op(reg, op);
1134   if (r != 0) return r;
1135 
1136   byte_len = mb_len * str_len;
1137   end = s + byte_len;
1138 
1139   if (op == OP_STR_MBN) {
1140     p = onigenc_strdup(reg->enc, s, end);
1141     CHECK_NULL_RETURN_MEMERR(p);
1142 
1143     COP(reg)->exact_len_n.len = mb_len;
1144     COP(reg)->exact_len_n.n   = str_len;
1145     COP(reg)->exact_len_n.s   = p;
1146   }
1147   else if (IS_NEED_STR_LEN_OP(op)) {
1148     p = onigenc_strdup(reg->enc, s, end);
1149     CHECK_NULL_RETURN_MEMERR(p);
1150     COP(reg)->exact_n.n = str_len;
1151     COP(reg)->exact_n.s = p;
1152   }
1153   else {
1154     xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s));
1155     xmemcpy(COP(reg)->exact.s, s, (size_t )byte_len);
1156   }
1157 
1158   return 0;
1159 }
1160 
1161 static int
compile_length_string_node(Node * node,regex_t * reg)1162 compile_length_string_node(Node* node, regex_t* reg)
1163 {
1164   int rlen, r, len, prev_len, slen;
1165   UChar *p, *prev;
1166   StrNode* sn;
1167   OnigEncoding enc = reg->enc;
1168 
1169   sn = STR_(node);
1170   if (sn->end <= sn->s)
1171     return 0;
1172 
1173   p = prev = sn->s;
1174   prev_len = enclen(enc, p);
1175   p += prev_len;
1176   slen = 1;
1177   rlen = 0;
1178 
1179   for (; p < sn->end; ) {
1180     len = enclen(enc, p);
1181     if (len == prev_len) {
1182       slen++;
1183     }
1184     else {
1185       r = add_compile_string_length(prev, prev_len, slen, reg);
1186       rlen += r;
1187       prev = p;
1188       slen = 1;
1189       prev_len = len;
1190     }
1191     p += len;
1192   }
1193 
1194   r = add_compile_string_length(prev, prev_len, slen, reg);
1195   rlen += r;
1196   return rlen;
1197 }
1198 
1199 static int
compile_length_string_crude_node(StrNode * sn,regex_t * reg)1200 compile_length_string_crude_node(StrNode* sn, regex_t* reg)
1201 {
1202   if (sn->end <= sn->s)
1203     return 0;
1204 
1205   return add_compile_string_length(sn->s, 1 /* sb */, (int )(sn->end - sn->s),
1206                                    reg);
1207 }
1208 
1209 static int
compile_string_node(Node * node,regex_t * reg)1210 compile_string_node(Node* node, regex_t* reg)
1211 {
1212   int r, len, prev_len, slen;
1213   UChar *p, *prev, *end;
1214   StrNode* sn;
1215   OnigEncoding enc = reg->enc;
1216 
1217   sn = STR_(node);
1218   if (sn->end <= sn->s)
1219     return 0;
1220 
1221   end = sn->end;
1222 
1223   p = prev = sn->s;
1224   prev_len = enclen(enc, p);
1225   p += prev_len;
1226   slen = 1;
1227 
1228   for (; p < end; ) {
1229     len = enclen(enc, p);
1230     if (len == prev_len) {
1231       slen++;
1232     }
1233     else {
1234       r = add_compile_string(prev, prev_len, slen, reg);
1235       if (r != 0) return r;
1236 
1237       prev  = p;
1238       slen  = 1;
1239       prev_len = len;
1240     }
1241 
1242     p += len;
1243   }
1244 
1245   return add_compile_string(prev, prev_len, slen, reg);
1246 }
1247 
1248 static int
compile_string_crude_node(StrNode * sn,regex_t * reg)1249 compile_string_crude_node(StrNode* sn, regex_t* reg)
1250 {
1251   if (sn->end <= sn->s)
1252     return 0;
1253 
1254   return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg);
1255 }
1256 
1257 static void*
set_multi_byte_cclass(BBuf * mbuf,regex_t * reg)1258 set_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
1259 {
1260   size_t len;
1261   void* p;
1262 
1263   len = (size_t )mbuf->used;
1264   p = xmalloc(len);
1265   if (IS_NULL(p)) return NULL;
1266 
1267   xmemcpy(p, mbuf->p, len);
1268   return p;
1269 }
1270 
1271 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)1272 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
1273 {
1274   return 1;
1275 }
1276 
1277 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)1278 compile_cclass_node(CClassNode* cc, regex_t* reg)
1279 {
1280   int r;
1281 
1282   if (IS_NULL(cc->mbuf)) {
1283     r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_NOT : OP_CCLASS);
1284     if (r != 0) return r;
1285 
1286     COP(reg)->cclass.bsp = xmalloc(SIZE_BITSET);
1287     CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass.bsp);
1288     xmemcpy(COP(reg)->cclass.bsp, cc->bs, SIZE_BITSET);
1289   }
1290   else {
1291     void* p;
1292 
1293     if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
1294       r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MB_NOT : OP_CCLASS_MB);
1295       if (r != 0) return r;
1296 
1297       p = set_multi_byte_cclass(cc->mbuf, reg);
1298       CHECK_NULL_RETURN_MEMERR(p);
1299       COP(reg)->cclass_mb.mb = p;
1300     }
1301     else {
1302       r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MIX_NOT : OP_CCLASS_MIX);
1303       if (r != 0) return r;
1304 
1305       COP(reg)->cclass_mix.bsp = xmalloc(SIZE_BITSET);
1306       CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass_mix.bsp);
1307       xmemcpy(COP(reg)->cclass_mix.bsp, cc->bs, SIZE_BITSET);
1308 
1309       p = set_multi_byte_cclass(cc->mbuf, reg);
1310       CHECK_NULL_RETURN_MEMERR(p);
1311       COP(reg)->cclass_mix.mb = p;
1312     }
1313   }
1314 
1315   return 0;
1316 }
1317 
1318 static void
set_addr_in_repeat_range(regex_t * reg)1319 set_addr_in_repeat_range(regex_t* reg)
1320 {
1321   int i;
1322 
1323   for (i = 0; i < reg->num_repeat; i++) {
1324     RepeatRange* p = reg->repeat_range + i;
1325     int offset = p->u.offset;
1326     p->u.pcode = reg->ops + offset;
1327   }
1328 }
1329 
1330 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper,int ops_index)1331 entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index)
1332 {
1333 #define REPEAT_RANGE_ALLOC  4
1334 
1335   RepeatRange* p;
1336 
1337   if (reg->repeat_range_alloc == 0) {
1338     p = (RepeatRange* )xmalloc(sizeof(RepeatRange) * REPEAT_RANGE_ALLOC);
1339     CHECK_NULL_RETURN_MEMERR(p);
1340     reg->repeat_range = p;
1341     reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
1342   }
1343   else if (reg->repeat_range_alloc <= id) {
1344     int n;
1345     n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
1346     p = (RepeatRange* )xrealloc(reg->repeat_range, sizeof(RepeatRange) * n);
1347     CHECK_NULL_RETURN_MEMERR(p);
1348     reg->repeat_range = p;
1349     reg->repeat_range_alloc = n;
1350   }
1351   else {
1352     p = reg->repeat_range;
1353   }
1354 
1355   p[id].lower    = lower;
1356   p[id].upper    = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper);
1357   p[id].u.offset = ops_index;
1358   return 0;
1359 }
1360 
1361 static int
compile_range_repeat_node(QuantNode * qn,int target_len,int emptiness,regex_t * reg,ParseEnv * env)1362 compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness,
1363                           regex_t* reg, ParseEnv* env)
1364 {
1365   int r;
1366   int num_repeat = reg->num_repeat++;
1367 
1368   r = add_op(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
1369   if (r != 0) return r;
1370 
1371   COP(reg)->repeat.id   = num_repeat;
1372   COP(reg)->repeat.addr = SIZE_INC + target_len + OPSIZE_REPEAT_INC;
1373 
1374   r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper,
1375                          COP_CURR_OFFSET(reg) + OPSIZE_REPEAT);
1376   if (r != 0) return r;
1377 
1378   r = compile_quant_body_with_empty_check(qn, reg, env);
1379   if (r != 0) return r;
1380 
1381   r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
1382   if (r != 0) return r;
1383 
1384   COP(reg)->repeat_inc.id = num_repeat;
1385   return r;
1386 }
1387 
1388 static int
is_anychar_infinite_greedy(QuantNode * qn)1389 is_anychar_infinite_greedy(QuantNode* qn)
1390 {
1391   if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) &&
1392       NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn)))
1393     return 1;
1394   else
1395     return 0;
1396 }
1397 
1398 #define QUANTIFIER_EXPAND_LIMIT_SIZE   10
1399 #define CKN_ON   (ckn > 0)
1400 
1401 static int
compile_length_quantifier_node(QuantNode * qn,regex_t * reg)1402 compile_length_quantifier_node(QuantNode* qn, regex_t* reg)
1403 {
1404   int len, mod_tlen;
1405   int infinite = IS_INFINITE_REPEAT(qn->upper);
1406   enum BodyEmptyType emptiness = qn->emptiness;
1407   int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1408 
1409   if (tlen < 0) return tlen;
1410   if (tlen == 0) return 0;
1411 
1412   /* anychar repeat */
1413   if (is_anychar_infinite_greedy(qn)) {
1414     if (qn->lower <= 1 ||
1415         len_multiply_cmp((OnigLen )tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) {
1416       if (IS_NOT_NULL(qn->next_head_exact))
1417         return OPSIZE_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
1418       else
1419         return OPSIZE_ANYCHAR_STAR + tlen * qn->lower;
1420     }
1421   }
1422 
1423   mod_tlen = tlen;
1424   if (emptiness != BODY_IS_NOT_EMPTY)
1425     mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END;
1426 
1427   if (infinite &&
1428       (qn->lower <= 1 ||
1429        len_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1430     if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1431       len = OPSIZE_JUMP;
1432     }
1433     else {
1434       len = tlen * qn->lower;
1435     }
1436 
1437     if (qn->greedy) {
1438 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1439       if (IS_NOT_NULL(qn->head_exact))
1440         len += OPSIZE_PUSH_OR_JUMP_EXACT1 + mod_tlen + OPSIZE_JUMP;
1441       else
1442 #endif
1443       if (IS_NOT_NULL(qn->next_head_exact))
1444         len += OPSIZE_PUSH_IF_PEEK_NEXT + mod_tlen + OPSIZE_JUMP;
1445       else
1446         len += OPSIZE_PUSH + mod_tlen + OPSIZE_JUMP;
1447     }
1448     else
1449       len += OPSIZE_JUMP + mod_tlen + OPSIZE_PUSH;
1450   }
1451   else if (qn->upper == 0) {
1452     if (qn->include_referred != 0) { /* /(?<n>..){0}/ */
1453       len = OPSIZE_JUMP + tlen;
1454     }
1455     else
1456       len = 0;
1457   }
1458   else if (!infinite && qn->greedy &&
1459            (qn->upper == 1 ||
1460             len_multiply_cmp((OnigLen )tlen + OPSIZE_PUSH, qn->upper,
1461                              QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1462     len = tlen * qn->lower;
1463     len += (OPSIZE_PUSH + tlen) * (qn->upper - qn->lower);
1464   }
1465   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1466     len = OPSIZE_PUSH + OPSIZE_JUMP + tlen;
1467   }
1468   else {
1469     len = OPSIZE_REPEAT_INC + mod_tlen + OPSIZE_REPEAT;
1470   }
1471 
1472   return len;
1473 }
1474 
1475 static int
compile_quantifier_node(QuantNode * qn,regex_t * reg,ParseEnv * env)1476 compile_quantifier_node(QuantNode* qn, regex_t* reg, ParseEnv* env)
1477 {
1478   int i, r, mod_tlen;
1479   int infinite = IS_INFINITE_REPEAT(qn->upper);
1480   enum BodyEmptyType emptiness = qn->emptiness;
1481   int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1482 
1483   if (tlen < 0) return tlen;
1484   if (tlen == 0) return 0;
1485 
1486   if (is_anychar_infinite_greedy(qn) &&
1487       (qn->lower <= 1 ||
1488        len_multiply_cmp((OnigLen )tlen, qn->lower,
1489                         QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1490     r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1491     if (r != 0) return r;
1492     if (IS_NOT_NULL(qn->next_head_exact)) {
1493       r = add_op(reg, NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)) ?
1494                  OP_ANYCHAR_ML_STAR_PEEK_NEXT : OP_ANYCHAR_STAR_PEEK_NEXT);
1495       if (r != 0) return r;
1496 
1497       COP(reg)->anychar_star_peek_next.c = STR_(qn->next_head_exact)->s[0];
1498       return 0;
1499     }
1500     else {
1501       r = add_op(reg, NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)) ?
1502                  OP_ANYCHAR_ML_STAR : OP_ANYCHAR_STAR);
1503       return r;
1504     }
1505   }
1506 
1507   mod_tlen = tlen;
1508   if (emptiness != BODY_IS_NOT_EMPTY)
1509     mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END;
1510 
1511   if (infinite &&
1512       (qn->lower <= 1 ||
1513        len_multiply_cmp((OnigLen )tlen, qn->lower,
1514                         QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1515     int addr;
1516 
1517     if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1518       r = add_op(reg, OP_JUMP);
1519       if (r != 0) return r;
1520       if (qn->greedy) {
1521 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1522         if (IS_NOT_NULL(qn->head_exact))
1523           COP(reg)->jump.addr = OPSIZE_PUSH_OR_JUMP_EXACT1 + SIZE_INC;
1524         else
1525 #endif
1526         if (IS_NOT_NULL(qn->next_head_exact))
1527           COP(reg)->jump.addr = OPSIZE_PUSH_IF_PEEK_NEXT + SIZE_INC;
1528         else
1529           COP(reg)->jump.addr = OPSIZE_PUSH + SIZE_INC;
1530       }
1531       else {
1532         COP(reg)->jump.addr = OPSIZE_JUMP + SIZE_INC;
1533       }
1534     }
1535     else {
1536       r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1537       if (r != 0) return r;
1538     }
1539 
1540     if (qn->greedy) {
1541 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1542       if (IS_NOT_NULL(qn->head_exact)) {
1543         r = add_op(reg, OP_PUSH_OR_JUMP_EXACT1);
1544         if (r != 0) return r;
1545         COP(reg)->push_or_jump_exact1.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1546         COP(reg)->push_or_jump_exact1.c    = STR_(qn->head_exact)->s[0];
1547 
1548         r = compile_quant_body_with_empty_check(qn, reg, env);
1549         if (r != 0) return r;
1550 
1551         addr = -(mod_tlen + (int )OPSIZE_PUSH_OR_JUMP_EXACT1);
1552       }
1553       else
1554 #endif
1555       if (IS_NOT_NULL(qn->next_head_exact)) {
1556         r = add_op(reg, OP_PUSH_IF_PEEK_NEXT);
1557         if (r != 0) return r;
1558         COP(reg)->push_if_peek_next.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1559         COP(reg)->push_if_peek_next.c    = STR_(qn->next_head_exact)->s[0];
1560 
1561         r = compile_quant_body_with_empty_check(qn, reg, env);
1562         if (r != 0) return r;
1563 
1564         addr = -(mod_tlen + (int )OPSIZE_PUSH_IF_PEEK_NEXT);
1565       }
1566       else {
1567         r = add_op(reg, OP_PUSH);
1568         if (r != 0) return r;
1569         COP(reg)->push.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1570 
1571         r = compile_quant_body_with_empty_check(qn, reg, env);
1572         if (r != 0) return r;
1573 
1574         addr = -(mod_tlen + (int )OPSIZE_PUSH);
1575       }
1576 
1577       r = add_op(reg, OP_JUMP);
1578       if (r != 0) return r;
1579       COP(reg)->jump.addr = addr;
1580     }
1581     else {
1582       r = add_op(reg, OP_JUMP);
1583       if (r != 0) return r;
1584       COP(reg)->jump.addr = mod_tlen + SIZE_INC;
1585 
1586       r = compile_quant_body_with_empty_check(qn, reg, env);
1587       if (r != 0) return r;
1588 
1589       r = add_op(reg, OP_PUSH);
1590       if (r != 0) return r;
1591       COP(reg)->push.addr = -mod_tlen;
1592     }
1593   }
1594   else if (qn->upper == 0) {
1595     if (qn->include_referred != 0) { /* /(?<n>..){0}/ */
1596       r = add_op(reg, OP_JUMP);
1597       if (r != 0) return r;
1598       COP(reg)->jump.addr = tlen + SIZE_INC;
1599 
1600       r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1601     }
1602     else {
1603       /* Nothing output */
1604       r = 0;
1605     }
1606   }
1607   else if (! infinite && qn->greedy &&
1608            (qn->upper == 1 ||
1609             len_multiply_cmp((OnigLen )tlen + OPSIZE_PUSH, qn->upper,
1610                              QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1611     int n = qn->upper - qn->lower;
1612 
1613     r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1614     if (r != 0) return r;
1615 
1616     for (i = 0; i < n; i++) {
1617       int v = onig_positive_int_multiply(n - i, tlen + OPSIZE_PUSH);
1618       if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1619 
1620       r = add_op(reg, OP_PUSH);
1621       if (r != 0) return r;
1622       COP(reg)->push.addr = v;
1623 
1624       r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1625       if (r != 0) return r;
1626     }
1627   }
1628   else if (! qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1629     r = add_op(reg, OP_PUSH);
1630     if (r != 0) return r;
1631     COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP;
1632 
1633     r = add_op(reg, OP_JUMP);
1634     if (r != 0) return r;
1635     COP(reg)->jump.addr = tlen + SIZE_INC;
1636 
1637     r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1638   }
1639   else {
1640     r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env);
1641   }
1642   return r;
1643 }
1644 
1645 static int
compile_length_option_node(BagNode * node,regex_t * reg)1646 compile_length_option_node(BagNode* node, regex_t* reg)
1647 {
1648   int tlen;
1649 
1650   tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1651 
1652   return tlen;
1653 }
1654 
1655 static int
compile_option_node(BagNode * node,regex_t * reg,ParseEnv * env)1656 compile_option_node(BagNode* node, regex_t* reg, ParseEnv* env)
1657 {
1658   int r;
1659 
1660   r = compile_tree(NODE_BAG_BODY(node), reg, env);
1661 
1662   return r;
1663 }
1664 
1665 static int
compile_length_bag_node(BagNode * node,regex_t * reg)1666 compile_length_bag_node(BagNode* node, regex_t* reg)
1667 {
1668   int len;
1669   int tlen;
1670 
1671   if (node->type == BAG_OPTION)
1672     return compile_length_option_node(node, reg);
1673 
1674   if (NODE_BAG_BODY(node)) {
1675     tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1676     if (tlen < 0) return tlen;
1677   }
1678   else
1679     tlen = 0;
1680 
1681   switch (node->type) {
1682   case BAG_MEMORY:
1683 #ifdef USE_CALL
1684 
1685     if (node->m.regnum == 0 && NODE_IS_CALLED(node)) {
1686       len = tlen + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN;
1687       return len;
1688     }
1689 
1690     if (NODE_IS_CALLED(node)) {
1691       len = OPSIZE_MEM_START_PUSH + tlen
1692         + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN;
1693       if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1694         len += (NODE_IS_RECURSION(node)
1695                 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH);
1696       else
1697         len += (NODE_IS_RECURSION(node)
1698                 ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END);
1699     }
1700     else if (NODE_IS_RECURSION(node)) {
1701       len = OPSIZE_MEM_START_PUSH;
1702       len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)
1703                      ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_REC);
1704     }
1705     else
1706 #endif
1707     {
1708       if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum))
1709         len = OPSIZE_MEM_START_PUSH;
1710       else
1711         len = OPSIZE_MEM_START;
1712 
1713       len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)
1714                      ? OPSIZE_MEM_END_PUSH : OPSIZE_MEM_END);
1715     }
1716     break;
1717 
1718   case BAG_STOP_BACKTRACK:
1719     if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1720       int v;
1721       QuantNode* qn;
1722 
1723       qn = QUANT_(NODE_BAG_BODY(node));
1724       tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1725       if (tlen < 0) return tlen;
1726 
1727       v = onig_positive_int_multiply(qn->lower, tlen);
1728       if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1729       len = v + OPSIZE_PUSH + tlen + OPSIZE_POP + OPSIZE_JUMP;
1730     }
1731     else {
1732       len = OPSIZE_MARK + tlen + OPSIZE_CUT_TO_MARK;
1733     }
1734     break;
1735 
1736   case BAG_IF_ELSE:
1737     {
1738       Node* cond = NODE_BAG_BODY(node);
1739       Node* Then = node->te.Then;
1740       Node* Else = node->te.Else;
1741 
1742       len = compile_length_tree(cond, reg);
1743       if (len < 0) return len;
1744       len += OPSIZE_PUSH + OPSIZE_MARK + OPSIZE_CUT_TO_MARK;
1745 
1746       if (IS_NOT_NULL(Then)) {
1747         tlen = compile_length_tree(Then, reg);
1748         if (tlen < 0) return tlen;
1749         len += tlen;
1750       }
1751 
1752       len += OPSIZE_JUMP + OPSIZE_CUT_TO_MARK;
1753 
1754       if (IS_NOT_NULL(Else)) {
1755         tlen = compile_length_tree(Else, reg);
1756         if (tlen < 0) return tlen;
1757         len += tlen;
1758       }
1759     }
1760     break;
1761 
1762   case BAG_OPTION:
1763     /* never come here, but set for escape warning */
1764     len = 0;
1765     break;
1766   }
1767 
1768   return len;
1769 }
1770 
1771 static int
compile_bag_memory_node(BagNode * node,regex_t * reg,ParseEnv * env)1772 compile_bag_memory_node(BagNode* node, regex_t* reg, ParseEnv* env)
1773 {
1774   int r;
1775 
1776 #ifdef USE_CALL
1777   if (NODE_IS_CALLED(node)) {
1778     int len;
1779 
1780     r = add_op(reg, OP_CALL);
1781     if (r != 0) return r;
1782 
1783     node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + OPSIZE_JUMP;
1784     NODE_STATUS_ADD(node, FIXED_ADDR);
1785     COP(reg)->call.addr = (int )node->m.called_addr;
1786 
1787     if (node->m.regnum == 0) {
1788       len = compile_length_tree(NODE_BAG_BODY(node), reg);
1789       len += OPSIZE_RETURN;
1790 
1791       r = add_op(reg, OP_JUMP);
1792       if (r != 0) return r;
1793       COP(reg)->jump.addr = len + SIZE_INC;
1794 
1795       r = compile_tree(NODE_BAG_BODY(node), reg, env);
1796       if (r != 0) return r;
1797 
1798       r = add_op(reg, OP_RETURN);
1799       return r;
1800     }
1801     else {
1802       len = compile_length_tree(NODE_BAG_BODY(node), reg);
1803       len += (OPSIZE_MEM_START_PUSH + OPSIZE_RETURN);
1804       if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1805         len += (NODE_IS_RECURSION(node)
1806                 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH);
1807       else
1808         len += (NODE_IS_RECURSION(node) ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END);
1809 
1810       r = add_op(reg, OP_JUMP);
1811       if (r != 0) return r;
1812       COP(reg)->jump.addr = len + SIZE_INC;
1813     }
1814   }
1815 #endif
1816 
1817   if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum))
1818     r = add_op(reg, OP_MEM_START_PUSH);
1819   else
1820     r = add_op(reg, OP_MEM_START);
1821   if (r != 0) return r;
1822   COP(reg)->memory_start.num = node->m.regnum;
1823 
1824   r = compile_tree(NODE_BAG_BODY(node), reg, env);
1825   if (r != 0) return r;
1826 
1827 #ifdef USE_CALL
1828   if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1829     r = add_op(reg, (NODE_IS_RECURSION(node)
1830                      ? OP_MEM_END_PUSH_REC : OP_MEM_END_PUSH));
1831   else
1832     r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEM_END_REC : OP_MEM_END));
1833   if (r != 0) return r;
1834   COP(reg)->memory_end.num = node->m.regnum;
1835 
1836   if (NODE_IS_CALLED(node)) {
1837     r = add_op(reg, OP_RETURN);
1838   }
1839 #else
1840   if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1841     r = add_op(reg, OP_MEM_END_PUSH);
1842   else
1843     r = add_op(reg, OP_MEM_END);
1844   if (r != 0) return r;
1845   COP(reg)->memory_end.num = node->m.regnum;
1846 #endif
1847 
1848   return r;
1849 }
1850 
1851 static int
compile_bag_node(BagNode * node,regex_t * reg,ParseEnv * env)1852 compile_bag_node(BagNode* node, regex_t* reg, ParseEnv* env)
1853 {
1854   int r, len;
1855 
1856   switch (node->type) {
1857   case BAG_MEMORY:
1858     r = compile_bag_memory_node(node, reg, env);
1859     break;
1860 
1861   case BAG_OPTION:
1862     r = compile_option_node(node, reg, env);
1863     break;
1864 
1865   case BAG_STOP_BACKTRACK:
1866     if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1867       QuantNode* qn = QUANT_(NODE_BAG_BODY(node));
1868       r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1869       if (r != 0) return r;
1870 
1871       len = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1872       if (len < 0) return len;
1873 
1874       r = add_op(reg, OP_PUSH);
1875       if (r != 0) return r;
1876       COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP + OPSIZE_JUMP;
1877 
1878       r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1879       if (r != 0) return r;
1880       r = add_op(reg, OP_POP);
1881       if (r != 0) return r;
1882 
1883       r = add_op(reg, OP_JUMP);
1884       if (r != 0) return r;
1885       COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP);
1886     }
1887     else {
1888       MemNumType mid;
1889 
1890       ID_ENTRY(env, mid);
1891       r = add_op(reg, OP_MARK);
1892       if (r != 0) return r;
1893       COP(reg)->mark.id = mid;
1894       COP(reg)->mark.save_pos = 0;
1895 
1896       r = compile_tree(NODE_BAG_BODY(node), reg, env);
1897       if (r != 0) return r;
1898       r = add_op(reg, OP_CUT_TO_MARK);
1899       if (r != 0) return r;
1900       COP(reg)->cut_to_mark.id = mid;
1901       COP(reg)->cut_to_mark.restore_pos = 0;
1902     }
1903     break;
1904 
1905   case BAG_IF_ELSE:
1906     {
1907       int cond_len, then_len, else_len, jump_len;
1908       MemNumType mid;
1909       Node* cond = NODE_BAG_BODY(node);
1910       Node* Then = node->te.Then;
1911       Node* Else = node->te.Else;
1912 
1913       ID_ENTRY(env, mid);
1914 
1915       r = add_op(reg, OP_MARK);
1916       if (r != 0) return r;
1917       COP(reg)->mark.id = mid;
1918       COP(reg)->mark.save_pos = 0;
1919 
1920       cond_len = compile_length_tree(cond, reg);
1921       if (cond_len < 0) return cond_len;
1922       if (IS_NOT_NULL(Then)) {
1923         then_len = compile_length_tree(Then, reg);
1924         if (then_len < 0) return then_len;
1925       }
1926       else
1927         then_len = 0;
1928 
1929       jump_len = cond_len + then_len + OPSIZE_CUT_TO_MARK + OPSIZE_JUMP;
1930 
1931       r = add_op(reg, OP_PUSH);
1932       if (r != 0) return r;
1933       COP(reg)->push.addr = SIZE_INC + jump_len;
1934 
1935       r = compile_tree(cond, reg, env);
1936       if (r != 0) return r;
1937       r = add_op(reg, OP_CUT_TO_MARK);
1938       if (r != 0) return r;
1939       COP(reg)->cut_to_mark.id = mid;
1940       COP(reg)->cut_to_mark.restore_pos = 0;
1941 
1942       if (IS_NOT_NULL(Then)) {
1943         r = compile_tree(Then, reg, env);
1944         if (r != 0) return r;
1945       }
1946 
1947       if (IS_NOT_NULL(Else)) {
1948         else_len = compile_length_tree(Else, reg);
1949         if (else_len < 0) return else_len;
1950       }
1951       else
1952         else_len = 0;
1953 
1954       r = add_op(reg, OP_JUMP);
1955       if (r != 0) return r;
1956       COP(reg)->jump.addr = OPSIZE_CUT_TO_MARK + else_len + SIZE_INC;
1957 
1958       r = add_op(reg, OP_CUT_TO_MARK);
1959       if (r != 0) return r;
1960       COP(reg)->cut_to_mark.id = mid;
1961       COP(reg)->cut_to_mark.restore_pos = 0;
1962 
1963       if (IS_NOT_NULL(Else)) {
1964         r = compile_tree(Else, reg, env);
1965       }
1966     }
1967     break;
1968   }
1969 
1970   return r;
1971 }
1972 
1973 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1974 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1975 {
1976   int len;
1977   int tlen = 0;
1978 
1979   if (IS_NOT_NULL(NODE_ANCHOR_BODY(node))) {
1980     tlen = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1981     if (tlen < 0) return tlen;
1982   }
1983 
1984   switch (node->type) {
1985   case ANCR_PREC_READ:
1986     len = OPSIZE_MARK + tlen + OPSIZE_CUT_TO_MARK;
1987     break;
1988   case ANCR_PREC_READ_NOT:
1989     len = OPSIZE_PUSH + OPSIZE_MARK + tlen + OPSIZE_POP_TO_MARK + OPSIZE_POP + OPSIZE_FAIL;
1990     break;
1991   case ANCR_LOOK_BEHIND:
1992     if (node->char_min_len == node->char_max_len)
1993       len = OPSIZE_MARK + OPSIZE_STEP_BACK_START + tlen + OPSIZE_CUT_TO_MARK;
1994     else {
1995       len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_UPDATE_VAR + OPSIZE_FAIL + OPSIZE_JUMP + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_CUT_TO_MARK + OPSIZE_UPDATE_VAR;
1996 
1997       if (IS_NOT_NULL(node->lead_node)) {
1998         int llen = compile_length_tree(node->lead_node, reg);
1999         if (llen < 0) return llen;
2000 
2001         len += OPSIZE_MOVE + llen;
2002       }
2003     }
2004     break;
2005   case ANCR_LOOK_BEHIND_NOT:
2006     if (node->char_min_len == node->char_max_len)
2007       len = OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + tlen + OPSIZE_POP_TO_MARK + OPSIZE_FAIL + OPSIZE_POP;
2008     else {
2009       len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_POP;
2010 
2011       if (IS_NOT_NULL(node->lead_node)) {
2012         int llen = compile_length_tree(node->lead_node, reg);
2013         if (llen < 0) return llen;
2014 
2015         len += OPSIZE_MOVE + llen;
2016       }
2017     }
2018     break;
2019 
2020   case ANCR_WORD_BOUNDARY:
2021   case ANCR_NO_WORD_BOUNDARY:
2022 #ifdef USE_WORD_BEGIN_END
2023   case ANCR_WORD_BEGIN:
2024   case ANCR_WORD_END:
2025 #endif
2026     len = OPSIZE_WORD_BOUNDARY;
2027     break;
2028 
2029   case ANCR_TEXT_SEGMENT_BOUNDARY:
2030   case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
2031     len = SIZE_OPCODE;
2032     break;
2033 
2034   default:
2035     len = SIZE_OPCODE;
2036     break;
2037   }
2038 
2039   return len;
2040 }
2041 
2042 static int
compile_anchor_look_behind_node(AnchorNode * node,regex_t * reg,ParseEnv * env)2043 compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ParseEnv* env)
2044 {
2045   int r;
2046 
2047   if (node->char_min_len == node->char_max_len) {
2048     MemNumType mid;
2049 
2050     ID_ENTRY(env, mid);
2051     r = add_op(reg, OP_MARK);
2052     if (r != 0) return r;
2053     COP(reg)->mark.id = mid;
2054     COP(reg)->mark.save_pos = FALSE;
2055 
2056     r = add_op(reg, OP_STEP_BACK_START);
2057     if (r != 0) return r;
2058     COP(reg)->step_back_start.initial   = node->char_min_len;
2059     COP(reg)->step_back_start.remaining = 0;
2060     COP(reg)->step_back_start.addr      = 1;
2061 
2062     r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2063     if (r != 0) return r;
2064 
2065     r = add_op(reg, OP_CUT_TO_MARK);
2066     if (r != 0) return r;
2067     COP(reg)->cut_to_mark.id = mid;
2068     COP(reg)->cut_to_mark.restore_pos = FALSE;
2069   }
2070   else {
2071     MemNumType mid1, mid2;
2072     OnigLen diff;
2073 
2074     if (IS_NOT_NULL(node->lead_node)) {
2075       MinMaxCharLen ci;
2076 
2077       r = node_char_len(node->lead_node, reg, &ci, env);
2078       if (r < 0) return r;
2079       r = add_op(reg, OP_MOVE);
2080       if (r != 0) return r;
2081       COP(reg)->move.n = -((RelPositionType )ci.min);
2082       r = compile_tree(node->lead_node, reg, env);
2083       if (r != 0) return r;
2084     }
2085 
2086     ID_ENTRY(env, mid1);
2087     r = add_op(reg, OP_SAVE_VAL);
2088     if (r != 0) return r;
2089     COP(reg)->save_val.type = SAVE_RIGHT_RANGE;
2090     COP(reg)->save_val.id   = mid1;
2091 
2092     r = add_op(reg, OP_UPDATE_VAR);
2093     if (r != 0) return r;
2094     COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_TO_S;
2095 
2096     ID_ENTRY(env, mid2);
2097     r = add_op(reg, OP_MARK);
2098     if (r != 0) return r;
2099     COP(reg)->mark.id = mid2;
2100     COP(reg)->mark.save_pos = FALSE;
2101 
2102     r = add_op(reg, OP_PUSH);
2103     if (r != 0) return r;
2104     COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP;
2105 
2106     r = add_op(reg, OP_JUMP);
2107     if (r != 0) return r;
2108     COP(reg)->jump.addr = SIZE_INC + OPSIZE_UPDATE_VAR + OPSIZE_FAIL;
2109 
2110     r = add_op(reg, OP_UPDATE_VAR);
2111     if (r != 0) return r;
2112     COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2113     COP(reg)->update_var.id    = mid1;
2114     COP(reg)->update_var.clear = FALSE;
2115     r = add_op(reg, OP_FAIL);
2116     if (r != 0) return r;
2117 
2118     r = add_op(reg, OP_STEP_BACK_START);
2119     if (r != 0) return r;
2120 
2121     if (node->char_max_len != INFINITE_LEN)
2122       diff = node->char_max_len - node->char_min_len;
2123     else
2124       diff = INFINITE_LEN;
2125 
2126     COP(reg)->step_back_start.initial   = node->char_min_len;
2127     COP(reg)->step_back_start.remaining = diff;
2128     COP(reg)->step_back_start.addr      = 2;
2129 
2130     r = add_op(reg, OP_STEP_BACK_NEXT);
2131     if (r != 0) return r;
2132 
2133     r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2134     if (r != 0) return r;
2135 
2136     r = add_op(reg, OP_CHECK_POSITION);
2137     if (r != 0) return r;
2138     COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE;
2139 
2140     r = add_op(reg, OP_CUT_TO_MARK);
2141     if (r != 0) return r;
2142     COP(reg)->cut_to_mark.id = mid2;
2143     COP(reg)->cut_to_mark.restore_pos = FALSE;
2144 
2145     r = add_op(reg, OP_UPDATE_VAR);
2146     if (r != 0) return r;
2147     COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2148     COP(reg)->update_var.id    = mid1;
2149     COP(reg)->update_var.clear = TRUE;
2150   }
2151 
2152   return r;
2153 }
2154 
2155 static int
compile_anchor_look_behind_not_node(AnchorNode * node,regex_t * reg,ParseEnv * env)2156 compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg,
2157                                     ParseEnv* env)
2158 {
2159   int r;
2160   int len;
2161 
2162   len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
2163 
2164   if (node->char_min_len == node->char_max_len) {
2165     MemNumType mid;
2166 
2167     ID_ENTRY(env, mid);
2168     r = add_op(reg, OP_MARK);
2169     if (r != 0) return r;
2170     COP(reg)->mark.id = mid;
2171     COP(reg)->mark.save_pos = FALSE;
2172 
2173     r = add_op(reg, OP_PUSH);
2174     if (r != 0) return r;
2175     COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + len + OPSIZE_POP_TO_MARK + OPSIZE_FAIL;
2176 
2177     r = add_op(reg, OP_STEP_BACK_START);
2178     if (r != 0) return r;
2179     COP(reg)->step_back_start.initial   = node->char_min_len;
2180     COP(reg)->step_back_start.remaining = 0;
2181     COP(reg)->step_back_start.addr      = 1;
2182 
2183     r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2184     if (r != 0) return r;
2185 
2186     r = add_op(reg, OP_POP_TO_MARK);
2187     if (r != 0) return r;
2188     COP(reg)->pop_to_mark.id = mid;
2189     r = add_op(reg, OP_FAIL);
2190     if (r != 0) return r;
2191     r = add_op(reg, OP_POP);
2192   }
2193   else {
2194     MemNumType mid1, mid2;
2195     OnigLen diff;
2196 
2197     ID_ENTRY(env, mid1);
2198     r = add_op(reg, OP_SAVE_VAL);
2199     if (r != 0) return r;
2200     COP(reg)->save_val.type = SAVE_RIGHT_RANGE;
2201     COP(reg)->save_val.id   = mid1;
2202 
2203     r = add_op(reg, OP_UPDATE_VAR);
2204     if (r != 0) return r;
2205     COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_TO_S;
2206 
2207     ID_ENTRY(env, mid2);
2208     r = add_op(reg, OP_MARK);
2209     if (r != 0) return r;
2210     COP(reg)->mark.id = mid2;
2211     COP(reg)->mark.save_pos = FALSE;
2212 
2213     r = add_op(reg, OP_PUSH);
2214     if (r != 0) return r;
2215     COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + len + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL;
2216 
2217     if (IS_NOT_NULL(node->lead_node)) {
2218       int clen;
2219       MinMaxCharLen ci;
2220 
2221       clen = compile_length_tree(node->lead_node, reg);
2222       COP(reg)->push.addr += OPSIZE_MOVE + clen;
2223 
2224       r = node_char_len(node->lead_node, reg, &ci, env);
2225       if (r < 0) return r;
2226       r = add_op(reg, OP_MOVE);
2227       if (r != 0) return r;
2228       COP(reg)->move.n = -((RelPositionType )ci.min);
2229 
2230       r = compile_tree(node->lead_node, reg, env);
2231       if (r != 0) return r;
2232     }
2233 
2234     r = add_op(reg, OP_STEP_BACK_START);
2235     if (r != 0) return r;
2236 
2237     if (node->char_max_len != INFINITE_LEN)
2238       diff = node->char_max_len - node->char_min_len;
2239     else
2240       diff = INFINITE_LEN;
2241 
2242     COP(reg)->step_back_start.initial   = node->char_min_len;
2243     COP(reg)->step_back_start.remaining = diff;
2244     COP(reg)->step_back_start.addr      = 2;
2245 
2246     r = add_op(reg, OP_STEP_BACK_NEXT);
2247     if (r != 0) return r;
2248 
2249     r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2250     if (r != 0) return r;
2251 
2252     r = add_op(reg, OP_CHECK_POSITION);
2253     if (r != 0) return r;
2254     COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE;
2255 
2256     r = add_op(reg, OP_POP_TO_MARK);
2257     if (r != 0) return r;
2258     COP(reg)->pop_to_mark.id = mid2;
2259 
2260     r = add_op(reg, OP_UPDATE_VAR);
2261     if (r != 0) return r;
2262     COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2263     COP(reg)->update_var.id   = mid1;
2264     COP(reg)->update_var.clear = FALSE;
2265 
2266     r = add_op(reg, OP_POP); /* pop save val */
2267     if (r != 0) return r;
2268     r = add_op(reg, OP_FAIL);
2269     if (r != 0) return r;
2270 
2271     r = add_op(reg, OP_UPDATE_VAR);
2272     if (r != 0) return r;
2273     COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2274     COP(reg)->update_var.id   = mid1;
2275     COP(reg)->update_var.clear = FALSE;
2276 
2277     r = add_op(reg, OP_POP); /* pop mark */
2278     if (r != 0) return r;
2279     r = add_op(reg, OP_POP); /* pop save val */
2280   }
2281 
2282   return r;
2283 }
2284 
2285 static int
compile_anchor_node(AnchorNode * node,regex_t * reg,ParseEnv * env)2286 compile_anchor_node(AnchorNode* node, regex_t* reg, ParseEnv* env)
2287 {
2288   int r, len;
2289   enum OpCode op;
2290   MemNumType mid;
2291 
2292   switch (node->type) {
2293   case ANCR_BEGIN_BUF:      r = add_op(reg, OP_BEGIN_BUF);      break;
2294   case ANCR_END_BUF:        r = add_op(reg, OP_END_BUF);        break;
2295   case ANCR_BEGIN_LINE:     r = add_op(reg, OP_BEGIN_LINE);     break;
2296   case ANCR_END_LINE:       r = add_op(reg, OP_END_LINE);       break;
2297   case ANCR_SEMI_END_BUF:   r = add_op(reg, OP_SEMI_END_BUF);   break;
2298   case ANCR_BEGIN_POSITION:
2299     r = add_op(reg, OP_CHECK_POSITION);
2300     if (r != 0) return r;
2301     COP(reg)->check_position.type = CHECK_POSITION_SEARCH_START;
2302     break;
2303 
2304   case ANCR_WORD_BOUNDARY:
2305     op = OP_WORD_BOUNDARY;
2306   word:
2307     r = add_op(reg, op);
2308     if (r != 0) return r;
2309     COP(reg)->word_boundary.mode = (ModeType )node->ascii_mode;
2310     break;
2311 
2312   case ANCR_NO_WORD_BOUNDARY:
2313     op = OP_NO_WORD_BOUNDARY; goto word;
2314     break;
2315 #ifdef USE_WORD_BEGIN_END
2316   case ANCR_WORD_BEGIN:
2317     op = OP_WORD_BEGIN; goto word;
2318     break;
2319   case ANCR_WORD_END:
2320     op = OP_WORD_END; goto word;
2321     break;
2322 #endif
2323 
2324   case ANCR_TEXT_SEGMENT_BOUNDARY:
2325   case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
2326     {
2327       enum TextSegmentBoundaryType type;
2328 
2329       r = add_op(reg, OP_TEXT_SEGMENT_BOUNDARY);
2330       if (r != 0) return r;
2331 
2332       type = EXTENDED_GRAPHEME_CLUSTER_BOUNDARY;
2333 #ifdef USE_UNICODE_WORD_BREAK
2334       if (NODE_IS_TEXT_SEGMENT_WORD(node))
2335         type = WORD_BOUNDARY;
2336 #endif
2337 
2338       COP(reg)->text_segment_boundary.type = type;
2339       COP(reg)->text_segment_boundary.not =
2340         (node->type == ANCR_NO_TEXT_SEGMENT_BOUNDARY ? 1 : 0);
2341     }
2342     break;
2343 
2344   case ANCR_PREC_READ:
2345     {
2346       ID_ENTRY(env, mid);
2347       r = add_op(reg, OP_MARK);
2348       if (r != 0) return r;
2349       COP(reg)->mark.id = mid;
2350       COP(reg)->mark.save_pos = TRUE;
2351 
2352       r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2353       if (r != 0) return r;
2354 
2355       r = add_op(reg, OP_CUT_TO_MARK);
2356       if (r != 0) return r;
2357       COP(reg)->cut_to_mark.id = mid;
2358       COP(reg)->cut_to_mark.restore_pos = TRUE;
2359     }
2360     break;
2361 
2362   case ANCR_PREC_READ_NOT:
2363     {
2364       len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
2365       if (len < 0) return len;
2366 
2367       ID_ENTRY(env, mid);
2368       r = add_op(reg, OP_PUSH);
2369       if (r != 0) return r;
2370       COP(reg)->push.addr = SIZE_INC + OPSIZE_MARK + len +
2371                             OPSIZE_POP_TO_MARK + OPSIZE_POP + OPSIZE_FAIL;
2372 
2373       r = add_op(reg, OP_MARK);
2374       if (r != 0) return r;
2375       COP(reg)->mark.id = mid;
2376       COP(reg)->mark.save_pos = FALSE;
2377 
2378       r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2379       if (r != 0) return r;
2380 
2381       r = add_op(reg, OP_POP_TO_MARK);
2382       if (r != 0) return r;
2383       COP(reg)->pop_to_mark.id = mid;
2384 
2385       r = add_op(reg, OP_POP);
2386       if (r != 0) return r;
2387       r = add_op(reg, OP_FAIL);
2388     }
2389     break;
2390 
2391   case ANCR_LOOK_BEHIND:
2392     r = compile_anchor_look_behind_node(node, reg, env);
2393     break;
2394 
2395   case ANCR_LOOK_BEHIND_NOT:
2396     r = compile_anchor_look_behind_not_node(node, reg, env);
2397     break;
2398 
2399   default:
2400     return ONIGERR_TYPE_BUG;
2401     break;
2402   }
2403 
2404   return r;
2405 }
2406 
2407 static int
compile_gimmick_node(GimmickNode * node,regex_t * reg)2408 compile_gimmick_node(GimmickNode* node, regex_t* reg)
2409 {
2410   int r = 0;
2411 
2412   switch (node->type) {
2413   case GIMMICK_FAIL:
2414     r = add_op(reg, OP_FAIL);
2415     break;
2416 
2417   case GIMMICK_SAVE:
2418     r = add_op(reg, OP_SAVE_VAL);
2419     if (r != 0) return r;
2420     COP(reg)->save_val.type = node->detail_type;
2421     COP(reg)->save_val.id   = node->id;
2422     break;
2423 
2424   case GIMMICK_UPDATE_VAR:
2425     r = add_op(reg, OP_UPDATE_VAR);
2426     if (r != 0) return r;
2427     COP(reg)->update_var.type = node->detail_type;
2428     COP(reg)->update_var.id   = node->id;
2429     COP(reg)->update_var.clear = FALSE;
2430     break;
2431 
2432 #ifdef USE_CALLOUT
2433   case GIMMICK_CALLOUT:
2434     switch (node->detail_type) {
2435     case ONIG_CALLOUT_OF_CONTENTS:
2436     case ONIG_CALLOUT_OF_NAME:
2437       {
2438         if (node->detail_type == ONIG_CALLOUT_OF_NAME) {
2439           r = add_op(reg, OP_CALLOUT_NAME);
2440           if (r != 0) return r;
2441           COP(reg)->callout_name.id  = node->id;
2442           COP(reg)->callout_name.num = node->num;
2443         }
2444         else {
2445           r = add_op(reg, OP_CALLOUT_CONTENTS);
2446           if (r != 0) return r;
2447           COP(reg)->callout_contents.num = node->num;
2448         }
2449       }
2450       break;
2451 
2452     default:
2453       r = ONIGERR_TYPE_BUG;
2454       break;
2455     }
2456 #endif
2457   }
2458 
2459   return r;
2460 }
2461 
2462 static int
compile_length_gimmick_node(GimmickNode * node,regex_t * reg)2463 compile_length_gimmick_node(GimmickNode* node, regex_t* reg)
2464 {
2465   int len;
2466 
2467   switch (node->type) {
2468   case GIMMICK_FAIL:
2469     len = OPSIZE_FAIL;
2470     break;
2471 
2472   case GIMMICK_SAVE:
2473     len = OPSIZE_SAVE_VAL;
2474     break;
2475 
2476   case GIMMICK_UPDATE_VAR:
2477     len = OPSIZE_UPDATE_VAR;
2478     break;
2479 
2480 #ifdef USE_CALLOUT
2481   case GIMMICK_CALLOUT:
2482     switch (node->detail_type) {
2483     case ONIG_CALLOUT_OF_CONTENTS:
2484       len = OPSIZE_CALLOUT_CONTENTS;
2485       break;
2486     case ONIG_CALLOUT_OF_NAME:
2487       len = OPSIZE_CALLOUT_NAME;
2488       break;
2489 
2490     default:
2491       len = ONIGERR_TYPE_BUG;
2492       break;
2493     }
2494     break;
2495 #endif
2496   }
2497 
2498   return len;
2499 }
2500 
2501 static int
compile_length_tree(Node * node,regex_t * reg)2502 compile_length_tree(Node* node, regex_t* reg)
2503 {
2504   int len, r;
2505 
2506   switch (NODE_TYPE(node)) {
2507   case NODE_LIST:
2508     len = 0;
2509     do {
2510       r = compile_length_tree(NODE_CAR(node), reg);
2511       if (r < 0) return r;
2512       len += r;
2513     } while (IS_NOT_NULL(node = NODE_CDR(node)));
2514     r = len;
2515     break;
2516 
2517   case NODE_ALT:
2518     {
2519       int n;
2520 
2521       n = r = 0;
2522       do {
2523         r += compile_length_tree(NODE_CAR(node), reg);
2524         n++;
2525       } while (IS_NOT_NULL(node = NODE_CDR(node)));
2526       r += (OPSIZE_PUSH + OPSIZE_JUMP) * (n - 1);
2527     }
2528     break;
2529 
2530   case NODE_STRING:
2531     if (NODE_STRING_IS_CRUDE(node))
2532       r = compile_length_string_crude_node(STR_(node), reg);
2533     else
2534       r = compile_length_string_node(node, reg);
2535     break;
2536 
2537   case NODE_CCLASS:
2538     r = compile_length_cclass_node(CCLASS_(node), reg);
2539     break;
2540 
2541   case NODE_CTYPE:
2542     r = SIZE_OPCODE;
2543     break;
2544 
2545   case NODE_BACKREF:
2546     r = OPSIZE_BACKREF;
2547     break;
2548 
2549 #ifdef USE_CALL
2550   case NODE_CALL:
2551     r = OPSIZE_CALL;
2552     break;
2553 #endif
2554 
2555   case NODE_QUANT:
2556     r = compile_length_quantifier_node(QUANT_(node), reg);
2557     break;
2558 
2559   case NODE_BAG:
2560     r = compile_length_bag_node(BAG_(node), reg);
2561     break;
2562 
2563   case NODE_ANCHOR:
2564     r = compile_length_anchor_node(ANCHOR_(node), reg);
2565     break;
2566 
2567   case NODE_GIMMICK:
2568     r = compile_length_gimmick_node(GIMMICK_(node), reg);
2569     break;
2570 
2571   default:
2572     return ONIGERR_TYPE_BUG;
2573     break;
2574   }
2575 
2576   return r;
2577 }
2578 
2579 static int
compile_tree(Node * node,regex_t * reg,ParseEnv * env)2580 compile_tree(Node* node, regex_t* reg, ParseEnv* env)
2581 {
2582   int n, len, pos, r = 0;
2583 
2584   switch (NODE_TYPE(node)) {
2585   case NODE_LIST:
2586     do {
2587       r = compile_tree(NODE_CAR(node), reg, env);
2588     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2589     break;
2590 
2591   case NODE_ALT:
2592     {
2593       Node* x = node;
2594       len = 0;
2595       do {
2596         len += compile_length_tree(NODE_CAR(x), reg);
2597         if (IS_NOT_NULL(NODE_CDR(x))) {
2598           len += OPSIZE_PUSH + OPSIZE_JUMP;
2599         }
2600       } while (IS_NOT_NULL(x = NODE_CDR(x)));
2601       pos = COP_CURR_OFFSET(reg) + 1 + len;  /* goal position */
2602 
2603       do {
2604         len = compile_length_tree(NODE_CAR(node), reg);
2605         if (IS_NOT_NULL(NODE_CDR(node))) {
2606           enum OpCode push = NODE_IS_SUPER(node) ? OP_PUSH_SUPER : OP_PUSH;
2607           r = add_op(reg, push);
2608           if (r != 0) break;
2609           COP(reg)->push.addr = SIZE_INC + len + OPSIZE_JUMP;
2610         }
2611         r = compile_tree(NODE_CAR(node), reg, env);
2612         if (r != 0) break;
2613         if (IS_NOT_NULL(NODE_CDR(node))) {
2614           len = pos - (COP_CURR_OFFSET(reg) + 1);
2615           r = add_op(reg, OP_JUMP);
2616           if (r != 0) break;
2617           COP(reg)->jump.addr = len;
2618         }
2619       } while (IS_NOT_NULL(node = NODE_CDR(node)));
2620     }
2621     break;
2622 
2623   case NODE_STRING:
2624     if (NODE_STRING_IS_CRUDE(node))
2625       r = compile_string_crude_node(STR_(node), reg);
2626     else
2627       r = compile_string_node(node, reg);
2628     break;
2629 
2630   case NODE_CCLASS:
2631     r = compile_cclass_node(CCLASS_(node), reg);
2632     break;
2633 
2634   case NODE_CTYPE:
2635     {
2636       int op;
2637 
2638       switch (CTYPE_(node)->ctype) {
2639       case CTYPE_ANYCHAR:
2640         r = add_op(reg, NODE_IS_MULTILINE(node) ? OP_ANYCHAR_ML : OP_ANYCHAR);
2641         break;
2642 
2643       case ONIGENC_CTYPE_WORD:
2644         if (CTYPE_(node)->ascii_mode == 0) {
2645           op = CTYPE_(node)->not != 0 ? OP_NO_WORD : OP_WORD;
2646         }
2647         else {
2648           op = CTYPE_(node)->not != 0 ? OP_NO_WORD_ASCII : OP_WORD_ASCII;
2649         }
2650         r = add_op(reg, op);
2651         break;
2652 
2653       default:
2654         return ONIGERR_TYPE_BUG;
2655         break;
2656       }
2657     }
2658     break;
2659 
2660   case NODE_BACKREF:
2661     {
2662       BackRefNode* br = BACKREF_(node);
2663 
2664       if (NODE_IS_CHECKER(node)) {
2665 #ifdef USE_BACKREF_WITH_LEVEL
2666         if (NODE_IS_NEST_LEVEL(node)) {
2667           r = add_op(reg, OP_BACKREF_CHECK_WITH_LEVEL);
2668           if (r != 0) return r;
2669           COP(reg)->backref_general.nest_level = br->nest_level;
2670         }
2671         else
2672 #endif
2673           {
2674             r = add_op(reg, OP_BACKREF_CHECK);
2675             if (r != 0) return r;
2676           }
2677         goto add_bacref_mems;
2678       }
2679       else {
2680 #ifdef USE_BACKREF_WITH_LEVEL
2681         if (NODE_IS_NEST_LEVEL(node)) {
2682           if (NODE_IS_IGNORECASE(node))
2683             r = add_op(reg, OP_BACKREF_WITH_LEVEL_IC);
2684           else
2685             r = add_op(reg, OP_BACKREF_WITH_LEVEL);
2686 
2687           if (r != 0) return r;
2688           COP(reg)->backref_general.nest_level = br->nest_level;
2689           goto add_bacref_mems;
2690         }
2691         else
2692 #endif
2693         if (br->back_num == 1) {
2694           n = br->back_static[0];
2695           if (NODE_IS_IGNORECASE(node)) {
2696             r = add_op(reg, OP_BACKREF_N_IC);
2697             if (r != 0) return r;
2698             COP(reg)->backref_n.n1 = n;
2699           }
2700           else {
2701             switch (n) {
2702             case 1:  r = add_op(reg, OP_BACKREF1); break;
2703             case 2:  r = add_op(reg, OP_BACKREF2); break;
2704             default:
2705               r = add_op(reg, OP_BACKREF_N);
2706               if (r != 0) return r;
2707               COP(reg)->backref_n.n1 = n;
2708               break;
2709             }
2710           }
2711         }
2712         else {
2713           int num;
2714           int* p;
2715 
2716           r = add_op(reg, NODE_IS_IGNORECASE(node) ?
2717                      OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI);
2718           if (r != 0) return r;
2719 
2720         add_bacref_mems:
2721           num = br->back_num;
2722           COP(reg)->backref_general.num = num;
2723           if (num == 1) {
2724             COP(reg)->backref_general.n1 = br->back_static[0];
2725           }
2726           else {
2727             int i, j;
2728             MemNumType* ns;
2729 
2730             ns = xmalloc(sizeof(MemNumType) * num);
2731             CHECK_NULL_RETURN_MEMERR(ns);
2732             COP(reg)->backref_general.ns = ns;
2733             p = BACKREFS_P(br);
2734             for (i = num - 1, j = 0; i >= 0; i--, j++) {
2735               ns[j] = p[i];
2736             }
2737           }
2738         }
2739       }
2740     }
2741     break;
2742 
2743 #ifdef USE_CALL
2744   case NODE_CALL:
2745     r = compile_call(CALL_(node), reg, env);
2746     break;
2747 #endif
2748 
2749   case NODE_QUANT:
2750     r = compile_quantifier_node(QUANT_(node), reg, env);
2751     break;
2752 
2753   case NODE_BAG:
2754     r = compile_bag_node(BAG_(node), reg, env);
2755     break;
2756 
2757   case NODE_ANCHOR:
2758     r = compile_anchor_node(ANCHOR_(node), reg, env);
2759     break;
2760 
2761   case NODE_GIMMICK:
2762     r = compile_gimmick_node(GIMMICK_(node), reg);
2763     break;
2764 
2765   default:
2766 #ifdef ONIG_DEBUG
2767     fprintf(DBGFP, "compile_tree: undefined node type %d\n", NODE_TYPE(node));
2768 #endif
2769     break;
2770   }
2771 
2772   return r;
2773 }
2774 
2775 static int
make_named_capture_number_map(Node ** plink,GroupNumMap * map,int * counter)2776 make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter)
2777 {
2778   int r;
2779   Node* node = *plink;
2780 
2781   switch (NODE_TYPE(node)) {
2782   case NODE_LIST:
2783   case NODE_ALT:
2784     do {
2785       r = make_named_capture_number_map(&(NODE_CAR(node)), map, counter);
2786     } while (r >= 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2787     if (r < 0) return r;
2788     break;
2789 
2790   case NODE_QUANT:
2791     {
2792       Node** ptarget = &(NODE_BODY(node));
2793       r = make_named_capture_number_map(ptarget, map, counter);
2794       if (r < 0) return r;
2795       if (r == 1 && NODE_TYPE(*ptarget) == NODE_QUANT) {
2796         return onig_reduce_nested_quantifier(node);
2797       }
2798     }
2799     break;
2800 
2801   case NODE_BAG:
2802     {
2803       BagNode* en = BAG_(node);
2804       if (en->type == BAG_MEMORY) {
2805         if (NODE_IS_NAMED_GROUP(node)) {
2806           (*counter)++;
2807           map[en->m.regnum].new_val = *counter;
2808           en->m.regnum = *counter;
2809           r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter);
2810           if (r < 0) return r;
2811         }
2812         else {
2813           *plink = NODE_BODY(node);
2814           NODE_BODY(node) = NULL_NODE;
2815           onig_node_free(node);
2816           r = make_named_capture_number_map(plink, map, counter);
2817           if (r < 0) return r;
2818           return 1;
2819         }
2820       }
2821       else if (en->type == BAG_IF_ELSE) {
2822         r = make_named_capture_number_map(&(NODE_BAG_BODY(en)), map, counter);
2823         if (r < 0) return r;
2824         if (IS_NOT_NULL(en->te.Then)) {
2825           r = make_named_capture_number_map(&(en->te.Then), map, counter);
2826           if (r < 0) return r;
2827         }
2828         if (IS_NOT_NULL(en->te.Else)) {
2829           r = make_named_capture_number_map(&(en->te.Else), map, counter);
2830           if (r < 0) return r;
2831         }
2832       }
2833       else {
2834         r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter);
2835         if (r < 0) return r;
2836       }
2837     }
2838     break;
2839 
2840   case NODE_ANCHOR:
2841     if (IS_NOT_NULL(NODE_BODY(node))) {
2842       r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter);
2843       if (r < 0) return r;
2844     }
2845     break;
2846 
2847   default:
2848     break;
2849   }
2850 
2851   return 0;
2852 }
2853 
2854 static int
renumber_backref_node(Node * node,GroupNumMap * map)2855 renumber_backref_node(Node* node, GroupNumMap* map)
2856 {
2857   int i, pos, n, old_num;
2858   int *backs;
2859   BackRefNode* bn = BACKREF_(node);
2860 
2861   if (! NODE_IS_BY_NAME(node))
2862     return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2863 
2864   old_num = bn->back_num;
2865   if (IS_NULL(bn->back_dynamic))
2866     backs = bn->back_static;
2867   else
2868     backs = bn->back_dynamic;
2869 
2870   for (i = 0, pos = 0; i < old_num; i++) {
2871     n = map[backs[i]].new_val;
2872     if (n > 0) {
2873       backs[pos] = n;
2874       pos++;
2875     }
2876   }
2877 
2878   bn->back_num = pos;
2879   return 0;
2880 }
2881 
2882 static int
renumber_backref_traverse(Node * node,GroupNumMap * map)2883 renumber_backref_traverse(Node* node, GroupNumMap* map)
2884 {
2885   int r = 0;
2886 
2887   switch (NODE_TYPE(node)) {
2888   case NODE_LIST:
2889   case NODE_ALT:
2890     do {
2891       r = renumber_backref_traverse(NODE_CAR(node), map);
2892     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2893     break;
2894 
2895   case NODE_QUANT:
2896     r = renumber_backref_traverse(NODE_BODY(node), map);
2897     break;
2898 
2899   case NODE_BAG:
2900     {
2901       BagNode* en = BAG_(node);
2902 
2903       r = renumber_backref_traverse(NODE_BODY(node), map);
2904       if (r != 0) return r;
2905 
2906       if (en->type == BAG_IF_ELSE) {
2907         if (IS_NOT_NULL(en->te.Then)) {
2908           r = renumber_backref_traverse(en->te.Then, map);
2909           if (r != 0) return r;
2910         }
2911         if (IS_NOT_NULL(en->te.Else)) {
2912           r = renumber_backref_traverse(en->te.Else, map);
2913           if (r != 0) return r;
2914         }
2915       }
2916     }
2917     break;
2918 
2919   case NODE_BACKREF:
2920     r = renumber_backref_node(node, map);
2921     break;
2922 
2923   case NODE_ANCHOR:
2924     if (IS_NOT_NULL(NODE_BODY(node)))
2925       r = renumber_backref_traverse(NODE_BODY(node), map);
2926     break;
2927 
2928   default:
2929     break;
2930   }
2931 
2932   return r;
2933 }
2934 
2935 static int
numbered_ref_check(Node * node)2936 numbered_ref_check(Node* node)
2937 {
2938   int r = 0;
2939 
2940   switch (NODE_TYPE(node)) {
2941   case NODE_LIST:
2942   case NODE_ALT:
2943     do {
2944       r = numbered_ref_check(NODE_CAR(node));
2945     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2946     break;
2947 
2948   case NODE_ANCHOR:
2949     if (IS_NULL(NODE_BODY(node)))
2950       break;
2951     /* fall */
2952   case NODE_QUANT:
2953     r = numbered_ref_check(NODE_BODY(node));
2954     break;
2955 
2956   case NODE_BAG:
2957     {
2958       BagNode* en = BAG_(node);
2959 
2960       r = numbered_ref_check(NODE_BODY(node));
2961       if (r != 0) return r;
2962 
2963       if (en->type == BAG_IF_ELSE) {
2964         if (IS_NOT_NULL(en->te.Then)) {
2965           r = numbered_ref_check(en->te.Then);
2966           if (r != 0) return r;
2967         }
2968         if (IS_NOT_NULL(en->te.Else)) {
2969           r = numbered_ref_check(en->te.Else);
2970           if (r != 0) return r;
2971         }
2972       }
2973     }
2974 
2975     break;
2976 
2977   case NODE_BACKREF:
2978     if (! NODE_IS_BY_NAME(node))
2979       return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2980     break;
2981 
2982   default:
2983     break;
2984   }
2985 
2986   return r;
2987 }
2988 
2989 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ParseEnv * env)2990 disable_noname_group_capture(Node** root, regex_t* reg, ParseEnv* env)
2991 {
2992   int r, i, pos, counter;
2993   MemStatusType loc;
2994   GroupNumMap* map;
2995 
2996   map = (GroupNumMap* )xalloca(sizeof(GroupNumMap) * (env->num_mem + 1));
2997   CHECK_NULL_RETURN_MEMERR(map);
2998   for (i = 1; i <= env->num_mem; i++) {
2999     map[i].new_val = 0;
3000   }
3001   counter = 0;
3002   r = make_named_capture_number_map(root, map, &counter);
3003   if (r < 0) return r;
3004 
3005   r = renumber_backref_traverse(*root, map);
3006   if (r != 0) return r;
3007 
3008   for (i = 1, pos = 1; i <= env->num_mem; i++) {
3009     if (map[i].new_val > 0) {
3010       PARSEENV_MEMENV(env)[pos] = PARSEENV_MEMENV(env)[i];
3011       pos++;
3012     }
3013   }
3014 
3015   loc = env->cap_history;
3016   MEM_STATUS_CLEAR(env->cap_history);
3017   for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
3018     if (MEM_STATUS_AT(loc, i)) {
3019       MEM_STATUS_ON_SIMPLE(env->cap_history, map[i].new_val);
3020     }
3021   }
3022 
3023   env->num_mem = env->num_named;
3024   reg->num_mem = env->num_named;
3025 
3026   return onig_renumber_name_table(reg, map);
3027 }
3028 
#ifdef USE_CALL
/*
 * Patch the absolute addresses recorded in uslist.
 *
 * For every entry, the called_addr of the target group is written into
 * reg->ops at the entry's byte offset.
 *
 * Returns 0 on success, or ONIGERR_PARSER_BUG when a target is marked
 * as called but has no fixed address.
 */
static int
fix_unset_addr_list(UnsetAddrList* uslist, regex_t* reg)
{
  int idx;

  for (idx = 0; idx < uslist->num; idx++) {
    Node* target = uslist->us[idx].target;
    AbsAddrType* slot;

    if (! NODE_IS_FIXED_ADDR(target)) {
      if (NODE_IS_CALLED(target))
        return ONIGERR_PARSER_BUG;

      /* CASE: the target has no call address.
         ex. /((|a\g<1>)(.){0}){0}\g<3>/
         group-1 is not called, but its call is compiled into bytecode
         because group-3 is referred to from outside.
      */
      continue;
    }

    slot  = (AbsAddrType* )((char* )reg->ops + uslist->us[idx].offset);
    *slot = BAG_(target)->m.called_addr;
  }

  return 0;
}
#endif
3062 
3063 /* x is not included y ==>  1 : 0 */
3064 static int
is_exclusive(Node * x,Node * y,regex_t * reg)3065 is_exclusive(Node* x, Node* y, regex_t* reg)
3066 {
3067   int i, len;
3068   OnigCodePoint code;
3069   UChar *p;
3070   NodeType ytype;
3071 
3072  retry:
3073   ytype = NODE_TYPE(y);
3074   switch (NODE_TYPE(x)) {
3075   case NODE_CTYPE:
3076     {
3077       if (CTYPE_(x)->ctype == CTYPE_ANYCHAR ||
3078           CTYPE_(y)->ctype == CTYPE_ANYCHAR)
3079         break;
3080 
3081       switch (ytype) {
3082       case NODE_CTYPE:
3083         if (CTYPE_(y)->ctype == CTYPE_(x)->ctype &&
3084             CTYPE_(y)->not   != CTYPE_(x)->not &&
3085             CTYPE_(y)->ascii_mode == CTYPE_(x)->ascii_mode)
3086           return 1;
3087         else
3088           return 0;
3089         break;
3090 
3091       case NODE_CCLASS:
3092       swap:
3093         {
3094           Node* tmp;
3095           tmp = x; x = y; y = tmp;
3096           goto retry;
3097         }
3098         break;
3099 
3100       case NODE_STRING:
3101         goto swap;
3102         break;
3103 
3104       default:
3105         break;
3106       }
3107     }
3108     break;
3109 
3110   case NODE_CCLASS:
3111     {
3112       int range;
3113       CClassNode* xc = CCLASS_(x);
3114 
3115       switch (ytype) {
3116       case NODE_CTYPE:
3117         switch (CTYPE_(y)->ctype) {
3118         case CTYPE_ANYCHAR:
3119           return 0;
3120           break;
3121 
3122         case ONIGENC_CTYPE_WORD:
3123           if (CTYPE_(y)->not == 0) {
3124             if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
3125               range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
3126               for (i = 0; i < range; i++) {
3127                 if (BITSET_AT(xc->bs, i)) {
3128                   if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
3129                 }
3130               }
3131               return 1;
3132             }
3133             return 0;
3134           }
3135           else {
3136             if (IS_NOT_NULL(xc->mbuf)) return 0;
3137             if (IS_NCCLASS_NOT(xc)) return 0;
3138 
3139             range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
3140             for (i = 0; i < range; i++) {
3141               if (! ONIGENC_IS_CODE_WORD(reg->enc, i)) {
3142                 if (BITSET_AT(xc->bs, i))
3143                   return 0;
3144               }
3145             }
3146             for (i = range; i < SINGLE_BYTE_SIZE; i++) {
3147               if (BITSET_AT(xc->bs, i)) return 0;
3148             }
3149             return 1;
3150           }
3151           break;
3152 
3153         default:
3154           break;
3155         }
3156         break;
3157 
3158       case NODE_CCLASS:
3159         {
3160           int v;
3161           CClassNode* yc = CCLASS_(y);
3162 
3163           for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
3164             v = BITSET_AT(xc->bs, i);
3165             if ((v != 0 && !IS_NCCLASS_NOT(xc)) || (v == 0 && IS_NCCLASS_NOT(xc))) {
3166               v = BITSET_AT(yc->bs, i);
3167               if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
3168                   (v == 0 && IS_NCCLASS_NOT(yc)))
3169                 return 0;
3170             }
3171           }
3172           if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
3173               (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
3174             return 1;
3175           return 0;
3176         }
3177         break;
3178 
3179       case NODE_STRING:
3180         goto swap;
3181         break;
3182 
3183       default:
3184         break;
3185       }
3186     }
3187     break;
3188 
3189   case NODE_STRING:
3190     {
3191       StrNode* xs = STR_(x);
3192 
3193       if (NODE_STRING_LEN(x) == 0)
3194         break;
3195 
3196       switch (ytype) {
3197       case NODE_CTYPE:
3198         switch (CTYPE_(y)->ctype) {
3199         case CTYPE_ANYCHAR:
3200           break;
3201 
3202         case ONIGENC_CTYPE_WORD:
3203           if (CTYPE_(y)->ascii_mode == 0) {
3204             if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
3205               return CTYPE_(y)->not;
3206             else
3207               return !(CTYPE_(y)->not);
3208           }
3209           else {
3210             if (ONIGENC_IS_MBC_WORD_ASCII(reg->enc, xs->s, xs->end))
3211               return CTYPE_(y)->not;
3212             else
3213               return !(CTYPE_(y)->not);
3214           }
3215           break;
3216         default:
3217           break;
3218         }
3219         break;
3220 
3221       case NODE_CCLASS:
3222         {
3223           CClassNode* cc = CCLASS_(y);
3224 
3225           code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
3226                                      xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
3227           return onig_is_code_in_cc(reg->enc, code, cc) == 0;
3228         }
3229         break;
3230 
3231       case NODE_STRING:
3232         {
3233           UChar *q;
3234           StrNode* ys = STR_(y);
3235 
3236           len = NODE_STRING_LEN(x);
3237           if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y);
3238 
3239           for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
3240             if (*p != *q) return 1;
3241           }
3242         }
3243         break;
3244 
3245       default:
3246         break;
3247       }
3248     }
3249     break;
3250 
3251   default:
3252     break;
3253   }
3254 
3255   return 0;
3256 }
3257 
3258 static Node*
get_tree_head_literal(Node * node,int exact,regex_t * reg)3259 get_tree_head_literal(Node* node, int exact, regex_t* reg)
3260 {
3261   Node* n = NULL_NODE;
3262 
3263   switch (NODE_TYPE(node)) {
3264   case NODE_BACKREF:
3265   case NODE_ALT:
3266 #ifdef USE_CALL
3267   case NODE_CALL:
3268 #endif
3269     break;
3270 
3271   case NODE_CTYPE:
3272     if (CTYPE_(node)->ctype == CTYPE_ANYCHAR)
3273       break;
3274     /* fall */
3275   case NODE_CCLASS:
3276     if (exact == 0) {
3277       n = node;
3278     }
3279     break;
3280 
3281   case NODE_LIST:
3282     n = get_tree_head_literal(NODE_CAR(node), exact, reg);
3283     break;
3284 
3285   case NODE_STRING:
3286     {
3287       StrNode* sn = STR_(node);
3288 
3289       if (sn->end <= sn->s)
3290         break;
3291 
3292       if (exact == 0 || !NODE_IS_REAL_IGNORECASE(node)) {
3293         n = node;
3294       }
3295     }
3296     break;
3297 
3298   case NODE_QUANT:
3299     {
3300       QuantNode* qn = QUANT_(node);
3301       if (qn->lower > 0) {
3302         if (IS_NOT_NULL(qn->head_exact))
3303           n = qn->head_exact;
3304         else
3305           n = get_tree_head_literal(NODE_BODY(node), exact, reg);
3306       }
3307     }
3308     break;
3309 
3310   case NODE_BAG:
3311     {
3312       BagNode* en = BAG_(node);
3313       switch (en->type) {
3314       case BAG_OPTION:
3315       case BAG_MEMORY:
3316       case BAG_STOP_BACKTRACK:
3317       case BAG_IF_ELSE:
3318         n = get_tree_head_literal(NODE_BODY(node), exact, reg);
3319         break;
3320       }
3321     }
3322     break;
3323 
3324   case NODE_ANCHOR:
3325     if (ANCHOR_(node)->type == ANCR_PREC_READ)
3326       n = get_tree_head_literal(NODE_BODY(node), exact, reg);
3327     break;
3328 
3329   case NODE_GIMMICK:
3330   default:
3331     break;
3332   }
3333 
3334   return n;
3335 }
3336 
/*
 * Result codes returned by get_tree_tail_literal():
 *   GET_VALUE_NONE   - no tail node can be determined; the search stops.
 *   GET_VALUE_IGNORE - the node contributes nothing (empty string, anchor,
 *                      gimmick); in a list the preceding element is then
 *                      examined instead.
 *   GET_VALUE_FOUND  - a tail node was stored through the out parameter.
 */
enum GetValue {
  GET_VALUE_NONE   = -1,
  GET_VALUE_IGNORE =  0,
  GET_VALUE_FOUND  =  1
};
3342 
3343 static int
get_tree_tail_literal(Node * node,Node ** rnode,regex_t * reg)3344 get_tree_tail_literal(Node* node, Node** rnode, regex_t* reg)
3345 {
3346   int r;
3347 
3348   switch (NODE_TYPE(node)) {
3349   case NODE_LIST:
3350     if (IS_NULL(NODE_CDR(node))) {
3351       r = get_tree_tail_literal(NODE_CAR(node), rnode, reg);
3352     }
3353     else {
3354       r = get_tree_tail_literal(NODE_CDR(node), rnode, reg);
3355       if (r == GET_VALUE_IGNORE) {
3356         r = get_tree_tail_literal(NODE_CAR(node), rnode, reg);
3357       }
3358     }
3359     break;
3360 
3361 #ifdef USE_CALL
3362   case NODE_CALL:
3363     r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3364     break;
3365 #endif
3366 
3367   case NODE_CTYPE:
3368     if (CTYPE_(node)->ctype == CTYPE_ANYCHAR) {
3369       r = GET_VALUE_NONE;
3370       break;
3371     }
3372     /* fall */
3373   case NODE_CCLASS:
3374     *rnode = node;
3375     r = GET_VALUE_FOUND;
3376     break;
3377 
3378   case NODE_STRING:
3379     {
3380       StrNode* sn = STR_(node);
3381 
3382       if (sn->end <= sn->s) {
3383         r = GET_VALUE_IGNORE;
3384         break;
3385       }
3386 
3387       if (NODE_IS_REAL_IGNORECASE(node)) {
3388         r = GET_VALUE_NONE;
3389         break;
3390       }
3391 
3392       *rnode = node;
3393       r = GET_VALUE_FOUND;
3394     }
3395     break;
3396 
3397   case NODE_QUANT:
3398     {
3399       QuantNode* qn = QUANT_(node);
3400       if (qn->lower != 0) {
3401         r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3402       }
3403       else
3404         r = GET_VALUE_NONE;
3405     }
3406     break;
3407 
3408   case NODE_BAG:
3409     {
3410       BagNode* en = BAG_(node);
3411 
3412       if (en->type == BAG_MEMORY) {
3413         if (NODE_IS_MARK1(node))
3414           r = GET_VALUE_NONE;
3415         else {
3416           NODE_STATUS_ADD(node, MARK1);
3417           r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3418           NODE_STATUS_REMOVE(node, MARK1);
3419         }
3420       }
3421       else {
3422         r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3423       }
3424     }
3425     break;
3426 
3427   case NODE_ANCHOR:
3428   case NODE_GIMMICK:
3429     r = GET_VALUE_IGNORE;
3430     break;
3431 
3432   case NODE_ALT:
3433   case NODE_BACKREF:
3434   default:
3435     r = GET_VALUE_NONE;
3436     break;
3437   }
3438 
3439   return r;
3440 }
3441 
3442 static int
check_called_node_in_look_behind(Node * node,int not)3443 check_called_node_in_look_behind(Node* node, int not)
3444 {
3445   int r;
3446 
3447   r = 0;
3448 
3449   switch (NODE_TYPE(node)) {
3450   case NODE_LIST:
3451   case NODE_ALT:
3452     do {
3453       r = check_called_node_in_look_behind(NODE_CAR(node), not);
3454     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3455     break;
3456 
3457   case NODE_QUANT:
3458     r = check_called_node_in_look_behind(NODE_BODY(node), not);
3459     break;
3460 
3461   case NODE_BAG:
3462     {
3463       BagNode* en = BAG_(node);
3464 
3465       if (en->type == BAG_MEMORY) {
3466         if (NODE_IS_MARK1(node))
3467           return 0;
3468         else {
3469           NODE_STATUS_ADD(node, MARK1);
3470           r = check_called_node_in_look_behind(NODE_BODY(node), not);
3471           NODE_STATUS_REMOVE(node, MARK1);
3472         }
3473       }
3474       else {
3475         r = check_called_node_in_look_behind(NODE_BODY(node), not);
3476         if (r == 0 && en->type == BAG_IF_ELSE) {
3477           if (IS_NOT_NULL(en->te.Then)) {
3478             r = check_called_node_in_look_behind(en->te.Then, not);
3479             if (r != 0) break;
3480           }
3481           if (IS_NOT_NULL(en->te.Else)) {
3482             r = check_called_node_in_look_behind(en->te.Else, not);
3483           }
3484         }
3485       }
3486     }
3487     break;
3488 
3489   case NODE_ANCHOR:
3490     if (IS_NOT_NULL(NODE_BODY(node)))
3491       r = check_called_node_in_look_behind(NODE_BODY(node), not);
3492     break;
3493 
3494   case NODE_GIMMICK:
3495     if (NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) != 0)
3496       return 1;
3497     break;
3498 
3499   default:
3500     break;
3501   }
3502 
3503   return r;
3504 }
3505 
/* allowed node types in look-behind */
/* Bit set compared against NODE_TYPE2BIT(type) in
   check_node_in_look_behind(); a node type outside it is rejected. */
#define ALLOWED_TYPE_IN_LB \
  ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \
  | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \
  | NODE_BIT_CALL | NODE_BIT_BACKREF | NODE_BIT_GIMMICK)

/* Bag types accepted in a look-behind, as (1 << bag type) bits.
   check_node_in_look_behind() selects between the two masks with its
   `not` argument (index 0: plain, index 1: _NOT).  The _NOT variant
   leaves out BAG_MEMORY. */
#define ALLOWED_BAG_IN_LB       ( 1<<BAG_MEMORY | 1<<BAG_OPTION | 1<<BAG_STOP_BACKTRACK | 1<<BAG_IF_ELSE )
#define ALLOWED_BAG_IN_LB_NOT   ( 1<<BAG_OPTION | 1<<BAG_STOP_BACKTRACK | 1<<BAG_IF_ELSE )

/* Anchor types accepted in a look-behind (selected when `not` is 0). */
#define ALLOWED_ANCHOR_IN_LB \
  ( ANCR_LOOK_BEHIND | ANCR_BEGIN_LINE | ANCR_END_LINE | ANCR_BEGIN_BUF \
  | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY | ANCR_NO_WORD_BOUNDARY \
  | ANCR_WORD_BEGIN | ANCR_WORD_END \
  | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )

/* Anchor types accepted when `not` is 1; the same set as above plus
   ANCR_LOOK_BEHIND_NOT. */
#define ALLOWED_ANCHOR_IN_LB_NOT \
  ( ANCR_LOOK_BEHIND | ANCR_LOOK_BEHIND_NOT | ANCR_BEGIN_LINE \
  | ANCR_END_LINE | ANCR_BEGIN_BUF | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY \
  | ANCR_NO_WORD_BOUNDARY | ANCR_WORD_BEGIN | ANCR_WORD_END \
  | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
3526 
3527 
3528 static int
check_node_in_look_behind(Node * node,int not,int * used)3529 check_node_in_look_behind(Node* node, int not, int* used)
3530 {
3531   static unsigned int
3532     bag_mask[2] = { ALLOWED_BAG_IN_LB, ALLOWED_BAG_IN_LB_NOT };
3533 
3534   static unsigned int
3535     anchor_mask[2] = { ALLOWED_ANCHOR_IN_LB, ALLOWED_ANCHOR_IN_LB_NOT };
3536 
3537   NodeType type;
3538   int r = 0;
3539 
3540   type = NODE_TYPE(node);
3541   if ((NODE_TYPE2BIT(type) & ALLOWED_TYPE_IN_LB) == 0)
3542     return 1;
3543 
3544   switch (type) {
3545   case NODE_LIST:
3546   case NODE_ALT:
3547     do {
3548       r = check_node_in_look_behind(NODE_CAR(node), not, used);
3549     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3550     break;
3551 
3552   case NODE_QUANT:
3553     r = check_node_in_look_behind(NODE_BODY(node), not, used);
3554     break;
3555 
3556   case NODE_BAG:
3557     {
3558       BagNode* en = BAG_(node);
3559       if (((1<<en->type) & bag_mask[not]) == 0)
3560         return 1;
3561 
3562       r = check_node_in_look_behind(NODE_BODY(node), not, used);
3563       if (r != 0) break;
3564 
3565       if (en->type == BAG_MEMORY) {
3566         if (NODE_IS_BACKREF(node) || NODE_IS_CALLED(node)
3567          || NODE_IS_REFERENCED(node))
3568           *used = TRUE;
3569       }
3570       else if (en->type == BAG_IF_ELSE) {
3571         if (IS_NOT_NULL(en->te.Then)) {
3572           r = check_node_in_look_behind(en->te.Then, not, used);
3573           if (r != 0) break;
3574         }
3575         if (IS_NOT_NULL(en->te.Else)) {
3576           r = check_node_in_look_behind(en->te.Else, not, used);
3577         }
3578       }
3579     }
3580     break;
3581 
3582   case NODE_ANCHOR:
3583     type = ANCHOR_(node)->type;
3584     if ((type & anchor_mask[not]) == 0)
3585       return 1;
3586 
3587     if (IS_NOT_NULL(NODE_BODY(node)))
3588       r = check_node_in_look_behind(NODE_BODY(node), not, used);
3589     break;
3590 
3591   case NODE_GIMMICK:
3592     if (NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) != 0)
3593       return 1;
3594     break;
3595 
3596   case NODE_CALL:
3597     r = check_called_node_in_look_behind(NODE_BODY(node), not);
3598     break;
3599 
3600   default:
3601     break;
3602   }
3603   return r;
3604 }
3605 
3606 static OnigLen
node_min_byte_len(Node * node,ParseEnv * env)3607 node_min_byte_len(Node* node, ParseEnv* env)
3608 {
3609   OnigLen len;
3610   OnigLen tmin;
3611 
3612   len = 0;
3613   switch (NODE_TYPE(node)) {
3614   case NODE_BACKREF:
3615     if (! NODE_IS_CHECKER(node)) {
3616       int i;
3617       int* backs;
3618       MemEnv* mem_env = PARSEENV_MEMENV(env);
3619       BackRefNode* br = BACKREF_(node);
3620       if (NODE_IS_RECURSION(node)) break;
3621 
3622       backs = BACKREFS_P(br);
3623       len = node_min_byte_len(mem_env[backs[0]].mem_node, env);
3624       for (i = 1; i < br->back_num; i++) {
3625         tmin = node_min_byte_len(mem_env[backs[i]].mem_node, env);
3626         if (len > tmin) len = tmin;
3627       }
3628     }
3629     break;
3630 
3631 #ifdef USE_CALL
3632   case NODE_CALL:
3633     {
3634       Node* t = NODE_BODY(node);
3635       if (NODE_IS_FIXED_MIN(t))
3636         len = BAG_(t)->min_len;
3637       else
3638         len = node_min_byte_len(t, env);
3639     }
3640     break;
3641 #endif
3642 
3643   case NODE_LIST:
3644     do {
3645       tmin = node_min_byte_len(NODE_CAR(node), env);
3646       len = distance_add(len, tmin);
3647     } while (IS_NOT_NULL(node = NODE_CDR(node)));
3648     break;
3649 
3650   case NODE_ALT:
3651     {
3652       Node *x, *y;
3653       y = node;
3654       do {
3655         x = NODE_CAR(y);
3656         tmin = node_min_byte_len(x, env);
3657         if (y == node) len = tmin;
3658         else if (len > tmin) len = tmin;
3659       } while (IS_NOT_NULL(y = NODE_CDR(y)));
3660     }
3661     break;
3662 
3663   case NODE_STRING:
3664     {
3665       StrNode* sn = STR_(node);
3666       len = (int )(sn->end - sn->s);
3667     }
3668     break;
3669 
3670   case NODE_CTYPE:
3671   case NODE_CCLASS:
3672     len = ONIGENC_MBC_MINLEN(env->enc);
3673     break;
3674 
3675   case NODE_QUANT:
3676     {
3677       QuantNode* qn = QUANT_(node);
3678 
3679       if (qn->lower > 0) {
3680         len = node_min_byte_len(NODE_BODY(node), env);
3681         len = distance_multiply(len, qn->lower);
3682       }
3683     }
3684     break;
3685 
3686   case NODE_BAG:
3687     {
3688       BagNode* en = BAG_(node);
3689       switch (en->type) {
3690       case BAG_MEMORY:
3691         if (NODE_IS_FIXED_MIN(node))
3692           len = en->min_len;
3693         else {
3694           if (NODE_IS_MARK1(node))
3695             len = 0;  /* recursive */
3696           else {
3697             NODE_STATUS_ADD(node, MARK1);
3698             len = node_min_byte_len(NODE_BODY(node), env);
3699             NODE_STATUS_REMOVE(node, MARK1);
3700 
3701             en->min_len = len;
3702             NODE_STATUS_ADD(node, FIXED_MIN);
3703           }
3704         }
3705         break;
3706 
3707       case BAG_OPTION:
3708       case BAG_STOP_BACKTRACK:
3709         len = node_min_byte_len(NODE_BODY(node), env);
3710         break;
3711       case BAG_IF_ELSE:
3712         {
3713           OnigLen elen;
3714 
3715           len = node_min_byte_len(NODE_BODY(node), env);
3716           if (IS_NOT_NULL(en->te.Then))
3717             len += node_min_byte_len(en->te.Then, env);
3718           if (IS_NOT_NULL(en->te.Else))
3719             elen = node_min_byte_len(en->te.Else, env);
3720           else elen = 0;
3721 
3722           if (elen < len) len = elen;
3723         }
3724         break;
3725       }
3726     }
3727     break;
3728 
3729   case NODE_GIMMICK:
3730     {
3731       GimmickNode* g = GIMMICK_(node);
3732       if (g->type == GIMMICK_FAIL) {
3733         len = INFINITE_LEN;
3734         break;
3735       }
3736     }
3737     /* fall */
3738   case NODE_ANCHOR:
3739   default:
3740     break;
3741   }
3742 
3743   return len;
3744 }
3745 
3746 static int
check_backrefs(Node * node,ParseEnv * env)3747 check_backrefs(Node* node, ParseEnv* env)
3748 {
3749   int r;
3750 
3751   switch (NODE_TYPE(node)) {
3752   case NODE_LIST:
3753   case NODE_ALT:
3754     do {
3755       r = check_backrefs(NODE_CAR(node), env);
3756     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3757     break;
3758 
3759   case NODE_ANCHOR:
3760     if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3761       r = 0;
3762       break;
3763     }
3764     /* fall */
3765   case NODE_QUANT:
3766     r = check_backrefs(NODE_BODY(node), env);
3767     break;
3768 
3769   case NODE_BAG:
3770     r = check_backrefs(NODE_BODY(node), env);
3771     {
3772       BagNode* en = BAG_(node);
3773 
3774       if (en->type == BAG_IF_ELSE) {
3775         if (r != 0) return r;
3776         if (IS_NOT_NULL(en->te.Then)) {
3777           r = check_backrefs(en->te.Then, env);
3778           if (r != 0) return r;
3779         }
3780         if (IS_NOT_NULL(en->te.Else)) {
3781           r = check_backrefs(en->te.Else, env);
3782         }
3783       }
3784     }
3785     break;
3786 
3787   case NODE_BACKREF:
3788     {
3789       int i;
3790       BackRefNode* br = BACKREF_(node);
3791       int* backs = BACKREFS_P(br);
3792       MemEnv* mem_env = PARSEENV_MEMENV(env);
3793 
3794       for (i = 0; i < br->back_num; i++) {
3795         if (backs[i] > env->num_mem)
3796           return ONIGERR_INVALID_BACKREF;
3797 
3798         NODE_STATUS_ADD(mem_env[backs[i]].mem_node, BACKREF);
3799       }
3800       r = 0;
3801     }
3802     break;
3803 
3804   default:
3805     r = 0;
3806     break;
3807   }
3808 
3809   return r;
3810 }
3811 
3812 static int
set_empty_repeat_node_trav(Node * node,Node * empty,ParseEnv * env)3813 set_empty_repeat_node_trav(Node* node, Node* empty, ParseEnv* env)
3814 {
3815   int r;
3816 
3817   switch (NODE_TYPE(node)) {
3818   case NODE_LIST:
3819   case NODE_ALT:
3820     do {
3821       r = set_empty_repeat_node_trav(NODE_CAR(node), empty, env);
3822     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3823     break;
3824 
3825   case NODE_ANCHOR:
3826     {
3827       AnchorNode* an = ANCHOR_(node);
3828 
3829       if (! ANCHOR_HAS_BODY(an)) {
3830         r = 0;
3831         break;
3832       }
3833 
3834       switch (an->type) {
3835       case ANCR_PREC_READ:
3836       case ANCR_LOOK_BEHIND:
3837         empty = NULL_NODE;
3838         break;
3839       default:
3840         break;
3841       }
3842       r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3843     }
3844     break;
3845 
3846   case NODE_QUANT:
3847     {
3848       QuantNode* qn = QUANT_(node);
3849 
3850       if (qn->emptiness != BODY_IS_NOT_EMPTY) empty = node;
3851       r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3852     }
3853     break;
3854 
3855   case NODE_BAG:
3856     if (IS_NOT_NULL(NODE_BODY(node))) {
3857       r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3858       if (r != 0) return r;
3859     }
3860     {
3861       BagNode* en = BAG_(node);
3862 
3863       r = 0;
3864       if (en->type == BAG_MEMORY) {
3865         if (NODE_IS_BACKREF(node)) {
3866           if (IS_NOT_NULL(empty))
3867             PARSEENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty;
3868         }
3869       }
3870       else if (en->type == BAG_IF_ELSE) {
3871         if (IS_NOT_NULL(en->te.Then)) {
3872           r = set_empty_repeat_node_trav(en->te.Then, empty, env);
3873           if (r != 0) return r;
3874         }
3875         if (IS_NOT_NULL(en->te.Else)) {
3876           r = set_empty_repeat_node_trav(en->te.Else, empty, env);
3877         }
3878       }
3879     }
3880     break;
3881 
3882   default:
3883     r = 0;
3884     break;
3885   }
3886 
3887   return r;
3888 }
3889 
3890 static int
is_ancestor_node(Node * node,Node * me)3891 is_ancestor_node(Node* node, Node* me)
3892 {
3893   Node* parent;
3894 
3895   while ((parent = NODE_PARENT(me)) != NULL_NODE) {
3896     if (parent == node) return 1;
3897     me = parent;
3898   }
3899   return 0;
3900 }
3901 
3902 static void
set_empty_status_check_trav(Node * node,ParseEnv * env)3903 set_empty_status_check_trav(Node* node, ParseEnv* env)
3904 {
3905   switch (NODE_TYPE(node)) {
3906   case NODE_LIST:
3907   case NODE_ALT:
3908     do {
3909       set_empty_status_check_trav(NODE_CAR(node), env);
3910     } while (IS_NOT_NULL(node = NODE_CDR(node)));
3911     break;
3912 
3913   case NODE_ANCHOR:
3914     {
3915       AnchorNode* an = ANCHOR_(node);
3916 
3917       if (! ANCHOR_HAS_BODY(an)) break;
3918       set_empty_status_check_trav(NODE_BODY(node), env);
3919     }
3920     break;
3921 
3922   case NODE_QUANT:
3923     set_empty_status_check_trav(NODE_BODY(node), env);
3924     break;
3925 
3926   case NODE_BAG:
3927     if (IS_NOT_NULL(NODE_BODY(node)))
3928       set_empty_status_check_trav(NODE_BODY(node), env);
3929     {
3930       BagNode* en = BAG_(node);
3931 
3932       if (en->type == BAG_IF_ELSE) {
3933         if (IS_NOT_NULL(en->te.Then)) {
3934           set_empty_status_check_trav(en->te.Then, env);
3935         }
3936         if (IS_NOT_NULL(en->te.Else)) {
3937           set_empty_status_check_trav(en->te.Else, env);
3938         }
3939       }
3940     }
3941     break;
3942 
3943   case NODE_BACKREF:
3944     {
3945       int i;
3946       int* backs;
3947       MemEnv* mem_env = PARSEENV_MEMENV(env);
3948       BackRefNode* br = BACKREF_(node);
3949       backs = BACKREFS_P(br);
3950       for (i = 0; i < br->back_num; i++) {
3951         Node* ernode = mem_env[backs[i]].empty_repeat_node;
3952         if (IS_NOT_NULL(ernode)) {
3953           if (! is_ancestor_node(ernode, node)) {
3954             MEM_STATUS_LIMIT_ON(QUANT_(ernode)->empty_status_mem, backs[i]);
3955             NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK);
3956             NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK);
3957           }
3958         }
3959       }
3960     }
3961     break;
3962 
3963   default:
3964     break;
3965   }
3966 }
3967 
3968 static void
set_parent_node_trav(Node * node,Node * parent)3969 set_parent_node_trav(Node* node, Node* parent)
3970 {
3971   NODE_PARENT(node) = parent;
3972 
3973   switch (NODE_TYPE(node)) {
3974   case NODE_LIST:
3975   case NODE_ALT:
3976     do {
3977       set_parent_node_trav(NODE_CAR(node), node);
3978     } while (IS_NOT_NULL(node = NODE_CDR(node)));
3979     break;
3980 
3981   case NODE_ANCHOR:
3982     if (! ANCHOR_HAS_BODY(ANCHOR_(node))) break;
3983     set_parent_node_trav(NODE_BODY(node), node);
3984     break;
3985 
3986   case NODE_QUANT:
3987     set_parent_node_trav(NODE_BODY(node), node);
3988     break;
3989 
3990   case NODE_BAG:
3991     if (IS_NOT_NULL(NODE_BODY(node)))
3992       set_parent_node_trav(NODE_BODY(node), node);
3993     {
3994       BagNode* en = BAG_(node);
3995 
3996       if (en->type == BAG_IF_ELSE) {
3997         if (IS_NOT_NULL(en->te.Then))
3998           set_parent_node_trav(en->te.Then, node);
3999         if (IS_NOT_NULL(en->te.Else)) {
4000           set_parent_node_trav(en->te.Else, node);
4001         }
4002       }
4003     }
4004     break;
4005 
4006   default:
4007     break;
4008   }
4009 }
4010 
4011 
4012 #ifdef USE_CALL
4013 
/*
 * Bit flags combined in the non-negative return value of
 * infinite_recursive_call_check():
 *   RECURSION_EXIST    - a group marked as being checked (MARK1) was
 *                        reached somewhere in the subtree.
 *   RECURSION_MUST     - reaching it is unavoidable: the flag survives an
 *                        alternation only if every branch has it, and is
 *                        cleared by a quantifier whose lower bound is 0.
 *   RECURSION_INFINITE - it was reached while `head` was still non-zero,
 *                        i.e. before any list element with a non-zero
 *                        minimum byte length.
 */
#define RECURSION_EXIST        (1<<0)
#define RECURSION_MUST         (1<<1)
#define RECURSION_INFINITE     (1<<2)
4017 
4018 static int
infinite_recursive_call_check(Node * node,ParseEnv * env,int head)4019 infinite_recursive_call_check(Node* node, ParseEnv* env, int head)
4020 {
4021   int ret;
4022   int r = 0;
4023 
4024   switch (NODE_TYPE(node)) {
4025   case NODE_LIST:
4026     {
4027       Node *x;
4028       OnigLen min;
4029 
4030       x = node;
4031       do {
4032         ret = infinite_recursive_call_check(NODE_CAR(x), env, head);
4033         if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
4034         r |= ret;
4035         if (head != 0) {
4036           min = node_min_byte_len(NODE_CAR(x), env);
4037           if (min != 0) head = 0;
4038         }
4039       } while (IS_NOT_NULL(x = NODE_CDR(x)));
4040     }
4041     break;
4042 
4043   case NODE_ALT:
4044     {
4045       int must;
4046 
4047       must = RECURSION_MUST;
4048       do {
4049         ret = infinite_recursive_call_check(NODE_CAR(node), env, head);
4050         if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
4051 
4052         r    |= (ret & RECURSION_EXIST);
4053         must &= ret;
4054       } while (IS_NOT_NULL(node = NODE_CDR(node)));
4055       r |= must;
4056     }
4057     break;
4058 
4059   case NODE_QUANT:
4060     if (QUANT_(node)->upper == 0) break;
4061 
4062     r = infinite_recursive_call_check(NODE_BODY(node), env, head);
4063     if (r < 0) return r;
4064     if ((r & RECURSION_MUST) != 0) {
4065       if (QUANT_(node)->lower == 0)
4066         r &= ~RECURSION_MUST;
4067     }
4068     break;
4069 
4070   case NODE_ANCHOR:
4071     if (! ANCHOR_HAS_BODY(ANCHOR_(node)))
4072       break;
4073     /* fall */
4074   case NODE_CALL:
4075     r = infinite_recursive_call_check(NODE_BODY(node), env, head);
4076     break;
4077 
4078   case NODE_BAG:
4079     {
4080       BagNode* en = BAG_(node);
4081 
4082       if (en->type == BAG_MEMORY) {
4083         if (NODE_IS_MARK2(node))
4084           return 0;
4085         else if (NODE_IS_MARK1(node))
4086           return (head == 0 ? RECURSION_EXIST | RECURSION_MUST
4087                   : RECURSION_EXIST | RECURSION_MUST | RECURSION_INFINITE);
4088         else {
4089           NODE_STATUS_ADD(node, MARK2);
4090           r = infinite_recursive_call_check(NODE_BODY(node), env, head);
4091           NODE_STATUS_REMOVE(node, MARK2);
4092         }
4093       }
4094       else if (en->type == BAG_IF_ELSE) {
4095         int eret;
4096 
4097         ret = infinite_recursive_call_check(NODE_BODY(node), env, head);
4098         if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
4099         r |= ret;
4100         if (IS_NOT_NULL(en->te.Then)) {
4101           OnigLen min;
4102           if (head != 0) {
4103             min = node_min_byte_len(NODE_BODY(node), env);
4104           }
4105           else min = 0;
4106 
4107           ret = infinite_recursive_call_check(en->te.Then, env, min != 0 ? 0:head);
4108           if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
4109           r |= ret;
4110         }
4111         if (IS_NOT_NULL(en->te.Else)) {
4112           eret = infinite_recursive_call_check(en->te.Else, env, head);
4113           if (eret < 0 || (eret & RECURSION_INFINITE) != 0) return eret;
4114           r |= (eret & RECURSION_EXIST);
4115           if ((eret & RECURSION_MUST) == 0)
4116             r &= ~RECURSION_MUST;
4117         }
4118         else {
4119           r &= ~RECURSION_MUST;
4120         }
4121       }
4122       else {
4123         r = infinite_recursive_call_check(NODE_BODY(node), env, head);
4124       }
4125     }
4126     break;
4127 
4128   default:
4129     break;
4130   }
4131 
4132   return r;
4133 }
4134 
4135 static int
infinite_recursive_call_check_trav(Node * node,ParseEnv * env)4136 infinite_recursive_call_check_trav(Node* node, ParseEnv* env)
4137 {
4138   int r;
4139 
4140   switch (NODE_TYPE(node)) {
4141   case NODE_LIST:
4142   case NODE_ALT:
4143     do {
4144       r = infinite_recursive_call_check_trav(NODE_CAR(node), env);
4145     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4146     break;
4147 
4148   case NODE_ANCHOR:
4149     if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
4150       r = 0;
4151       break;
4152     }
4153     /* fall */
4154   case NODE_QUANT:
4155     r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
4156     break;
4157 
4158   case NODE_BAG:
4159     {
4160       BagNode* en = BAG_(node);
4161 
4162       if (en->type == BAG_MEMORY) {
4163         if (NODE_IS_RECURSION(node) && NODE_IS_CALLED(node)) {
4164           int ret;
4165 
4166           NODE_STATUS_ADD(node, MARK1);
4167 
4168           ret = infinite_recursive_call_check(NODE_BODY(node), env, 1);
4169           if (ret < 0) return ret;
4170           else if ((ret & (RECURSION_MUST | RECURSION_INFINITE)) != 0)
4171             return ONIGERR_NEVER_ENDING_RECURSION;
4172 
4173           NODE_STATUS_REMOVE(node, MARK1);
4174         }
4175       }
4176       else if (en->type == BAG_IF_ELSE) {
4177         if (IS_NOT_NULL(en->te.Then)) {
4178           r = infinite_recursive_call_check_trav(en->te.Then, env);
4179           if (r != 0) return r;
4180         }
4181         if (IS_NOT_NULL(en->te.Else)) {
4182           r = infinite_recursive_call_check_trav(en->te.Else, env);
4183           if (r != 0) return r;
4184         }
4185       }
4186     }
4187 
4188     r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
4189     break;
4190 
4191   default:
4192     r = 0;
4193     break;
4194   }
4195 
4196   return r;
4197 }
4198 
4199 static int
recursive_call_check(Node * node)4200 recursive_call_check(Node* node)
4201 {
4202   int r;
4203 
4204   switch (NODE_TYPE(node)) {
4205   case NODE_LIST:
4206   case NODE_ALT:
4207     r = 0;
4208     do {
4209       r |= recursive_call_check(NODE_CAR(node));
4210     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4211     break;
4212 
4213   case NODE_ANCHOR:
4214     if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
4215       r = 0;
4216       break;
4217     }
4218     /* fall */
4219   case NODE_QUANT:
4220     r = recursive_call_check(NODE_BODY(node));
4221     break;
4222 
4223   case NODE_CALL:
4224     r = recursive_call_check(NODE_BODY(node));
4225     if (r != 0) {
4226       if (NODE_IS_MARK1(NODE_BODY(node)))
4227         NODE_STATUS_ADD(node, RECURSION);
4228     }
4229     break;
4230 
4231   case NODE_BAG:
4232     {
4233       BagNode* en = BAG_(node);
4234 
4235       if (en->type == BAG_MEMORY) {
4236         if (NODE_IS_MARK2(node))
4237           return 0;
4238         else if (NODE_IS_MARK1(node))
4239           return 1; /* recursion */
4240         else {
4241           NODE_STATUS_ADD(node, MARK2);
4242           r = recursive_call_check(NODE_BODY(node));
4243           NODE_STATUS_REMOVE(node, MARK2);
4244         }
4245       }
4246       else if (en->type == BAG_IF_ELSE) {
4247         r = 0;
4248         if (IS_NOT_NULL(en->te.Then)) {
4249           r |= recursive_call_check(en->te.Then);
4250         }
4251         if (IS_NOT_NULL(en->te.Else)) {
4252           r |= recursive_call_check(en->te.Else);
4253         }
4254         r |= recursive_call_check(NODE_BODY(node));
4255       }
4256       else {
4257         r = recursive_call_check(NODE_BODY(node));
4258       }
4259     }
4260     break;
4261 
4262   default:
4263     r = 0;
4264     break;
4265   }
4266 
4267   return r;
4268 }
4269 
/* Bit of the `state` argument of recursive_call_check_trav(): set for
   everything below a bag node that carries the RECURSION status. */
#define IN_RECURSION         (1<<0)
/* Return value of recursive_call_check_trav() when the subtree contains
   a memory group flagged CALLED. */
#define FOUND_CALLED_NODE    1
4272 
4273 static int
recursive_call_check_trav(Node * node,ParseEnv * env,int state)4274 recursive_call_check_trav(Node* node, ParseEnv* env, int state)
4275 {
4276   int r = 0;
4277 
4278   switch (NODE_TYPE(node)) {
4279   case NODE_LIST:
4280   case NODE_ALT:
4281     {
4282       int ret;
4283       do {
4284         ret = recursive_call_check_trav(NODE_CAR(node), env, state);
4285         if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
4286         else if (ret < 0) return ret;
4287       } while (IS_NOT_NULL(node = NODE_CDR(node)));
4288     }
4289     break;
4290 
4291   case NODE_QUANT:
4292     r = recursive_call_check_trav(NODE_BODY(node), env, state);
4293     if (QUANT_(node)->upper == 0) {
4294       if (r == FOUND_CALLED_NODE)
4295         QUANT_(node)->include_referred = 1;
4296     }
4297     break;
4298 
4299   case NODE_ANCHOR:
4300     {
4301       AnchorNode* an = ANCHOR_(node);
4302       if (ANCHOR_HAS_BODY(an))
4303         r = recursive_call_check_trav(NODE_ANCHOR_BODY(an), env, state);
4304     }
4305     break;
4306 
4307   case NODE_BAG:
4308     {
4309       int ret;
4310       int state1;
4311       BagNode* en = BAG_(node);
4312 
4313       if (en->type == BAG_MEMORY) {
4314         if (NODE_IS_CALLED(node)) {
4315           r = FOUND_CALLED_NODE;
4316           goto check_recursion;
4317         }
4318         else if ((state & IN_RECURSION) != 0) {
4319         check_recursion:
4320           if (! NODE_IS_RECURSION(node)) {
4321             NODE_STATUS_ADD(node, MARK1);
4322             ret = recursive_call_check(NODE_BODY(node));
4323             if (ret != 0) {
4324               NODE_STATUS_ADD(node, RECURSION);
4325               MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
4326             }
4327             NODE_STATUS_REMOVE(node, MARK1);
4328           }
4329         }
4330       }
4331 
4332       state1 = state;
4333       if (NODE_IS_RECURSION(node))
4334         state1 |= IN_RECURSION;
4335 
4336       ret = recursive_call_check_trav(NODE_BODY(node), env, state1);
4337       if (ret == FOUND_CALLED_NODE)
4338         r = FOUND_CALLED_NODE;
4339 
4340       if (en->type == BAG_IF_ELSE) {
4341         if (IS_NOT_NULL(en->te.Then)) {
4342           ret = recursive_call_check_trav(en->te.Then, env, state1);
4343           if (ret == FOUND_CALLED_NODE)
4344             r = FOUND_CALLED_NODE;
4345         }
4346         if (IS_NOT_NULL(en->te.Else)) {
4347           ret = recursive_call_check_trav(en->te.Else, env, state1);
4348           if (ret == FOUND_CALLED_NODE)
4349             r = FOUND_CALLED_NODE;
4350         }
4351       }
4352     }
4353     break;
4354 
4355   default:
4356     break;
4357   }
4358 
4359   return r;
4360 }
4361 
4362 #endif
4363 
4364 static void
remove_from_list(Node * prev,Node * a)4365 remove_from_list(Node* prev, Node* a)
4366 {
4367   if (NODE_CDR(prev) != a) return ;
4368 
4369   NODE_CDR(prev) = NODE_CDR(a);
4370   NODE_CDR(a) = NULL_NODE;
4371 }
4372 
/*
 * Walk the tree below `node` and, inside every NODE_LIST, concatenate runs
 * of adjacent NODE_STRING elements into the first string of the run.
 * Two neighbouring strings are merged only when both their StrNode flag
 * and their node status are equal.
 *
 * The function recurses into NODE_ALT elements, the body of NODE_ANCHOR
 * (when it has one), NODE_QUANT and NODE_BAG, and into the Then/Else
 * branches of a BAG_IF_ELSE.
 *
 * When USE_CHECK_VALIDITY_OF_STRING_IN_TREE is defined, every finished
 * string is checked with ONIGENC_IS_VALID_MBC_STRING() against `enc`.
 *
 * Returns 0 on success, ONIGERR_INVALID_WIDE_CHAR_VALUE for an invalid
 * string (validity check builds only), or the non-zero result of
 * node_str_node_cat() / a recursive call.
 */
static int
reduce_string_list(Node* node, OnigEncoding enc)
{
  int r = 0;

  switch (NODE_TYPE(node)) {
  case NODE_LIST:
    {
      Node* prev;       /* string node currently accumulating a run, or NULL */
      Node* curr;       /* element (CAR) of the list cell being visited */
      Node* prev_node;  /* list cell that precedes the next cell to visit */
      Node* next_node;  /* saved CDR: `node` may be freed inside the loop */

      prev = NULL_NODE;
      do {
        next_node = NODE_CDR(node);
        curr = NODE_CAR(node);
        if (NODE_TYPE(curr) == NODE_STRING) {
          if (IS_NULL(prev)
              || STR_(curr)->flag  != STR_(prev)->flag
              || NODE_STATUS(curr) != NODE_STATUS(prev)) {
            /* start a new run at this string */
            prev = curr;
            prev_node = node;
          }
          else {
            /* same kind of string: append to the run head, then drop this
               list cell (prev_node stays, it now precedes next_node) */
            r = node_str_node_cat(prev, curr);
            if (r != 0) return r;
            remove_from_list(prev_node, node);
            /* NOTE(review): presumably this also releases `curr`, which is
               still attached as the CAR of the cell -- confirm */
            onig_node_free(node);
          }
        }
        else {
          if (IS_NOT_NULL(prev)) {
            /* a non-string element ends the current run */
#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE
            StrNode* sn = STR_(prev);
            if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end))
              return ONIGERR_INVALID_WIDE_CHAR_VALUE;
#endif
            prev = NULL_NODE;
          }
          r = reduce_string_list(curr, enc);
          if (r != 0) return r;
          prev_node = node;
        }

        node = next_node;
      } while (r == 0 && IS_NOT_NULL(node));

#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE
      /* validate the run that reached the end of the list */
      if (IS_NOT_NULL(prev)) {
        StrNode* sn = STR_(prev);
        if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end))
          return ONIGERR_INVALID_WIDE_CHAR_VALUE;
      }
#endif
    }
    break;

  case NODE_ALT:
    /* alternatives are independent: reduce each one, stop on first error */
    do {
      r = reduce_string_list(NODE_CAR(node), enc);
    } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
    break;

#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE
  case NODE_STRING:
    {
      /* a string that is not an element of a list: only validate it */
      StrNode* sn = STR_(node);
      if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end))
        return ONIGERR_INVALID_WIDE_CHAR_VALUE;
    }
    break;
#endif

  case NODE_ANCHOR:
    /* anchors without a body have nothing to reduce */
    if (IS_NULL(NODE_BODY(node)))
      break;
    /* fall */
  case NODE_QUANT:
    r = reduce_string_list(NODE_BODY(node), enc);
    break;

  case NODE_BAG:
    {
      BagNode* en = BAG_(node);

      r = reduce_string_list(NODE_BODY(node), enc);
      if (r != 0) return r;

      /* an if-else bag additionally owns optional Then / Else sub-trees */
      if (en->type == BAG_IF_ELSE) {
        if (IS_NOT_NULL(en->te.Then)) {
          r = reduce_string_list(en->te.Then, enc);
          if (r != 0) return r;
        }
        if (IS_NOT_NULL(en->te.Else)) {
          r = reduce_string_list(en->te.Else, enc);
          if (r != 0) return r;
        }
      }
    }
    break;

  default:
    break;
  }

  return r;
}
4481 
4482 
/* Bit flags describing the syntactic context of a node.  They are OR-ed
   into the `state` argument that the tune_*() passes hand down the tree. */
#define IN_ALT          (1<<0)  /* below a NODE_ALT or a BAG_IF_ELSE */
#define IN_NOT          (1<<1)  /* below a negative look-ahead / look-behind */
#define IN_REAL_REPEAT  (1<<2)  /* below a quantifier with upper >= 2 or infinite */
#define IN_VAR_REPEAT   (1<<3)  /* below a quantifier with lower != upper */
#define IN_ZERO_REPEAT  (1<<4)  /* below a quantifier with upper == 0 */
#define IN_MULTI_ENTRY  (1<<5)  /* below a memory bag with entry_count > 1 */
#define IN_PREC_READ    (1<<6)  /* presumably: below a look-ahead -- TODO confirm, not set in nearby code */
#define IN_LOOK_BEHIND  (1<<7)  /* below a look-behind (set by tune_look_behind) */
#define IN_PEEK         (1<<8)  /* below any look-ahead / look-behind anchor */
4492 
4493 /* divide different length alternatives in look-behind.
4494   (?<=A|B) ==> (?<=A)|(?<=B)
4495   (?<!A|B) ==> (?<!A)(?<!B)
4496 */
4497 static int
divide_look_behind_alternatives(Node * node)4498 divide_look_behind_alternatives(Node* node)
4499 {
4500   int r;
4501   int anc_type;
4502   Node *head, *np, *insert_node;
4503   AnchorNode* an;
4504 
4505   an = ANCHOR_(node);
4506   anc_type = an->type;
4507 
4508   head = NODE_ANCHOR_BODY(an);
4509   np = NODE_CAR(head);
4510   node_swap(node, head);
4511   NODE_CAR(node) = head;
4512   NODE_BODY(head) = np;
4513 
4514   np = node;
4515   while (IS_NOT_NULL(np = NODE_CDR(np))) {
4516     r = onig_node_copy(&insert_node, head);
4517     if (r != 0) return r;
4518     CHECK_NULL_RETURN_MEMERR(insert_node);
4519     NODE_BODY(insert_node) = NODE_CAR(np);
4520     NODE_CAR(np) = insert_node;
4521   }
4522 
4523   if (anc_type == ANCR_LOOK_BEHIND_NOT) {
4524     np = node;
4525     do {
4526       NODE_SET_TYPE(np, NODE_LIST);  /* alt -> list */
4527     } while (IS_NOT_NULL(np = NODE_CDR(np)));
4528   }
4529   return 0;
4530 }
4531 
4532 static int
node_reduce_in_look_behind(Node * node)4533 node_reduce_in_look_behind(Node* node)
4534 {
4535   NodeType type;
4536   Node* body;
4537 
4538   if (NODE_TYPE(node) != NODE_QUANT) return 0;
4539 
4540   body = NODE_BODY(node);
4541   type = NODE_TYPE(body);
4542   if (type == NODE_STRING || type == NODE_CTYPE ||
4543       type == NODE_CCLASS || type == NODE_BACKREF) {
4544     QuantNode* qn = QUANT_(node);
4545     qn->upper = qn->lower;
4546     if (qn->upper == 0)
4547       return 1; /* removed */
4548   }
4549 
4550   return 0;
4551 }
4552 
4553 static int
list_reduce_in_look_behind(Node * node)4554 list_reduce_in_look_behind(Node* node)
4555 {
4556   int r;
4557 
4558   switch (NODE_TYPE(node)) {
4559   case NODE_QUANT:
4560     r = node_reduce_in_look_behind(node);
4561     if (r > 0) r = 0;
4562     break;
4563 
4564   case NODE_LIST:
4565     do {
4566       r = node_reduce_in_look_behind(NODE_CAR(node));
4567       if (r <= 0) break;
4568     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4569     break;
4570 
4571   default:
4572     r = 0;
4573     break;
4574   }
4575 
4576   return r;
4577 }
4578 
4579 static int
alt_reduce_in_look_behind(Node * node,regex_t * reg,ParseEnv * env)4580 alt_reduce_in_look_behind(Node* node, regex_t* reg, ParseEnv* env)
4581 {
4582   int r;
4583 
4584   switch (NODE_TYPE(node)) {
4585   case NODE_ALT:
4586     do {
4587       r = list_reduce_in_look_behind(NODE_CAR(node));
4588     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4589     break;
4590 
4591   default:
4592     r = list_reduce_in_look_behind(node);
4593     break;
4594   }
4595 
4596   return r;
4597 }
4598 
4599 static int tune_tree(Node* node, regex_t* reg, int state, ParseEnv* env);
4600 
/*
 * Tune a look-behind anchor node ((?<=...) or (?<!...)).
 *
 * Steps, in order:
 *   1. check_node_in_look_behind() rejects bodies that are not allowed
 *      in a look-behind.
 *   2. The body is tuned with IN_LOOK_BEHIND (and IN_NOT for the negative
 *      form) added to `state`.
 *   3. Leading quantifiers of the body are reduced.
 *   4. The character length of the body is computed and decides whether
 *      the anchor is replaced by a constant, split per alternative,
 *      accepted as is, or rejected.
 *
 * Returns ONIG_NORMAL (0) on success or a negative error code, most often
 * ONIGERR_INVALID_LOOK_BEHIND_PATTERN.
 */
static int
tune_look_behind(Node* node, regex_t* reg, int state, ParseEnv* env)
{
  int r;
  int state1;
  int used;
  MinMaxCharLen ci;
  Node* body;
  AnchorNode* an = ANCHOR_(node);

  used = FALSE;
  r = check_node_in_look_behind(NODE_ANCHOR_BODY(an),
                                an->type == ANCR_LOOK_BEHIND_NOT ? 1 : 0,
                                &used);
  if (r < 0) return r;
  if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;

  if (an->type == ANCR_LOOK_BEHIND_NOT)
    state1 = state | IN_NOT | IN_LOOK_BEHIND;
  else
    state1 = state | IN_LOOK_BEHIND;

  body = NODE_ANCHOR_BODY(an);
  /* Execute tune_tree(body) before call node_char_len().
     Because case-fold expansion must be done before node_char_len().
   */
  r = tune_tree(body, reg, state1, env);
  if (r != 0) return r;

  r = alt_reduce_in_look_behind(body, reg, env);
  if (r != 0) return r;

  r = node_char_len(body, reg, &ci, env);
  if (r >= 0) {
    /* #177: overflow in onigenc_step_back() */
    if ((ci.max != INFINITE_LEN && ci.max > LOOK_BEHIND_MAX_CHAR_LEN)
      || ci.min > LOOK_BEHIND_MAX_CHAR_LEN) {
      return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
    }

    /* The body can certainly match the empty string and `used` was not
       set by check_node_in_look_behind(): the anchor is constant.
       The negative form is reset to "fail", the positive form to "empty". */
    if (ci.min == 0 && ci.min_is_sure != FALSE && used == FALSE) {
      if (an->type == ANCR_LOOK_BEHIND_NOT)
        r = onig_node_reset_fail(node);
      else
        r = onig_node_reset_empty(node);

      return r;
    }

    if (r == CHAR_LEN_TOP_ALT_FIXED) {
      if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND)) {
        /* split into one look-behind per alternative, then tune the
           rewritten node again with the caller's original state */
        r = divide_look_behind_alternatives(node);
        if (r == 0)
          r = tune_tree(node, reg, state, env);
      }
      else if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND))
        goto normal;
      else
        r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
    }
    else { /* CHAR_LEN_NORMAL */
    normal:
      if (ci.min == INFINITE_LEN) {
        r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
      }
      else {
        /* a variable-length body needs explicit syntax support */
        if (ci.min != ci.max &&
            ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND)) {
          r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
        }
        else {
          Node* tail;

          /* check lead_node is already set by double call after
             divide_look_behind_alternatives() */
          if (IS_NULL(an->lead_node)) {
            an->char_min_len = ci.min;
            an->char_max_len = ci.max;
            /* keep a copy of the body's tail literal, when one exists,
               in lead_node */
            r = get_tree_tail_literal(body, &tail, reg);
            if (r == GET_VALUE_FOUND) {
              r = onig_node_copy(&(an->lead_node), tail);
              if (r != 0) return r;
            }
          }
          /* any other result of get_tree_tail_literal() is discarded */
          r = ONIG_NORMAL;
        }
      }
    }
  }

  return r;
}
4693 
4694 static int
tune_next(Node * node,Node * next_node,regex_t * reg)4695 tune_next(Node* node, Node* next_node, regex_t* reg)
4696 {
4697   int called;
4698   NodeType type;
4699 
4700   called = FALSE;
4701 
4702  retry:
4703   type = NODE_TYPE(node);
4704   if (type == NODE_QUANT) {
4705     QuantNode* qn = QUANT_(node);
4706     if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) {
4707 #ifdef USE_QUANT_PEEK_NEXT
4708       if (called == FALSE) {
4709         Node* n = get_tree_head_literal(next_node, 1, reg);
4710         /* '\0': for UTF-16BE etc... */
4711         if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') {
4712           qn->next_head_exact = n;
4713         }
4714       }
4715 #endif
4716       /* automatic posseivation a*b ==> (?>a*)b */
4717       if (qn->lower <= 1) {
4718         if (is_strict_real_node(NODE_BODY(node))) {
4719           Node *x, *y;
4720           x = get_tree_head_literal(NODE_BODY(node), 0, reg);
4721           if (IS_NOT_NULL(x)) {
4722             y = get_tree_head_literal(next_node,  0, reg);
4723             if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) {
4724               Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK);
4725               CHECK_NULL_RETURN_MEMERR(en);
4726               NODE_STATUS_ADD(en, STRICT_REAL_REPEAT);
4727               node_swap(node, en);
4728               NODE_BODY(node) = en;
4729             }
4730           }
4731         }
4732       }
4733     }
4734   }
4735   else if (type == NODE_BAG) {
4736     BagNode* en = BAG_(node);
4737     if (en->type == BAG_MEMORY) {
4738       if (NODE_IS_CALLED(node))
4739         called = TRUE;
4740       node = NODE_BODY(node);
4741       goto retry;
4742     }
4743   }
4744   return 0;
4745 }
4746 
4747 
4748 static int
is_all_code_len_1_items(int n,OnigCaseFoldCodeItem items[])4749 is_all_code_len_1_items(int n, OnigCaseFoldCodeItem items[])
4750 {
4751   int i;
4752 
4753   for (i = 0; i < n; i++) {
4754     OnigCaseFoldCodeItem* item = items + i;
4755     if (item->code_len != 1) return 0;
4756   }
4757 
4758   return 1;
4759 }
4760 
4761 static int
get_min_max_byte_len_case_fold_items(int n,OnigCaseFoldCodeItem items[],OnigLen * rmin,OnigLen * rmax)4762 get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[],
4763                                      OnigLen* rmin, OnigLen* rmax)
4764 {
4765   int i;
4766   OnigLen len, minlen, maxlen;
4767 
4768   minlen = INFINITE_LEN;
4769   maxlen = 0;
4770   for (i = 0; i < n; i++) {
4771     OnigCaseFoldCodeItem* item = items + i;
4772 
4773     len = item->byte_len;
4774     if (len < minlen) minlen = len;
4775     if (len > maxlen) maxlen = len;
4776   }
4777 
4778   *rmin = minlen;
4779   *rmax = maxlen;
4780   return 0;
4781 }
4782 
4783 static int
make_code_list_to_string(Node ** rnode,OnigEncoding enc,int n,OnigCodePoint codes[])4784 make_code_list_to_string(Node** rnode, OnigEncoding enc,
4785                          int n, OnigCodePoint codes[])
4786 {
4787   int r, i, len;
4788   Node* node;
4789   UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4790 
4791   *rnode = NULL_NODE;
4792   node = onig_node_new_str(NULL, NULL);
4793   CHECK_NULL_RETURN_MEMERR(node);
4794 
4795   for (i = 0; i < n; i++) {
4796     len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf);
4797     if (len < 0) {
4798       r = len;
4799       goto err;
4800     }
4801 
4802     r = onig_node_str_cat(node, buf, buf + len);
4803     if (r != 0) goto err;
4804   }
4805 
4806   *rnode = node;
4807   return 0;
4808 
4809  err:
4810   onig_node_free(node);
4811   return r;
4812 }
4813 
4814 static int
unravel_cf_node_add(Node ** rlist,Node * add)4815 unravel_cf_node_add(Node** rlist, Node* add)
4816 {
4817   Node *list;
4818 
4819   list = *rlist;
4820   if (IS_NULL(list)) {
4821     list = onig_node_new_list(add, NULL);
4822     CHECK_NULL_RETURN_MEMERR(list);
4823     *rlist = list;
4824   }
4825   else {
4826     Node* r = node_list_add(list, add);
4827     CHECK_NULL_RETURN_MEMERR(r);
4828   }
4829 
4830   return 0;
4831 }
4832 
4833 static int
unravel_cf_string_add(Node ** rlist,Node ** rsn,UChar * s,UChar * end,unsigned int flag)4834 unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end,
4835                       unsigned int flag)
4836 {
4837   int r;
4838   Node *sn, *list;
4839 
4840   list = *rlist;
4841   sn   = *rsn;
4842 
4843   if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) {
4844     r = onig_node_str_cat(sn, s, end);
4845   }
4846   else {
4847     sn = onig_node_new_str(s, end);
4848     CHECK_NULL_RETURN_MEMERR(sn);
4849 
4850     STR_(sn)->flag = flag;
4851     r = unravel_cf_node_add(&list, sn);
4852   }
4853 
4854   if (r == 0) {
4855     *rlist = list;
4856     *rsn = sn;
4857   }
4858   return r;
4859 }
4860 
4861 static int
unravel_cf_string_alt_or_cc_add(Node ** rlist,int n,OnigCaseFoldCodeItem items[],OnigEncoding enc,OnigCaseFoldType case_fold_flag,UChar * s,UChar * end)4862 unravel_cf_string_alt_or_cc_add(Node** rlist, int n,
4863             OnigCaseFoldCodeItem items[], OnigEncoding enc,
4864             OnigCaseFoldType case_fold_flag, UChar* s, UChar* end)
4865 {
4866   int r, i;
4867   Node* node;
4868 
4869   if (is_all_code_len_1_items(n, items)) {
4870     OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */
4871 
4872     codes[0] = ONIGENC_MBC_TO_CODE(enc, s, end);
4873     for (i = 0; i < n; i++) {
4874       OnigCaseFoldCodeItem* item = items + i;
4875       codes[i+1] = item->code[0];
4876     }
4877     r = onig_new_cclass_with_code_list(&node, enc, n + 1, codes);
4878     if (r != 0) return r;
4879   }
4880   else {
4881     Node *snode, *alt, *curr;
4882 
4883     snode = onig_node_new_str(s, end);
4884     CHECK_NULL_RETURN_MEMERR(snode);
4885     node = curr = onig_node_new_alt(snode, NULL_NODE);
4886     if (IS_NULL(curr)) {
4887       onig_node_free(snode);
4888       return ONIGERR_MEMORY;
4889     }
4890 
4891     r = 0;
4892     for (i = 0; i < n; i++) {
4893       OnigCaseFoldCodeItem* item = items + i;
4894       r = make_code_list_to_string(&snode, enc, item->code_len, item->code);
4895       if (r != 0) {
4896         onig_node_free(node);
4897         return r;
4898       }
4899 
4900       alt = onig_node_new_alt(snode, NULL_NODE);
4901       if (IS_NULL(alt)) {
4902         onig_node_free(snode);
4903         onig_node_free(node);
4904         return ONIGERR_MEMORY;
4905       }
4906 
4907       NODE_CDR(curr) = alt;
4908       curr = alt;
4909     }
4910   }
4911 
4912   r = unravel_cf_node_add(rlist, node);
4913   if (r != 0) onig_node_free(node);
4914   return r;
4915 }
4916 
4917 static int
unravel_cf_look_behind_add(Node ** rlist,Node ** rsn,int n,OnigCaseFoldCodeItem items[],OnigEncoding enc,UChar * s,OnigLen one_len)4918 unravel_cf_look_behind_add(Node** rlist, Node** rsn,
4919                 int n, OnigCaseFoldCodeItem items[], OnigEncoding enc,
4920                 UChar* s, OnigLen one_len)
4921 {
4922   int r, i, found;
4923 
4924   found = FALSE;
4925   for (i = 0; i < n; i++) {
4926     OnigCaseFoldCodeItem* item = items + i;
4927     if (item->byte_len == one_len) {
4928       if (item->code_len == 1) {
4929         found = TRUE;
4930         break;
4931       }
4932     }
4933   }
4934 
4935   if (found == FALSE) {
4936     r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */);
4937   }
4938   else {
4939     Node* node;
4940     OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */
4941 
4942     found = 0;
4943     codes[found++] = ONIGENC_MBC_TO_CODE(enc, s, s + one_len);
4944     for (i = 0; i < n; i++) {
4945       OnigCaseFoldCodeItem* item = items + i;
4946       if (item->byte_len == one_len) {
4947         if (item->code_len == 1) {
4948           codes[found++] = item->code[0];
4949         }
4950       }
4951     }
4952     r = onig_new_cclass_with_code_list(&node, enc, found, codes);
4953     if (r != 0) return r;
4954 
4955     r = unravel_cf_node_add(rlist, node);
4956     if (r != 0) onig_node_free(node);
4957 
4958     *rsn = NULL_NODE;
4959   }
4960 
4961   return r;
4962 }
4963 
/*
 * Expand an ignore-case string node into an explicit structure.
 *
 * The string is scanned character by character.  Positions without
 * case-fold alternatives are collected into plain string nodes; positions
 * with alternatives become a character class or an alternation (or, inside
 * a look-behind, a character class limited to same-byte-length single
 * code points).  The resulting node, or list of nodes, then replaces the
 * contents of `node` in place.
 *
 * The IGNORECASE status is removed from `node` unless the string is
 * already marked as case-expanded (in which case nothing is done).
 *
 * Returns 0 on success.  On failure returns a non-zero error code and
 * frees the partial result.
 */
static int
unravel_case_fold_string(Node* node, regex_t* reg, int state)
{
  int r, n, in_look_behind;
  OnigLen min_len, max_len, one_len;
  UChar *start, *end, *p, *q;
  StrNode* snode;
  Node *sn, *list;
  OnigEncoding enc;
  OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];

  if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0;

  NODE_STATUS_REMOVE(node, IGNORECASE);
  snode = STR_(node);
  start = snode->s;
  end   = snode->end;
  if (start >= end) return 0;  /* empty string: nothing to expand */

  in_look_behind = (state & IN_LOOK_BEHIND) != 0;
  enc = reg->enc;

  /* list: nodes produced so far; sn: string node still open for appending */
  list = sn = NULL_NODE;
  p = start;
  while (p < end) {
    n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end,
                                           items);
    if (n < 0) {
      r = n;
      goto err;
    }

    one_len = (OnigLen )enclen(enc, p);
    if (n == 0) {
      /* no fold alternatives: copy one character, clamped to the end of
         the string */
      q = p + one_len;
      if (q > end) q = end;
      r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */);
      if (r != 0) goto err;
    }
    else {
      if (in_look_behind != 0) {
        q = p + one_len;
        /* the first item consumes a different number of bytes than one
           character: query again for this single character only */
        if (items[0].byte_len != one_len) {
          r = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, q,
                                                 items);
          if (r < 0) goto err;
          n = r;
        }
        r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len);
        if (r != 0) goto err;
      }
      else {
        /* all items are expected to consume the same number of bytes */
        get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len);
        if (min_len != max_len) {
          r = ONIGERR_PARSER_BUG;
          goto err;
        }

        q = p + max_len;
        r = unravel_cf_string_alt_or_cc_add(&list, n, items, enc,
                                            reg->case_fold_flag, p, q);
        if (r != 0) goto err;
        /* a non-string node was appended: the next literal needs a new
           string node */
        sn = NULL_NODE;
      }
    }

    p = q;
  }

  if (IS_NOT_NULL(list)) {
    /* move the result into `node`; a one-element list is unwrapped.
       After the swap the old contents of `node` hang below `list` and
       are released together with it. */
    if (node_list_len(list) == 1) {
      node_swap(node, NODE_CAR(list));
    }
    else {
      node_swap(node, list);
    }
    onig_node_free(list);
  }
  else {
    /* NOTE(review): every add helper above puts its node into `list`, so
       this branch looks unreachable for a non-empty string -- confirm */
    node_swap(node, sn);
    onig_node_free(sn);
  }
  return 0;

 err:
  /* NOTE(review): presumably `sn`, when set, is an element of `list`, so
     freeing the list covers it -- confirm */
  if (IS_NOT_NULL(list))
    onig_node_free(list);
  else if (IS_NOT_NULL(sn))
    onig_node_free(sn);

  return r;
}
5056 
#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT
/* Classify the body of a possibly-empty repeat by the captures it holds.
   The result is BODY_MAY_BE_EMPTY unless the sub-tree contains a memory
   (capture) bag, which yields BODY_MAY_BE_EMPTY_MEM, or a recursive
   call / recursive capture, which yields BODY_MAY_BE_EMPTY_REC.
   For lists, alternatives and if-else bags the largest value found among
   the parts is used. */
static enum BodyEmptyType
quantifiers_memory_node_info(Node* node)
{
  int info = BODY_MAY_BE_EMPTY;
  int sub;

  switch (NODE_TYPE(node)) {
  case NODE_LIST:
  case NODE_ALT:
    for ( ; IS_NOT_NULL(node); node = NODE_CDR(node)) {
      sub = quantifiers_memory_node_info(NODE_CAR(node));
      if (sub > info) info = sub;
    }
    break;

#ifdef USE_CALL
  case NODE_CALL:
    if (NODE_IS_RECURSION(node))
      return BODY_MAY_BE_EMPTY_REC; /* tiny version */

    info = quantifiers_memory_node_info(NODE_BODY(node));
    break;
#endif

  case NODE_QUANT:
    /* a body that is never executed (upper == 0) contributes nothing */
    if (QUANT_(node)->upper != 0)
      info = quantifiers_memory_node_info(NODE_BODY(node));
    break;

  case NODE_BAG:
    {
      BagNode* bag = BAG_(node);

      switch (bag->type) {
      case BAG_MEMORY:
        if (NODE_IS_RECURSION(node))
          return BODY_MAY_BE_EMPTY_REC;
        return BODY_MAY_BE_EMPTY_MEM;

      case BAG_OPTION:
      case BAG_STOP_BACKTRACK:
        info = quantifiers_memory_node_info(NODE_BODY(node));
        break;

      case BAG_IF_ELSE:
        info = quantifiers_memory_node_info(NODE_BODY(node));
        if (IS_NOT_NULL(bag->te.Then)) {
          sub = quantifiers_memory_node_info(bag->te.Then);
          if (sub > info) info = sub;
        }
        if (IS_NOT_NULL(bag->te.Else)) {
          sub = quantifiers_memory_node_info(bag->te.Else);
          if (sub > info) info = sub;
        }
        break;
      }
    }
    break;

  /* leaf nodes and anchors keep the default */
  case NODE_BACKREF:
  case NODE_STRING:
  case NODE_CTYPE:
  case NODE_CCLASS:
  case NODE_ANCHOR:
  case NODE_GIMMICK:
  default:
    break;
  }

  return info;
}
#endif /* USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT */
5140 
5141 
5142 #ifdef USE_CALL
5143 
5144 #ifdef __GNUC__
5145 __inline
5146 #endif
5147 static int
check_call_reference(CallNode * cn,ParseEnv * env,int state)5148 check_call_reference(CallNode* cn, ParseEnv* env, int state)
5149 {
5150   MemEnv* mem_env = PARSEENV_MEMENV(env);
5151 
5152   if (cn->by_number != 0) {
5153     int gnum = cn->called_gnum;
5154 
5155     if (env->num_named > 0 &&
5156         IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5157         ! OPTON_CAPTURE_GROUP(env->options)) {
5158       return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
5159     }
5160 
5161     if (gnum > env->num_mem) {
5162       onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_GROUP_REFERENCE,
5163                                      cn->name, cn->name_end);
5164       return ONIGERR_UNDEFINED_GROUP_REFERENCE;
5165     }
5166 
5167   set_call_attr:
5168     NODE_CALL_BODY(cn) = mem_env[cn->called_gnum].mem_node;
5169     if (IS_NULL(NODE_CALL_BODY(cn))) {
5170       onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
5171                                      cn->name, cn->name_end);
5172       return ONIGERR_UNDEFINED_NAME_REFERENCE;
5173     }
5174 
5175     NODE_STATUS_ADD(NODE_CALL_BODY(cn), REFERENCED);
5176   }
5177   else {
5178     int *refs;
5179 
5180     int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs);
5181     if (n <= 0) {
5182       onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
5183                                      cn->name, cn->name_end);
5184       return ONIGERR_UNDEFINED_NAME_REFERENCE;
5185     }
5186     else if (n > 1) {
5187       onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL,
5188                                      cn->name, cn->name_end);
5189       return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
5190     }
5191     else {
5192       cn->called_gnum = refs[0];
5193       goto set_call_attr;
5194     }
5195   }
5196 
5197   return 0;
5198 }
5199 
5200 static void
tune_call2_call(Node * node)5201 tune_call2_call(Node* node)
5202 {
5203   switch (NODE_TYPE(node)) {
5204   case NODE_LIST:
5205   case NODE_ALT:
5206     do {
5207       tune_call2_call(NODE_CAR(node));
5208     } while (IS_NOT_NULL(node = NODE_CDR(node)));
5209     break;
5210 
5211   case NODE_QUANT:
5212     tune_call2_call(NODE_BODY(node));
5213     break;
5214 
5215   case NODE_ANCHOR:
5216     if (ANCHOR_HAS_BODY(ANCHOR_(node)))
5217       tune_call2_call(NODE_BODY(node));
5218     break;
5219 
5220   case NODE_BAG:
5221     {
5222       BagNode* en = BAG_(node);
5223 
5224       if (en->type == BAG_MEMORY) {
5225         if (! NODE_IS_MARK1(node)) {
5226           NODE_STATUS_ADD(node, MARK1);
5227           tune_call2_call(NODE_BODY(node));
5228           NODE_STATUS_REMOVE(node, MARK1);
5229         }
5230       }
5231       else if (en->type == BAG_IF_ELSE) {
5232         tune_call2_call(NODE_BODY(node));
5233         if (IS_NOT_NULL(en->te.Then))
5234           tune_call2_call(en->te.Then);
5235         if (IS_NOT_NULL(en->te.Else))
5236           tune_call2_call(en->te.Else);
5237       }
5238       else {
5239         tune_call2_call(NODE_BODY(node));
5240       }
5241     }
5242     break;
5243 
5244   case NODE_CALL:
5245     if (! NODE_IS_MARK1(node)) {
5246       NODE_STATUS_ADD(node, MARK1);
5247       {
5248         CallNode* cn = CALL_(node);
5249         Node* called = NODE_CALL_BODY(cn);
5250 
5251         cn->entry_count++;
5252 
5253         NODE_STATUS_ADD(called, CALLED);
5254         BAG_(called)->m.entry_count++;
5255         tune_call2_call(called);
5256       }
5257       NODE_STATUS_REMOVE(node, MARK1);
5258     }
5259     break;
5260 
5261   default:
5262     break;
5263   }
5264 }
5265 
5266 static int
tune_call(Node * node,ParseEnv * env,int state)5267 tune_call(Node* node, ParseEnv* env, int state)
5268 {
5269   int r;
5270 
5271   switch (NODE_TYPE(node)) {
5272   case NODE_LIST:
5273   case NODE_ALT:
5274     do {
5275       r = tune_call(NODE_CAR(node), env, state);
5276     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5277     break;
5278 
5279   case NODE_QUANT:
5280     if (QUANT_(node)->upper == 0)
5281       state |= IN_ZERO_REPEAT;
5282 
5283     r = tune_call(NODE_BODY(node), env, state);
5284     break;
5285 
5286   case NODE_ANCHOR:
5287     if (ANCHOR_HAS_BODY(ANCHOR_(node)))
5288       r = tune_call(NODE_BODY(node), env, state);
5289     else
5290       r = 0;
5291     break;
5292 
5293   case NODE_BAG:
5294     {
5295       BagNode* en = BAG_(node);
5296 
5297       if (en->type == BAG_MEMORY) {
5298         if ((state & IN_ZERO_REPEAT) != 0) {
5299           NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
5300           BAG_(node)->m.entry_count--;
5301         }
5302         r = tune_call(NODE_BODY(node), env, state);
5303       }
5304       else if (en->type == BAG_IF_ELSE) {
5305         r = tune_call(NODE_BODY(node), env, state);
5306         if (r != 0) return r;
5307         if (IS_NOT_NULL(en->te.Then)) {
5308           r = tune_call(en->te.Then, env, state);
5309           if (r != 0) return r;
5310         }
5311         if (IS_NOT_NULL(en->te.Else))
5312           r = tune_call(en->te.Else, env, state);
5313       }
5314       else
5315         r = tune_call(NODE_BODY(node), env, state);
5316     }
5317     break;
5318 
5319   case NODE_CALL:
5320     if ((state & IN_ZERO_REPEAT) != 0) {
5321       NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
5322       CALL_(node)->entry_count--;
5323     }
5324 
5325     r = check_call_reference(CALL_(node), env, state);
5326     break;
5327 
5328   default:
5329     r = 0;
5330     break;
5331   }
5332 
5333   return r;
5334 }
5335 
5336 static int
tune_call2(Node * node)5337 tune_call2(Node* node)
5338 {
5339   int r = 0;
5340 
5341   switch (NODE_TYPE(node)) {
5342   case NODE_LIST:
5343   case NODE_ALT:
5344     do {
5345       r = tune_call2(NODE_CAR(node));
5346     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5347     break;
5348 
5349   case NODE_QUANT:
5350     if (QUANT_(node)->upper != 0)
5351       r = tune_call2(NODE_BODY(node));
5352     break;
5353 
5354   case NODE_ANCHOR:
5355     if (ANCHOR_HAS_BODY(ANCHOR_(node)))
5356       r = tune_call2(NODE_BODY(node));
5357     break;
5358 
5359   case NODE_BAG:
5360     if (! NODE_IS_IN_ZERO_REPEAT(node))
5361       r = tune_call2(NODE_BODY(node));
5362 
5363     {
5364       BagNode* en = BAG_(node);
5365 
5366       if (r != 0) return r;
5367       if (en->type == BAG_IF_ELSE) {
5368         if (IS_NOT_NULL(en->te.Then)) {
5369           r = tune_call2(en->te.Then);
5370           if (r != 0) return r;
5371         }
5372         if (IS_NOT_NULL(en->te.Else))
5373           r = tune_call2(en->te.Else);
5374       }
5375     }
5376     break;
5377 
5378   case NODE_CALL:
5379     if (! NODE_IS_IN_ZERO_REPEAT(node)) {
5380       tune_call2_call(node);
5381     }
5382     break;
5383 
5384   default:
5385     break;
5386   }
5387 
5388   return r;
5389 }
5390 
5391 
5392 static void
tune_called_state_call(Node * node,int state)5393 tune_called_state_call(Node* node, int state)
5394 {
5395   switch (NODE_TYPE(node)) {
5396   case NODE_ALT:
5397     state |= IN_ALT;
5398     /* fall */
5399   case NODE_LIST:
5400     do {
5401       tune_called_state_call(NODE_CAR(node), state);
5402     } while (IS_NOT_NULL(node = NODE_CDR(node)));
5403     break;
5404 
5405   case NODE_QUANT:
5406     {
5407       QuantNode* qn = QUANT_(node);
5408 
5409       if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5410         state |= IN_REAL_REPEAT;
5411       if (qn->lower != qn->upper)
5412         state |= IN_VAR_REPEAT;
5413       if ((state & IN_PEEK) != 0)
5414         NODE_STATUS_ADD(node, INPEEK);
5415 
5416       tune_called_state_call(NODE_QUANT_BODY(qn), state);
5417     }
5418     break;
5419 
5420   case NODE_ANCHOR:
5421     {
5422       AnchorNode* an = ANCHOR_(node);
5423 
5424       switch (an->type) {
5425       case ANCR_PREC_READ_NOT:
5426       case ANCR_LOOK_BEHIND_NOT:
5427         state |= (IN_NOT | IN_PEEK);
5428         tune_called_state_call(NODE_ANCHOR_BODY(an), state);
5429         break;
5430       case ANCR_PREC_READ:
5431       case ANCR_LOOK_BEHIND:
5432         state |= IN_PEEK;
5433         tune_called_state_call(NODE_ANCHOR_BODY(an), state);
5434         break;
5435       default:
5436         break;
5437       }
5438     }
5439     break;
5440 
5441   case NODE_BAG:
5442     {
5443       BagNode* en = BAG_(node);
5444 
5445       if (en->type == BAG_MEMORY) {
5446         if (NODE_IS_MARK1(node)) {
5447           if ((~en->m.called_state & state) != 0) {
5448             en->m.called_state |= state;
5449             tune_called_state_call(NODE_BODY(node), state);
5450           }
5451         }
5452         else {
5453           NODE_STATUS_ADD(node, MARK1);
5454           en->m.called_state |= state;
5455           tune_called_state_call(NODE_BODY(node), state);
5456           NODE_STATUS_REMOVE(node, MARK1);
5457         }
5458       }
5459       else if (en->type == BAG_IF_ELSE) {
5460         state |= IN_ALT;
5461         tune_called_state_call(NODE_BODY(node), state);
5462         if (IS_NOT_NULL(en->te.Then)) {
5463           tune_called_state_call(en->te.Then, state);
5464         }
5465         if (IS_NOT_NULL(en->te.Else))
5466           tune_called_state_call(en->te.Else, state);
5467       }
5468       else {
5469         tune_called_state_call(NODE_BODY(node), state);
5470       }
5471     }
5472     break;
5473 
5474   case NODE_CALL:
5475     if ((state & IN_PEEK) != 0)
5476       NODE_STATUS_ADD(node, INPEEK);
5477     if ((state & IN_REAL_REPEAT) != 0)
5478       NODE_STATUS_ADD(node, IN_REAL_REPEAT);
5479 
5480     tune_called_state_call(NODE_BODY(node), state);
5481     break;
5482 
5483   default:
5484     break;
5485   }
5486 }
5487 
5488 static void
tune_called_state(Node * node,int state)5489 tune_called_state(Node* node, int state)
5490 {
5491   switch (NODE_TYPE(node)) {
5492   case NODE_ALT:
5493     state |= IN_ALT;
5494     /* fall */
5495   case NODE_LIST:
5496     do {
5497       tune_called_state(NODE_CAR(node), state);
5498     } while (IS_NOT_NULL(node = NODE_CDR(node)));
5499     break;
5500 
5501 #ifdef USE_CALL
5502   case NODE_CALL:
5503     if ((state & IN_PEEK) != 0)
5504       NODE_STATUS_ADD(node, INPEEK);
5505     if ((state & IN_REAL_REPEAT) != 0)
5506       NODE_STATUS_ADD(node, IN_REAL_REPEAT);
5507 
5508     tune_called_state_call(node, state);
5509     break;
5510 #endif
5511 
5512   case NODE_BAG:
5513     {
5514       BagNode* en = BAG_(node);
5515 
5516       switch (en->type) {
5517       case BAG_MEMORY:
5518         if (en->m.entry_count > 1)
5519           state |= IN_MULTI_ENTRY;
5520 
5521         en->m.called_state |= state;
5522         /* fall */
5523       case BAG_OPTION:
5524       case BAG_STOP_BACKTRACK:
5525         tune_called_state(NODE_BODY(node), state);
5526         break;
5527       case BAG_IF_ELSE:
5528         state |= IN_ALT;
5529         tune_called_state(NODE_BODY(node), state);
5530         if (IS_NOT_NULL(en->te.Then))
5531           tune_called_state(en->te.Then, state);
5532         if (IS_NOT_NULL(en->te.Else))
5533           tune_called_state(en->te.Else, state);
5534         break;
5535       }
5536     }
5537     break;
5538 
5539   case NODE_QUANT:
5540     {
5541       QuantNode* qn = QUANT_(node);
5542 
5543       if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5544         state |= IN_REAL_REPEAT;
5545       if (qn->lower != qn->upper)
5546         state |= IN_VAR_REPEAT;
5547       if ((state & IN_PEEK) != 0)
5548         NODE_STATUS_ADD(node, INPEEK);
5549 
5550       tune_called_state(NODE_QUANT_BODY(qn), state);
5551     }
5552     break;
5553 
5554   case NODE_ANCHOR:
5555     {
5556       AnchorNode* an = ANCHOR_(node);
5557 
5558       switch (an->type) {
5559       case ANCR_PREC_READ_NOT:
5560       case ANCR_LOOK_BEHIND_NOT:
5561         state |= (IN_NOT | IN_PEEK);
5562         tune_called_state(NODE_ANCHOR_BODY(an), state);
5563         break;
5564       case ANCR_PREC_READ:
5565       case ANCR_LOOK_BEHIND:
5566         state |= IN_PEEK;
5567         tune_called_state(NODE_ANCHOR_BODY(an), state);
5568         break;
5569       default:
5570         break;
5571       }
5572     }
5573     break;
5574 
5575   case NODE_BACKREF:
5576   case NODE_STRING:
5577   case NODE_CTYPE:
5578   case NODE_CCLASS:
5579   case NODE_GIMMICK:
5580   default:
5581     break;
5582   }
5583 }
5584 
5585 #endif  /* USE_CALL */
5586 
5587 
5588 #ifdef __GNUC__
5589 __inline
5590 #endif
5591 static int
tune_anchor(Node * node,regex_t * reg,int state,ParseEnv * env)5592 tune_anchor(Node* node, regex_t* reg, int state, ParseEnv* env)
5593 {
5594   int r;
5595   AnchorNode* an = ANCHOR_(node);
5596 
5597   switch (an->type) {
5598   case ANCR_PREC_READ:
5599     r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_PREC_READ), env);
5600     break;
5601   case ANCR_PREC_READ_NOT:
5602     r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_PREC_READ | IN_NOT),
5603                   env);
5604     break;
5605 
5606   case ANCR_LOOK_BEHIND:
5607   case ANCR_LOOK_BEHIND_NOT:
5608     r = tune_look_behind(node, reg, state, env);
5609     break;
5610 
5611   default:
5612     r = 0;
5613     break;
5614   }
5615 
5616   return r;
5617 }
5618 
5619 #ifdef __GNUC__
5620 __inline
5621 #endif
5622 static int
tune_quant(Node * node,regex_t * reg,int state,ParseEnv * env)5623 tune_quant(Node* node, regex_t* reg, int state, ParseEnv* env)
5624 {
5625   int r;
5626   QuantNode* qn = QUANT_(node);
5627   Node* body = NODE_BODY(node);
5628 
5629   if ((state & IN_REAL_REPEAT) != 0) {
5630     NODE_STATUS_ADD(node, IN_REAL_REPEAT);
5631   }
5632   if ((state & IN_MULTI_ENTRY) != 0) {
5633     NODE_STATUS_ADD(node, IN_MULTI_ENTRY);
5634   }
5635 
5636   if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) {
5637     OnigLen d = node_min_byte_len(body, env);
5638     if (d == 0) {
5639 #ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT
5640       qn->emptiness = quantifiers_memory_node_info(body);
5641 #else
5642       qn->emptiness = BODY_MAY_BE_EMPTY;
5643 #endif
5644     }
5645   }
5646 
5647   if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5648     state |= IN_REAL_REPEAT;
5649   if (qn->lower != qn->upper)
5650     state |= IN_VAR_REPEAT;
5651 
5652   r = tune_tree(body, reg, state, env);
5653   if (r != 0) return r;
5654 
5655   /* expand string */
5656 #define EXPAND_STRING_MAX_LENGTH  100
5657   if (NODE_TYPE(body) == NODE_STRING) {
5658     if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper &&
5659         qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
5660       int len = NODE_STRING_LEN(body);
5661 
5662       if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
5663         int i, n = qn->lower;
5664         node_conv_to_str_node(node, body);
5665         for (i = 0; i < n; i++) {
5666           r = node_str_node_cat(node, body);
5667           if (r != 0) return r;
5668         }
5669         onig_node_free(body);
5670         return r;
5671       }
5672     }
5673   }
5674 
5675   if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) {
5676     if (NODE_TYPE(body) == NODE_QUANT) {
5677       QuantNode* tqn = QUANT_(body);
5678       if (IS_NOT_NULL(tqn->head_exact)) {
5679         qn->head_exact  = tqn->head_exact;
5680         tqn->head_exact = NULL;
5681       }
5682     }
5683     else {
5684       qn->head_exact = get_tree_head_literal(NODE_BODY(node), 1, reg);
5685     }
5686   }
5687 
5688   return r;
5689 }
5690 
5691 /* tune_tree does the following work.
5692  1. check empty loop. (set qn->emptiness)
5693  2. expand ignore-case in char class.
5694  3. set memory status bit flags. (reg->mem_stats)
5695  4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
5696  5. find invalid patterns in look-behind.
5697  6. expand repeated string.
5698  */
5699 static int
tune_tree(Node * node,regex_t * reg,int state,ParseEnv * env)5700 tune_tree(Node* node, regex_t* reg, int state, ParseEnv* env)
5701 {
5702   int r = 0;
5703 
5704   switch (NODE_TYPE(node)) {
5705   case NODE_LIST:
5706     {
5707       Node* prev = NULL_NODE;
5708       do {
5709         r = tune_tree(NODE_CAR(node), reg, state, env);
5710         if (IS_NOT_NULL(prev) && r == 0) {
5711           r = tune_next(prev, NODE_CAR(node), reg);
5712         }
5713         prev = NODE_CAR(node);
5714       } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5715     }
5716     break;
5717 
5718   case NODE_ALT:
5719     do {
5720       r = tune_tree(NODE_CAR(node), reg, (state | IN_ALT), env);
5721     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5722     break;
5723 
5724   case NODE_STRING:
5725     if (NODE_IS_REAL_IGNORECASE(node)) {
5726       r = unravel_case_fold_string(node, reg, state);
5727     }
5728     break;
5729 
5730   case NODE_BACKREF:
5731     {
5732       int i;
5733       int* p;
5734       BackRefNode* br = BACKREF_(node);
5735       p = BACKREFS_P(br);
5736       for (i = 0; i < br->back_num; i++) {
5737         if (p[i] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
5738         MEM_STATUS_ON(env->backrefed_mem, p[i]);
5739 #if 0
5740 #ifdef USE_BACKREF_WITH_LEVEL
5741         if (NODE_IS_NEST_LEVEL(node)) {
5742           MEM_STATUS_ON(env->backtrack_mem, p[i]);
5743         }
5744 #endif
5745 #else
5746         /* More precisely, it should be checked whether alt/repeat exists before
5747            the subject capture node, and then this backreference position
5748            exists before (or in) the capture node. */
5749         MEM_STATUS_ON(env->backtrack_mem, p[i]);
5750 #endif
5751       }
5752     }
5753     break;
5754 
5755   case NODE_BAG:
5756     {
5757       BagNode* en = BAG_(node);
5758 
5759       switch (en->type) {
5760       case BAG_OPTION:
5761         {
5762           OnigOptionType options = reg->options;
5763           reg->options = BAG_(node)->o.options;
5764           r = tune_tree(NODE_BODY(node), reg, state, env);
5765           reg->options = options;
5766         }
5767         break;
5768 
5769       case BAG_MEMORY:
5770 #ifdef USE_CALL
5771         state |= en->m.called_state;
5772 #endif
5773 
5774         if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_MULTI_ENTRY)) != 0
5775             || NODE_IS_RECURSION(node)) {
5776           MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
5777         }
5778         r = tune_tree(NODE_BODY(node), reg, state, env);
5779         break;
5780 
5781       case BAG_STOP_BACKTRACK:
5782         {
5783           Node* target = NODE_BODY(node);
5784           r = tune_tree(target, reg, state, env);
5785           if (NODE_TYPE(target) == NODE_QUANT) {
5786             QuantNode* tqn = QUANT_(target);
5787             if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 &&
5788                 tqn->greedy != 0) {  /* (?>a*), a*+ etc... */
5789               if (is_strict_real_node(NODE_BODY(target)))
5790                 NODE_STATUS_ADD(node, STRICT_REAL_REPEAT);
5791             }
5792           }
5793         }
5794         break;
5795 
5796       case BAG_IF_ELSE:
5797         r = tune_tree(NODE_BODY(node), reg, (state | IN_ALT), env);
5798         if (r != 0) return r;
5799         if (IS_NOT_NULL(en->te.Then)) {
5800           r = tune_tree(en->te.Then, reg, (state | IN_ALT), env);
5801           if (r != 0) return r;
5802         }
5803         if (IS_NOT_NULL(en->te.Else))
5804           r = tune_tree(en->te.Else, reg, (state | IN_ALT), env);
5805         break;
5806       }
5807     }
5808     break;
5809 
5810   case NODE_QUANT:
5811     if ((state & (IN_PREC_READ | IN_LOOK_BEHIND)) != 0)
5812       NODE_STATUS_ADD(node, INPEEK);
5813 
5814     r = tune_quant(node, reg, state, env);
5815     break;
5816 
5817   case NODE_ANCHOR:
5818     r = tune_anchor(node, reg, state, env);
5819     break;
5820 
5821 #ifdef USE_CALL
5822   case NODE_CALL:
5823 #endif
5824   case NODE_CTYPE:
5825   case NODE_CCLASS:
5826   case NODE_GIMMICK:
5827   default:
5828     break;
5829   }
5830 
5831   return r;
5832 }
5833 
5834 #ifndef ONIG_DONT_OPTIMIZE
5835 static int
set_sunday_quick_search_or_bmh_skip_table(regex_t * reg,int case_expand,UChar * s,UChar * end,UChar skip[],int * roffset)5836 set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand,
5837                                           UChar* s, UChar* end,
5838                                           UChar skip[], int* roffset)
5839 {
5840   int i, j, k, len, offset;
5841   int n, clen;
5842   UChar* p;
5843   OnigEncoding enc;
5844   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
5845   UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
5846 
5847   enc = reg->enc;
5848   offset = ENC_GET_SKIP_OFFSET(enc);
5849   if (offset == ENC_SKIP_OFFSET_1_OR_0) {
5850     UChar* p = s;
5851     while (1) {
5852       len = enclen(enc, p);
5853       if (p + len >= end) {
5854         if (len == 1) offset = 1;
5855         else          offset = 0;
5856         break;
5857       }
5858       p += len;
5859     }
5860   }
5861 
5862   len = (int )(end - s);
5863   if (len + offset >= UCHAR_MAX)
5864     return ONIGERR_PARSER_BUG;
5865 
5866   *roffset = offset;
5867 
5868   for (i = 0; i < CHAR_MAP_SIZE; i++) {
5869     skip[i] = (UChar )(len + offset);
5870   }
5871 
5872   for (p = s; p < end; ) {
5873     int z;
5874 
5875     clen = enclen(enc, p);
5876     if (p + clen > end) clen = (int )(end - p);
5877 
5878     len = (int )(end - p);
5879     for (j = 0; j < clen; j++) {
5880       z = len - j + (offset - 1);
5881       if (z <= 0) break;
5882       skip[p[j]] = z;
5883     }
5884 
5885     if (case_expand != 0) {
5886       n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
5887                                              p, end, items);
5888       for (k = 0; k < n; k++) {
5889         ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf);
5890         for (j = 0; j < clen; j++) {
5891           z = len - j + (offset - 1);
5892           if (z <= 0) break;
5893           if (skip[buf[j]] > z)
5894             skip[buf[j]] = z;
5895         }
5896       }
5897     }
5898 
5899     p += clen;
5900   }
5901 
5902   return 0;
5903 }
5904 #endif
5905 
5906 
/* Capacity in bytes of the OptStr.s[] buffer; must stay below UCHAR_MAX. */
#define OPT_EXACT_MAXLEN   24

#if OPT_EXACT_MAXLEN >= UCHAR_MAX
#error Too big OPT_EXACT_MAXLEN
#endif
5912 
/* Environment passed down through optimize_nodes(). */
typedef struct {
  MinMaxLen        mm;  /* min/max length accumulated before the current node
                           (used as the position of the node's opt info) */
  OnigEncoding     enc;
  OnigCaseFoldType case_fold_flag;
  ParseEnv*        scan_env;
} OptEnv;
5919 
/* Anchor bit sets (ANCR_xxx values OR-ed together).
   add_opt_anc_info() stores an anchor in `left` when is_left() is true for
   it and in `right` otherwise. */
typedef struct {
  int left;
  int right;
} OptAnc;
5924 
/* Exact (literal) byte string candidate for search optimization. */
typedef struct {
  MinMaxLen  mm;   /* position */
  OptAnc     anc;
  int        reach_end;  /* non-zero while s[] holds the source text up to
                            its end (nothing was cut off) */
  int        len;        /* number of bytes used in s[] */
  UChar      s[OPT_EXACT_MAXLEN];
} OptStr;
5932 
/* Byte map candidate for search optimization.
   NOTE: clear_opt_map() initializes this struct positionally, so the field
   order must not change. */
typedef struct {
  MinMaxLen mm;     /* position */
  OptAnc    anc;
  int       value;  /* weighted value */
  UChar     map[CHAR_MAP_SIZE];  /* map[c] != 0: byte c is in the set */
} OptMap;
5939 
/* Optimization info collected for one node by optimize_nodes(). */
typedef struct {
  MinMaxLen len;  /* min/max length of the node itself */
  OptAnc  anc;
  OptStr  sb;     /* boundary */
  OptStr  sm;     /* middle */
  OptStr  spr;    /* prec read (?=...) */
  OptMap  map;    /* boundary */
} OptNode;
5948 
5949 
5950 #ifndef ONIG_DONT_OPTIMIZE
5951 
5952 static int
map_position_value(OnigEncoding enc,int i)5953 map_position_value(OnigEncoding enc, int i)
5954 {
5955   static const short int Vals[] = {
5956      5,  1,  1,  1,  1,  1,  1,  1,  1, 10, 10,  1,  1, 10,  1,  1,
5957      1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
5958     12,  4,  7,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,
5959      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,
5960      5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
5961      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  6,  5,  5,  5,
5962      5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
5963      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  1
5964   };
5965 
5966   if (i < (int )(sizeof(Vals)/sizeof(Vals[0]))) {
5967     if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
5968       return 20;
5969     else
5970       return (int )Vals[i];
5971   }
5972   else
5973     return 4;   /* Take it easy. */
5974 }
5975 
5976 static int
distance_value(MinMaxLen * mm)5977 distance_value(MinMaxLen* mm)
5978 {
5979   /* 1000 / (min-max-dist + 1) */
5980   static const short int dist_vals[] = {
5981     1000,  500,  333,  250,  200,  167,  143,  125,  111,  100,
5982       91,   83,   77,   71,   67,   63,   59,   56,   53,   50,
5983       48,   45,   43,   42,   40,   38,   37,   36,   34,   33,
5984       32,   31,   30,   29,   29,   28,   27,   26,   26,   25,
5985       24,   24,   23,   23,   22,   22,   21,   21,   20,   20,
5986       20,   19,   19,   19,   18,   18,   18,   17,   17,   17,
5987       16,   16,   16,   16,   15,   15,   15,   15,   14,   14,
5988       14,   14,   14,   14,   13,   13,   13,   13,   13,   13,
5989       12,   12,   12,   12,   12,   12,   11,   11,   11,   11,
5990       11,   11,   11,   11,   11,   10,   10,   10,   10,   10
5991   };
5992 
5993   OnigLen d;
5994 
5995   if (mm->max == INFINITE_LEN) return 0;
5996 
5997   d = mm->max - mm->min;
5998   if (d < (OnigLen )(sizeof(dist_vals)/sizeof(dist_vals[0])))
5999     /* return dist_vals[d] * 16 / (mm->min + 12); */
6000     return (int )dist_vals[d];
6001   else
6002     return 1;
6003 }
6004 
6005 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)6006 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
6007 {
6008   if (v2 <= 0) return -1;
6009   if (v1 <= 0) return  1;
6010 
6011   v1 *= distance_value(d1);
6012   v2 *= distance_value(d2);
6013 
6014   if (v2 > v1) return  1;
6015   if (v2 < v1) return -1;
6016 
6017   if (d2->min < d1->min) return  1;
6018   if (d2->min > d1->min) return -1;
6019   return 0;
6020 }
6021 
/* Copy the whole OptEnv `from` into `to` (plain struct assignment). */
static void
copy_opt_env(OptEnv* to, OptEnv* from)
{
  *to = *from;
}
6027 
6028 static void
clear_opt_anc_info(OptAnc * a)6029 clear_opt_anc_info(OptAnc* a)
6030 {
6031   a->left  = 0;
6032   a->right = 0;
6033 }
6034 
/* Copy both anchor bit sets from `from` into `to` (plain struct assignment). */
static void
copy_opt_anc_info(OptAnc* to, OptAnc* from)
{
  *to = *from;
}
6040 
6041 static void
concat_opt_anc_info(OptAnc * to,OptAnc * left,OptAnc * right,OnigLen left_len,OnigLen right_len)6042 concat_opt_anc_info(OptAnc* to, OptAnc* left, OptAnc* right,
6043                     OnigLen left_len, OnigLen right_len)
6044 {
6045   clear_opt_anc_info(to);
6046 
6047   to->left = left->left;
6048   if (left_len == 0) {
6049     to->left |= right->left;
6050   }
6051 
6052   to->right = right->right;
6053   if (right_len == 0) {
6054     to->right |= left->right;
6055   }
6056   else {
6057     to->right |= (left->right & ANCR_PREC_READ_NOT);
6058   }
6059 }
6060 
6061 static int
is_left(int a)6062 is_left(int a)
6063 {
6064   if (a == ANCR_END_BUF  || a == ANCR_SEMI_END_BUF ||
6065       a == ANCR_END_LINE || a == ANCR_PREC_READ || a == ANCR_PREC_READ_NOT)
6066     return 0;
6067 
6068   return 1;
6069 }
6070 
6071 static int
is_set_opt_anc_info(OptAnc * to,int anc)6072 is_set_opt_anc_info(OptAnc* to, int anc)
6073 {
6074   if ((to->left & anc) != 0) return 1;
6075 
6076   return ((to->right & anc) != 0 ? 1 : 0);
6077 }
6078 
6079 static void
add_opt_anc_info(OptAnc * to,int anc)6080 add_opt_anc_info(OptAnc* to, int anc)
6081 {
6082   if (is_left(anc))
6083     to->left |= anc;
6084   else
6085     to->right |= anc;
6086 }
6087 
6088 static void
remove_opt_anc_info(OptAnc * to,int anc)6089 remove_opt_anc_info(OptAnc* to, int anc)
6090 {
6091   if (is_left(anc))
6092     to->left &= ~anc;
6093   else
6094     to->right &= ~anc;
6095 }
6096 
6097 static void
alt_merge_opt_anc_info(OptAnc * to,OptAnc * add)6098 alt_merge_opt_anc_info(OptAnc* to, OptAnc* add)
6099 {
6100   to->left  &= add->left;
6101   to->right &= add->right;
6102 }
6103 
6104 static int
is_full_opt_exact(OptStr * e)6105 is_full_opt_exact(OptStr* e)
6106 {
6107   return e->len >= OPT_EXACT_MAXLEN;
6108 }
6109 
6110 static void
clear_opt_exact(OptStr * e)6111 clear_opt_exact(OptStr* e)
6112 {
6113   mml_clear(&e->mm);
6114   clear_opt_anc_info(&e->anc);
6115   e->reach_end = 0;
6116   e->len       = 0;
6117   e->s[0]      = '\0';
6118 }
6119 
/* Copy the whole exact-string info `from` into `to` (plain struct
   assignment, including the byte buffer). */
static void
copy_opt_exact(OptStr* to, OptStr* from)
{
  *to = *from;
}
6125 
6126 static int
concat_opt_exact(OptStr * to,OptStr * add,OnigEncoding enc)6127 concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc)
6128 {
6129   int i, j, len, r;
6130   UChar *p, *end;
6131   OptAnc tanc;
6132 
6133   r = 0;
6134   p = add->s;
6135   end = p + add->len;
6136   for (i = to->len; p < end; ) {
6137     len = enclen(enc, p);
6138     if (i + len > OPT_EXACT_MAXLEN) {
6139       r = 1; /* 1:full */
6140       break;
6141     }
6142     for (j = 0; j < len && p < end; j++) {
6143       /* coverity[overrun-local] */
6144       to->s[i++] = *p++;
6145     }
6146   }
6147 
6148   to->len = i;
6149   to->reach_end = (p == end ? add->reach_end : 0);
6150 
6151   concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
6152   if (! to->reach_end) tanc.right = 0;
6153   copy_opt_anc_info(&to->anc, &tanc);
6154 
6155   return r;
6156 }
6157 
6158 static void
concat_opt_exact_str(OptStr * to,UChar * s,UChar * end,OnigEncoding enc)6159 concat_opt_exact_str(OptStr* to, UChar* s, UChar* end, OnigEncoding enc)
6160 {
6161   int i, j, len;
6162   UChar *p;
6163 
6164   for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
6165     len = enclen(enc, p);
6166     if (i + len > OPT_EXACT_MAXLEN) break;
6167     for (j = 0; j < len && p < end; j++) {
6168       /* coverity[overrun-local] */
6169       to->s[i++] = *p++;
6170     }
6171   }
6172 
6173   to->len = i;
6174 
6175   if (p >= end)
6176     to->reach_end = 1;
6177 }
6178 
6179 static void
alt_merge_opt_exact(OptStr * to,OptStr * add,OptEnv * env)6180 alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env)
6181 {
6182   int i, j, len;
6183 
6184   if (add->len == 0 || to->len == 0) {
6185     clear_opt_exact(to);
6186     return ;
6187   }
6188 
6189   if (! mml_is_equal(&to->mm, &add->mm)) {
6190     clear_opt_exact(to);
6191     return ;
6192   }
6193 
6194   for (i = 0; i < to->len && i < add->len; ) {
6195     if (to->s[i] != add->s[i]) break;
6196     len = enclen(env->enc, to->s + i);
6197 
6198     for (j = 1; j < len; j++) {
6199       if (to->s[i+j] != add->s[i+j]) break;
6200     }
6201     if (j < len) break;
6202     i += len;
6203   }
6204 
6205   if (! add->reach_end || i < add->len || i < to->len) {
6206     to->reach_end = 0;
6207   }
6208   to->len = i;
6209 
6210   alt_merge_opt_anc_info(&to->anc, &add->anc);
6211   if (! to->reach_end) to->anc.right = 0;
6212 }
6213 
6214 static void
select_opt_exact(OnigEncoding enc,OptStr * now,OptStr * alt)6215 select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt)
6216 {
6217   int vn, va;
6218 
6219   vn = now->len;
6220   va = alt->len;
6221 
6222   if (va == 0) {
6223     return ;
6224   }
6225   else if (vn == 0) {
6226     copy_opt_exact(now, alt);
6227     return ;
6228   }
6229   else if (vn <= 2 && va <= 2) {
6230     /* ByteValTable[x] is big value --> low price */
6231     va = map_position_value(enc, now->s[0]);
6232     vn = map_position_value(enc, alt->s[0]);
6233 
6234     if (now->len > 1) vn += 5;
6235     if (alt->len > 1) va += 5;
6236   }
6237 
6238   vn *= 2;
6239   va *= 2;
6240 
6241   if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0)
6242     copy_opt_exact(now, alt);
6243 }
6244 
6245 static void
clear_opt_map(OptMap * map)6246 clear_opt_map(OptMap* map)
6247 {
6248   static const OptMap clean_info = {
6249     {0, 0}, {0, 0}, 0,
6250     {
6251       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6252       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6253       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6254       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6255       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6256       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6257       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6258       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6259       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6260       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6261       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6262       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6263       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6264       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6265       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6266       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6267     }
6268   };
6269 
6270   xmemcpy(map, &clean_info, sizeof(OptMap));
6271 }
6272 
/* Copy the whole byte-map info `from` into `to` (plain struct assignment,
   including the map array). */
static void
copy_opt_map(OptMap* to, OptMap* from)
{
  *to = *from;
}
6278 
6279 static void
add_char_opt_map(OptMap * m,UChar c,OnigEncoding enc)6280 add_char_opt_map(OptMap* m, UChar c, OnigEncoding enc)
6281 {
6282   if (m->map[c] == 0) {
6283     m->map[c] = 1;
6284     m->value += map_position_value(enc, c);
6285   }
6286 }
6287 
6288 static void
select_opt_map(OptMap * now,OptMap * alt)6289 select_opt_map(OptMap* now, OptMap* alt)
6290 {
6291   static int z = 1<<15; /* 32768: something big value */
6292 
6293   int vn, va;
6294 
6295   if (alt->value == 0) return ;
6296   if (now->value == 0) {
6297     copy_opt_map(now, alt);
6298     return ;
6299   }
6300 
6301   vn = z / now->value;
6302   va = z / alt->value;
6303   if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0)
6304     copy_opt_map(now, alt);
6305 }
6306 
6307 static int
comp_opt_exact_or_map(OptStr * e,OptMap * m)6308 comp_opt_exact_or_map(OptStr* e, OptMap* m)
6309 {
6310 #define COMP_EM_BASE  20
6311   int ae, am;
6312   int case_value;
6313 
6314   if (m->value <= 0) return -1;
6315 
6316   case_value = 3;
6317   ae = COMP_EM_BASE * e->len * case_value;
6318   am = COMP_EM_BASE * 5 * 2 / m->value;
6319   return comp_distance_value(&e->mm, &m->mm, ae, am);
6320 }
6321 
6322 static void
alt_merge_opt_map(OnigEncoding enc,OptMap * to,OptMap * add)6323 alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add)
6324 {
6325   int i, val;
6326 
6327   /* if (! mml_is_equal(&to->mm, &add->mm)) return ; */
6328   if (to->value == 0) return ;
6329   if (add->value == 0 || to->mm.max < add->mm.min) {
6330     clear_opt_map(to);
6331     return ;
6332   }
6333 
6334   mml_alt_merge(&to->mm, &add->mm);
6335 
6336   val = 0;
6337   for (i = 0; i < CHAR_MAP_SIZE; i++) {
6338     if (add->map[i])
6339       to->map[i] = 1;
6340 
6341     if (to->map[i])
6342       val += map_position_value(enc, i);
6343   }
6344   to->value = val;
6345 
6346   alt_merge_opt_anc_info(&to->anc, &add->anc);
6347 }
6348 
6349 static void
set_bound_node_opt_info(OptNode * opt,MinMaxLen * plen)6350 set_bound_node_opt_info(OptNode* opt, MinMaxLen* plen)
6351 {
6352   mml_copy(&(opt->sb.mm),  plen);
6353   mml_copy(&(opt->spr.mm), plen);
6354   mml_copy(&(opt->map.mm), plen);
6355 }
6356 
6357 static void
clear_node_opt_info(OptNode * opt)6358 clear_node_opt_info(OptNode* opt)
6359 {
6360   mml_clear(&opt->len);
6361   clear_opt_anc_info(&opt->anc);
6362   clear_opt_exact(&opt->sb);
6363   clear_opt_exact(&opt->sm);
6364   clear_opt_exact(&opt->spr);
6365   clear_opt_map(&opt->map);
6366 }
6367 
/* Copy the whole node optimization info `from` into `to` (plain struct
   assignment). */
static void
copy_node_opt_info(OptNode* to, OptNode* from)
{
  *to = *from;
}
6373 
6374 static void
concat_left_node_opt_info(OnigEncoding enc,OptNode * to,OptNode * add)6375 concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add)
6376 {
6377   int sb_reach, sm_reach;
6378   OptAnc tanc;
6379 
6380   concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
6381   copy_opt_anc_info(&to->anc, &tanc);
6382 
6383   if (add->sb.len > 0 && to->len.max == 0) {
6384     concat_opt_anc_info(&tanc, &to->anc, &add->sb.anc, to->len.max, add->len.max);
6385     copy_opt_anc_info(&add->sb.anc, &tanc);
6386   }
6387 
6388   if (add->map.value > 0 && to->len.max == 0) {
6389     if (add->map.mm.max == 0)
6390       add->map.anc.left |= to->anc.left;
6391   }
6392 
6393   sb_reach = to->sb.reach_end;
6394   sm_reach = to->sm.reach_end;
6395 
6396   if (add->len.max != 0)
6397     to->sb.reach_end = to->sm.reach_end = 0;
6398 
6399   if (add->sb.len > 0) {
6400     if (sb_reach) {
6401       concat_opt_exact(&to->sb, &add->sb, enc);
6402       clear_opt_exact(&add->sb);
6403     }
6404     else if (sm_reach) {
6405       concat_opt_exact(&to->sm, &add->sb, enc);
6406       clear_opt_exact(&add->sb);
6407     }
6408   }
6409   select_opt_exact(enc, &to->sm, &add->sb);
6410   select_opt_exact(enc, &to->sm, &add->sm);
6411 
6412   if (to->spr.len > 0) {
6413     if (add->len.max > 0) {
6414       if (to->spr.mm.max == 0)
6415         select_opt_exact(enc, &to->sb, &to->spr);
6416       else
6417         select_opt_exact(enc, &to->sm, &to->spr);
6418     }
6419   }
6420   else if (add->spr.len > 0) {
6421     copy_opt_exact(&to->spr, &add->spr);
6422   }
6423 
6424   select_opt_map(&to->map, &add->map);
6425   mml_add(&to->len, &add->len);
6426 }
6427 
6428 static void
alt_merge_node_opt_info(OptNode * to,OptNode * add,OptEnv * env)6429 alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env)
6430 {
6431   alt_merge_opt_anc_info(&to->anc, &add->anc);
6432   alt_merge_opt_exact(&to->sb,  &add->sb, env);
6433   alt_merge_opt_exact(&to->sm,  &add->sm, env);
6434   alt_merge_opt_exact(&to->spr, &add->spr, env);
6435   alt_merge_opt_map(env->enc, &to->map, &add->map);
6436 
6437   mml_alt_merge(&to->len, &add->len);
6438 }
6439 
6440 static OnigLen
node_max_byte_len(Node * node,ParseEnv * env)6441 node_max_byte_len(Node* node, ParseEnv* env)
6442 {
6443   OnigLen len;
6444   OnigLen tmax;
6445 
6446   len = 0;
6447   switch (NODE_TYPE(node)) {
6448   case NODE_LIST:
6449     do {
6450       tmax = node_max_byte_len(NODE_CAR(node), env);
6451       len = distance_add(len, tmax);
6452     } while (IS_NOT_NULL(node = NODE_CDR(node)));
6453     break;
6454 
6455   case NODE_ALT:
6456     do {
6457       tmax = node_max_byte_len(NODE_CAR(node), env);
6458       if (len < tmax) len = tmax;
6459     } while (IS_NOT_NULL(node = NODE_CDR(node)));
6460     break;
6461 
6462   case NODE_STRING:
6463     {
6464       StrNode* sn = STR_(node);
6465       len = (OnigLen )(sn->end - sn->s);
6466     }
6467     break;
6468 
6469   case NODE_CTYPE:
6470   case NODE_CCLASS:
6471     len = ONIGENC_MBC_MAXLEN_DIST(env->enc);
6472     break;
6473 
6474   case NODE_BACKREF:
6475     if (! NODE_IS_CHECKER(node)) {
6476       int i;
6477       int* backs;
6478       MemEnv* mem_env = PARSEENV_MEMENV(env);
6479       BackRefNode* br = BACKREF_(node);
6480       if (NODE_IS_RECURSION(node)) {
6481 #ifdef USE_BACKREF_WITH_LEVEL
6482         if (NODE_IS_NEST_LEVEL(node)) {
6483           len = INFINITE_LEN;
6484         }
6485 #endif
6486         break;
6487       }
6488       backs = BACKREFS_P(br);
6489       for (i = 0; i < br->back_num; i++) {
6490         tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env);
6491         if (len < tmax) len = tmax;
6492       }
6493     }
6494     break;
6495 
6496 #ifdef USE_CALL
6497   case NODE_CALL:
6498     if (! NODE_IS_RECURSION(node))
6499       len = node_max_byte_len(NODE_BODY(node), env);
6500     else
6501       len = INFINITE_LEN;
6502     break;
6503 #endif
6504 
6505   case NODE_QUANT:
6506     {
6507       QuantNode* qn = QUANT_(node);
6508 
6509       if (qn->upper != 0) {
6510         len = node_max_byte_len(NODE_BODY(node), env);
6511         if (len != 0) {
6512           if (! IS_INFINITE_REPEAT(qn->upper))
6513             len = distance_multiply(len, qn->upper);
6514           else
6515             len = INFINITE_LEN;
6516         }
6517       }
6518     }
6519     break;
6520 
6521   case NODE_BAG:
6522     {
6523       BagNode* en = BAG_(node);
6524       switch (en->type) {
6525       case BAG_MEMORY:
6526         if (NODE_IS_FIXED_MAX(node))
6527           len = en->max_len;
6528         else {
6529           if (NODE_IS_MARK1(node))
6530             len = INFINITE_LEN;
6531           else {
6532             NODE_STATUS_ADD(node, MARK1);
6533             len = node_max_byte_len(NODE_BODY(node), env);
6534             NODE_STATUS_REMOVE(node, MARK1);
6535 
6536             en->max_len = len;
6537             NODE_STATUS_ADD(node, FIXED_MAX);
6538           }
6539         }
6540         break;
6541 
6542       case BAG_OPTION:
6543       case BAG_STOP_BACKTRACK:
6544         len = node_max_byte_len(NODE_BODY(node), env);
6545         break;
6546       case BAG_IF_ELSE:
6547         {
6548           OnigLen tlen, elen;
6549 
6550           len = node_max_byte_len(NODE_BODY(node), env);
6551           if (IS_NOT_NULL(en->te.Then)) {
6552             tlen = node_max_byte_len(en->te.Then, env);
6553             len = distance_add(len, tlen);
6554           }
6555           if (IS_NOT_NULL(en->te.Else))
6556             elen = node_max_byte_len(en->te.Else, env);
6557           else elen = 0;
6558 
6559           if (elen > len) len = elen;
6560         }
6561         break;
6562       }
6563     }
6564     break;
6565 
6566   case NODE_ANCHOR:
6567   case NODE_GIMMICK:
6568   default:
6569     break;
6570   }
6571 
6572   return len;
6573 }
6574 
6575 #define MAX_NODE_OPT_INFO_REF_COUNT    5
6576 
6577 static int
optimize_nodes(Node * node,OptNode * opt,OptEnv * env)6578 optimize_nodes(Node* node, OptNode* opt, OptEnv* env)
6579 {
6580   int i;
6581   int r;
6582   OptNode xo;
6583   OnigEncoding enc;
6584 
6585   r = 0;
6586   enc = env->enc;
6587   clear_node_opt_info(opt);
6588   set_bound_node_opt_info(opt, &env->mm);
6589 
6590   switch (NODE_TYPE(node)) {
6591   case NODE_LIST:
6592     {
6593       OptEnv nenv;
6594       Node* nd = node;
6595 
6596       copy_opt_env(&nenv, env);
6597       do {
6598         r = optimize_nodes(NODE_CAR(nd), &xo, &nenv);
6599         if (r == 0) {
6600           mml_add(&nenv.mm, &xo.len);
6601           concat_left_node_opt_info(enc, opt, &xo);
6602         }
6603       } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd)));
6604     }
6605     break;
6606 
6607   case NODE_ALT:
6608     {
6609       Node* nd = node;
6610 
6611       do {
6612         r = optimize_nodes(NODE_CAR(nd), &xo, env);
6613         if (r == 0) {
6614           if (nd == node) copy_node_opt_info(opt, &xo);
6615           else            alt_merge_node_opt_info(opt, &xo, env);
6616         }
6617       } while ((r == 0) && IS_NOT_NULL(nd = NODE_CDR(nd)));
6618     }
6619     break;
6620 
6621   case NODE_STRING:
6622     {
6623       StrNode* sn = STR_(node);
6624       int slen = (int )(sn->end - sn->s);
6625 
6626       concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc);
6627       if (slen > 0) {
6628         add_char_opt_map(&opt->map, *(sn->s), enc);
6629       }
6630       mml_set_min_max(&opt->len, slen, slen);
6631     }
6632     break;
6633 
6634   case NODE_CCLASS:
6635     {
6636       int z;
6637       CClassNode* cc = CCLASS_(node);
6638 
6639       /* no need to check ignore case. (set in tune_tree()) */
6640 
6641       if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
6642         OnigLen min = ONIGENC_MBC_MINLEN(enc);
6643         OnigLen max = ONIGENC_MBC_MAXLEN_DIST(enc);
6644 
6645         mml_set_min_max(&opt->len, min, max);
6646       }
6647       else {
6648         for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
6649           z = BITSET_AT(cc->bs, i);
6650           if ((z && ! IS_NCCLASS_NOT(cc)) || (! z && IS_NCCLASS_NOT(cc))) {
6651             add_char_opt_map(&opt->map, (UChar )i, enc);
6652           }
6653         }
6654         mml_set_min_max(&opt->len, 1, 1);
6655       }
6656     }
6657     break;
6658 
6659   case NODE_CTYPE:
6660     {
6661       int min, max;
6662       int range;
6663 
6664       max = ONIGENC_MBC_MAXLEN_DIST(enc);
6665 
6666       if (max == 1) {
6667         min = 1;
6668 
6669         switch (CTYPE_(node)->ctype) {
6670         case CTYPE_ANYCHAR:
6671           break;
6672 
6673         case ONIGENC_CTYPE_WORD:
6674           range = CTYPE_(node)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
6675           if (CTYPE_(node)->not != 0) {
6676             for (i = 0; i < range; i++) {
6677               if (! ONIGENC_IS_CODE_WORD(enc, i)) {
6678                 add_char_opt_map(&opt->map, (UChar )i, enc);
6679               }
6680             }
6681             for (i = range; i < SINGLE_BYTE_SIZE; i++) {
6682               add_char_opt_map(&opt->map, (UChar )i, enc);
6683             }
6684           }
6685           else {
6686             for (i = 0; i < range; i++) {
6687               if (ONIGENC_IS_CODE_WORD(enc, i)) {
6688                 add_char_opt_map(&opt->map, (UChar )i, enc);
6689               }
6690             }
6691           }
6692           break;
6693         }
6694       }
6695       else {
6696         min = ONIGENC_MBC_MINLEN(enc);
6697       }
6698       mml_set_min_max(&opt->len, min, max);
6699     }
6700     break;
6701 
6702   case NODE_ANCHOR:
6703     switch (ANCHOR_(node)->type) {
6704     case ANCR_BEGIN_BUF:
6705     case ANCR_BEGIN_POSITION:
6706     case ANCR_BEGIN_LINE:
6707     case ANCR_END_BUF:
6708     case ANCR_SEMI_END_BUF:
6709     case ANCR_END_LINE:
6710     case ANCR_PREC_READ_NOT:
6711     case ANCR_LOOK_BEHIND:
6712       add_opt_anc_info(&opt->anc, ANCHOR_(node)->type);
6713       break;
6714 
6715     case ANCR_PREC_READ:
6716       {
6717         r = optimize_nodes(NODE_BODY(node), &xo, env);
6718         if (r == 0) {
6719           if (xo.sb.len > 0)
6720             copy_opt_exact(&opt->spr, &xo.sb);
6721           else if (xo.sm.len > 0)
6722             copy_opt_exact(&opt->spr, &xo.sm);
6723 
6724           opt->spr.reach_end = 0;
6725 
6726           if (xo.map.value > 0)
6727             copy_opt_map(&opt->map, &xo.map);
6728         }
6729       }
6730       break;
6731 
6732     case ANCR_LOOK_BEHIND_NOT:
6733       break;
6734     }
6735     break;
6736 
6737   case NODE_BACKREF:
6738     if (! NODE_IS_CHECKER(node)) {
6739       OnigLen min, max;
6740 
6741       min = node_min_byte_len(node, env->scan_env);
6742       max = node_max_byte_len(node, env->scan_env);
6743       mml_set_min_max(&opt->len, min, max);
6744     }
6745     break;
6746 
6747 #ifdef USE_CALL
6748   case NODE_CALL:
6749     if (NODE_IS_RECURSION(node))
6750       mml_set_min_max(&opt->len, 0, INFINITE_LEN);
6751     else {
6752       r = optimize_nodes(NODE_BODY(node), opt, env);
6753     }
6754     break;
6755 #endif
6756 
6757   case NODE_QUANT:
6758     {
6759       OnigLen min, max;
6760       QuantNode* qn = QUANT_(node);
6761 
6762       /* Issue #175
6763          ex. /\g<1>{0}(?<=|())/
6764 
6765          Empty and unused nodes in look-behind is removed in
6766          tune_look_behind().
6767          Called group nodes are assigned to be not called if the caller side is
6768          inside of zero-repetition.
6769          As a result, the nodes are considered unused.
6770        */
6771       if (qn->upper == 0) {
6772         mml_set_min_max(&opt->len, 0, 0);
6773         break;
6774       }
6775 
6776       r = optimize_nodes(NODE_BODY(node), &xo, env);
6777       if (r != 0) break;
6778 
6779       if (qn->lower > 0) {
6780         copy_node_opt_info(opt, &xo);
6781         if (xo.sb.len > 0) {
6782           if (xo.sb.reach_end) {
6783             for (i = 2; i <= qn->lower && ! is_full_opt_exact(&opt->sb); i++) {
6784               int rc = concat_opt_exact(&opt->sb, &xo.sb, enc);
6785               if (rc > 0) break;
6786             }
6787             if (i < qn->lower) opt->sb.reach_end = 0;
6788           }
6789         }
6790 
6791         if (qn->lower != qn->upper) {
6792           opt->sb.reach_end = 0;
6793           opt->sm.reach_end = 0;
6794         }
6795         if (qn->lower > 1)
6796           opt->sm.reach_end = 0;
6797       }
6798 
6799       if (IS_INFINITE_REPEAT(qn->upper)) {
6800         if (env->mm.max == 0 &&
6801             NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) {
6802           if (NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)))
6803             add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML);
6804           else
6805             add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF);
6806         }
6807 
6808         max = (xo.len.max > 0 ? INFINITE_LEN : 0);
6809       }
6810       else {
6811         max = distance_multiply(xo.len.max, qn->upper);
6812       }
6813 
6814       min = distance_multiply(xo.len.min, qn->lower);
6815       mml_set_min_max(&opt->len, min, max);
6816     }
6817     break;
6818 
6819   case NODE_BAG:
6820     {
6821       BagNode* en = BAG_(node);
6822 
6823       switch (en->type) {
6824       case BAG_STOP_BACKTRACK:
6825       case BAG_OPTION:
6826         r = optimize_nodes(NODE_BODY(node), opt, env);
6827         break;
6828 
6829       case BAG_MEMORY:
6830 #ifdef USE_CALL
6831         en->opt_count++;
6832         if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
6833           OnigLen min, max;
6834 
6835           min = 0;
6836           max = INFINITE_LEN;
6837           if (NODE_IS_FIXED_MIN(node)) min = en->min_len;
6838           if (NODE_IS_FIXED_MAX(node)) max = en->max_len;
6839           mml_set_min_max(&opt->len, min, max);
6840         }
6841         else
6842 #endif
6843           {
6844             r = optimize_nodes(NODE_BODY(node), opt, env);
6845             if (is_set_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK)) {
6846               if (MEM_STATUS_AT0(env->scan_env->backrefed_mem, en->m.regnum))
6847                 remove_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK);
6848             }
6849           }
6850         break;
6851 
6852       case BAG_IF_ELSE:
6853         {
6854           OptEnv nenv;
6855 
6856           if (IS_NOT_NULL(en->te.Else)) {
6857             copy_opt_env(&nenv, env);
6858             r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv);
6859             if (r == 0) {
6860               mml_add(&nenv.mm, &xo.len);
6861               concat_left_node_opt_info(enc, opt, &xo);
6862               if (IS_NOT_NULL(en->te.Then)) {
6863                 r = optimize_nodes(en->te.Then, &xo, &nenv);
6864                 if (r == 0) {
6865                   concat_left_node_opt_info(enc, opt, &xo);
6866                 }
6867               }
6868 
6869                 r = optimize_nodes(en->te.Else, &xo, env);
6870                 if (r == 0)
6871                   alt_merge_node_opt_info(opt, &xo, env);
6872             }
6873           }
6874         }
6875         break;
6876       }
6877     }
6878     break;
6879 
6880   case NODE_GIMMICK:
6881     break;
6882 
6883   default:
6884 #ifdef ONIG_DEBUG
6885     fprintf(DBGFP, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node));
6886 #endif
6887     r = ONIGERR_TYPE_BUG;
6888     break;
6889   }
6890 
6891   return r;
6892 }
6893 
6894 static int
set_optimize_exact(regex_t * reg,OptStr * e)6895 set_optimize_exact(regex_t* reg, OptStr* e)
6896 {
6897   int r;
6898   int allow_reverse;
6899 
6900   if (e->len == 0) return 0;
6901 
6902   reg->exact = (UChar* )xmalloc(e->len);
6903   CHECK_NULL_RETURN_MEMERR(reg->exact);
6904   xmemcpy(reg->exact, e->s, e->len);
6905   reg->exact_end = reg->exact + e->len;
6906 
6907   allow_reverse =
6908     ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
6909 
6910   if (e->len >= 2 || (e->len >= 1 && allow_reverse)) {
6911     r = set_sunday_quick_search_or_bmh_skip_table(reg, 0,
6912                                                   reg->exact, reg->exact_end,
6913                                                   reg->map, &(reg->map_offset));
6914     if (r != 0) return r;
6915 
6916     reg->optimize = (allow_reverse != 0
6917                      ? OPTIMIZE_STR_FAST
6918                      : OPTIMIZE_STR_FAST_STEP_FORWARD);
6919   }
6920   else {
6921     reg->optimize = OPTIMIZE_STR;
6922   }
6923 
6924   reg->dist_min = e->mm.min;
6925   reg->dist_max = e->mm.max;
6926 
6927   if (reg->dist_min != INFINITE_LEN) {
6928     int n = (int )(reg->exact_end - reg->exact);
6929     reg->threshold_len = reg->dist_min + n;
6930   }
6931 
6932   return 0;
6933 }
6934 
6935 static void
set_optimize_map(regex_t * reg,OptMap * m)6936 set_optimize_map(regex_t* reg, OptMap* m)
6937 {
6938   int i;
6939 
6940   for (i = 0; i < CHAR_MAP_SIZE; i++)
6941     reg->map[i] = m->map[i];
6942 
6943   reg->optimize   = OPTIMIZE_MAP;
6944   reg->dist_min   = m->mm.min;
6945   reg->dist_max   = m->mm.max;
6946 
6947   if (reg->dist_min != INFINITE_LEN) {
6948     reg->threshold_len = reg->dist_min + ONIGENC_MBC_MINLEN(reg->enc);
6949   }
6950 }
6951 
6952 static void
set_sub_anchor(regex_t * reg,OptAnc * anc)6953 set_sub_anchor(regex_t* reg, OptAnc* anc)
6954 {
6955   reg->sub_anchor |= anc->left  & ANCR_BEGIN_LINE;
6956   reg->sub_anchor |= anc->right & ANCR_END_LINE;
6957 }
6958 
6959 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6960 static void print_optimize_info(FILE* f, regex_t* reg);
6961 #endif
6962 
6963 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ParseEnv * scan_env)6964 set_optimize_info_from_tree(Node* node, regex_t* reg, ParseEnv* scan_env)
6965 {
6966   int r;
6967   OptNode opt;
6968   OptEnv env;
6969 
6970   env.enc            = reg->enc;
6971   env.case_fold_flag = reg->case_fold_flag;
6972   env.scan_env       = scan_env;
6973   mml_clear(&env.mm);
6974 
6975   r = optimize_nodes(node, &opt, &env);
6976   if (r != 0) return r;
6977 
6978   reg->anchor = opt.anc.left & (ANCR_BEGIN_BUF |
6979         ANCR_BEGIN_POSITION | ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML |
6980         ANCR_LOOK_BEHIND);
6981 
6982   if ((opt.anc.left & (ANCR_LOOK_BEHIND | ANCR_PREC_READ_NOT)) != 0)
6983     reg->anchor &= ~ANCR_ANYCHAR_INF_ML;
6984 
6985   reg->anchor |= opt.anc.right & (ANCR_END_BUF | ANCR_SEMI_END_BUF |
6986                                   ANCR_PREC_READ_NOT);
6987 
6988   if (reg->anchor & (ANCR_END_BUF | ANCR_SEMI_END_BUF)) {
6989     reg->anc_dist_min = opt.len.min;
6990     reg->anc_dist_max = opt.len.max;
6991   }
6992 
6993   if (opt.sb.len > 0 || opt.sm.len > 0) {
6994     select_opt_exact(reg->enc, &opt.sb, &opt.sm);
6995     if (opt.map.value > 0 && comp_opt_exact_or_map(&opt.sb, &opt.map) > 0) {
6996       goto set_map;
6997     }
6998     else {
6999       r = set_optimize_exact(reg, &opt.sb);
7000       set_sub_anchor(reg, &opt.sb.anc);
7001     }
7002   }
7003   else if (opt.map.value > 0) {
7004   set_map:
7005     set_optimize_map(reg, &opt.map);
7006     set_sub_anchor(reg, &opt.map.anc);
7007   }
7008   else {
7009     reg->sub_anchor |= opt.anc.left & ANCR_BEGIN_LINE;
7010     if (opt.len.max == 0)
7011       reg->sub_anchor |= opt.anc.right & ANCR_END_LINE;
7012   }
7013 
7014 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
7015   print_optimize_info(DBGFP, reg);
7016 #endif
7017   return r;
7018 }
7019 #endif /* ONIG_DONT_OPTIMIZE */
7020 
7021 static void
clear_optimize_info(regex_t * reg)7022 clear_optimize_info(regex_t* reg)
7023 {
7024   reg->optimize      = OPTIMIZE_NONE;
7025   reg->anchor        = 0;
7026   reg->anc_dist_min  = 0;
7027   reg->anc_dist_max  = 0;
7028   reg->sub_anchor    = 0;
7029   reg->exact_end     = (UChar* )NULL;
7030   reg->map_offset    = 0;
7031   reg->threshold_len = 0;
7032   if (IS_NOT_NULL(reg->exact)) {
7033     xfree(reg->exact);
7034     reg->exact = (UChar* )NULL;
7035   }
7036 }
7037 
7038 #ifdef ONIG_DEBUG
7039 
/*
 * Debug helper: write the encoded string [s, end) to `fp`.
 *
 * For encodings whose minimum character length is greater than one byte,
 * each character is decoded; code points below 0x80 are written as a single
 * byte and the others as " 0x%04x ".  Otherwise the bytes are written as is.
 */
static void print_enc_string(FILE* fp, OnigEncoding enc,
                             const UChar *s, const UChar *end)
{
  const UChar *cur;

  if (ONIGENC_MBC_MINLEN(enc) <= 1) {
    for (cur = s; cur < end; cur++)
      fputc((int )*cur, fp);
    return;
  }

  for (cur = s; cur < end; cur += enclen(enc, cur)) {
    OnigCodePoint cp = ONIGENC_MBC_TO_CODE(enc, cur, end);

    if (cp < 0x80)
      fputc((int )cp, fp);
    else
      fprintf(fp, " 0x%04x ", (int )cp);
  }
}
7067 
7068 static void
print_options(FILE * fp,OnigOptionType o)7069 print_options(FILE* fp, OnigOptionType o)
7070 {
7071   if ((o & ONIG_OPTION_IGNORECASE) != 0)      fprintf(fp, " IGNORECASE");
7072   if ((o & ONIG_OPTION_EXTEND) != 0)          fprintf(fp, " EXTEND");
7073   if ((o & ONIG_OPTION_MULTILINE) != 0)       fprintf(fp, " MULTILINE");
7074   if ((o & ONIG_OPTION_SINGLELINE) != 0)      fprintf(fp, " SINGLELINE");
7075   if ((o & ONIG_OPTION_FIND_LONGEST) != 0)    fprintf(fp, " FIND_LONGEST");
7076   if ((o & ONIG_OPTION_FIND_NOT_EMPTY) != 0)  fprintf(fp, " FIND_NOT_EMPTY");
7077   if ((o & ONIG_OPTION_NEGATE_SINGLELINE) != 0)  fprintf(fp, " NEGATE_SINGLELINE");
7078   if ((o & ONIG_OPTION_DONT_CAPTURE_GROUP) != 0) fprintf(fp, " DONT_CAPTURE_GROUP");
7079   if ((o & ONIG_OPTION_CAPTURE_GROUP) != 0)   fprintf(fp, " CAPTURE_GROUP");
7080   if ((o & ONIG_OPTION_NOTBOL) != 0)          fprintf(fp, " NOTBOL");
7081   if ((o & ONIG_OPTION_NOTEOL) != 0)          fprintf(fp, " NOTEOL");
7082   if ((o & ONIG_OPTION_POSIX_REGION) != 0)    fprintf(fp, " POSIX_REGION");
7083   if ((o & ONIG_OPTION_CHECK_VALIDITY_OF_STRING) != 0) fprintf(fp, " CHECK_VALIDITY_OF_STRING");
7084   if ((o & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) fprintf(fp, " IGNORECASE_IS_ASCII");
7085   if ((o & ONIG_OPTION_WORD_IS_ASCII) != 0)   fprintf(fp, " WORD_IS_ASCII");
7086   if ((o & ONIG_OPTION_DIGIT_IS_ASCII) != 0)  fprintf(fp, " DIGIT_IS_ASCII");
7087   if ((o & ONIG_OPTION_SPACE_IS_ASCII) != 0)  fprintf(fp, " SPACE_IS_ASCII");
7088   if ((o & ONIG_OPTION_POSIX_IS_ASCII) != 0)  fprintf(fp, " POSIX_IS_ASCII");
7089   if ((o & ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER) != 0) fprintf(fp, " TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER");
7090   if ((o & ONIG_OPTION_TEXT_SEGMENT_WORD) != 0) fprintf(fp, " TEXT_SEGMENT_WORD");
7091   if ((o & ONIG_OPTION_NOT_BEGIN_STRING) != 0) fprintf(fp, " NOT_BIGIN_STRING");
7092   if ((o & ONIG_OPTION_NOT_END_STRING) != 0)   fprintf(fp, " NOT_END_STRING");
7093   if ((o & ONIG_OPTION_NOT_BEGIN_POSITION) != 0) fprintf(fp, " NOT_BEGIN_POSITION");
7094   if ((o & ONIG_OPTION_CALLBACK_EACH_MATCH) != 0) fprintf(fp, " CALLBACK_EACH_MATCH");
7095 }
7096 
7097 #endif /* ONIG_DEBUG */
7098 
7099 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
7100 
7101 #ifndef ONIG_DONT_OPTIMIZE
7102 
7103 static void
print_distance_range(FILE * f,OnigLen a,OnigLen b)7104 print_distance_range(FILE* f, OnigLen a, OnigLen b)
7105 {
7106   if (a == INFINITE_LEN)
7107     fputs("inf", f);
7108   else
7109     fprintf(f, "(%u)", a);
7110 
7111   fputs("-", f);
7112 
7113   if (b == INFINITE_LEN)
7114     fputs("inf", f);
7115   else
7116     fprintf(f, "(%u)", b);
7117 }
7118 
7119 static void
print_anchor(FILE * f,int anchor)7120 print_anchor(FILE* f, int anchor)
7121 {
7122   int q = 0;
7123 
7124   fprintf(f, "[");
7125 
7126   if (anchor & ANCR_BEGIN_BUF) {
7127     fprintf(f, "begin-buf");
7128     q = 1;
7129   }
7130   if (anchor & ANCR_BEGIN_LINE) {
7131     if (q) fprintf(f, ", ");
7132     q = 1;
7133     fprintf(f, "begin-line");
7134   }
7135   if (anchor & ANCR_BEGIN_POSITION) {
7136     if (q) fprintf(f, ", ");
7137     q = 1;
7138     fprintf(f, "begin-pos");
7139   }
7140   if (anchor & ANCR_END_BUF) {
7141     if (q) fprintf(f, ", ");
7142     q = 1;
7143     fprintf(f, "end-buf");
7144   }
7145   if (anchor & ANCR_SEMI_END_BUF) {
7146     if (q) fprintf(f, ", ");
7147     q = 1;
7148     fprintf(f, "semi-end-buf");
7149   }
7150   if (anchor & ANCR_END_LINE) {
7151     if (q) fprintf(f, ", ");
7152     q = 1;
7153     fprintf(f, "end-line");
7154   }
7155   if (anchor & ANCR_ANYCHAR_INF) {
7156     if (q) fprintf(f, ", ");
7157     q = 1;
7158     fprintf(f, "anychar-inf");
7159   }
7160   if (anchor & ANCR_ANYCHAR_INF_ML) {
7161     if (q) fprintf(f, ", ");
7162     fprintf(f, "anychar-inf-ml");
7163   }
7164 
7165   fprintf(f, "]");
7166 }
7167 
7168 static void
print_optimize_info(FILE * f,regex_t * reg)7169 print_optimize_info(FILE* f, regex_t* reg)
7170 {
7171   static const char* on[] =
7172     { "NONE", "STR", "STR_FAST", "STR_FAST_STEP_FORWARD", "MAP" };
7173 
7174   fprintf(f, "optimize: %s\n", on[reg->optimize]);
7175   fprintf(f, "  anchor: "); print_anchor(f, reg->anchor);
7176   if ((reg->anchor & ANCR_END_BUF_MASK) != 0)
7177     print_distance_range(f, reg->anc_dist_min, reg->anc_dist_max);
7178   fprintf(f, "\n");
7179 
7180   if (reg->optimize) {
7181     fprintf(f, "  sub anchor: "); print_anchor(f, reg->sub_anchor);
7182     fprintf(f, "\n");
7183   }
7184   fprintf(f, "\n");
7185 
7186   if (reg->exact) {
7187     UChar *p;
7188     fprintf(f, "exact: [");
7189     for (p = reg->exact; p < reg->exact_end; p++) {
7190       fputc(*p, f);
7191     }
7192     fprintf(f, "]: length: %ld, dmin: %u, ",
7193             (reg->exact_end - reg->exact), reg->dist_min);
7194     if (reg->dist_max == INFINITE_LEN)
7195       fprintf(f, "dmax: inf.\n");
7196     else
7197       fprintf(f, "dmax: %u\n", reg->dist_max);
7198   }
7199   else if (reg->optimize & OPTIMIZE_MAP) {
7200     int c, i, n = 0;
7201 
7202     for (i = 0; i < CHAR_MAP_SIZE; i++)
7203       if (reg->map[i]) n++;
7204 
7205     fprintf(f, "map: n=%d, dmin: %u, dmax: %u\n",
7206             n, reg->dist_min, reg->dist_max);
7207     if (n > 0) {
7208       c = 0;
7209       fputc('[', f);
7210       for (i = 0; i < CHAR_MAP_SIZE; i++) {
7211         if (reg->map[i] != 0) {
7212           if (c > 0)  fputs(", ", f);
7213           c++;
7214           if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
7215               ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
7216             fputc(i, f);
7217           else
7218             fprintf(f, "%d", i);
7219         }
7220       }
7221       fprintf(f, "]\n");
7222     }
7223   }
7224 }
7225 #endif /* ONIG_DONT_OPTIMIZE */
7226 #endif /* defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) */
7227 
7228 
7229 extern RegexExt*
onig_get_regex_ext(regex_t * reg)7230 onig_get_regex_ext(regex_t* reg)
7231 {
7232   if (IS_NULL(reg->extp)) {
7233     RegexExt* ext = (RegexExt* )xmalloc(sizeof(*ext));
7234     if (IS_NULL(ext)) return 0;
7235 
7236     ext->pattern      = 0;
7237     ext->pattern_end  = 0;
7238 #ifdef USE_CALLOUT
7239     ext->tag_table    = 0;
7240     ext->callout_num  = 0;
7241     ext->callout_list_alloc = 0;
7242     ext->callout_list = 0;
7243 #endif
7244 
7245     reg->extp = ext;
7246   }
7247 
7248   return reg->extp;
7249 }
7250 
7251 static void
free_regex_ext(RegexExt * ext)7252 free_regex_ext(RegexExt* ext)
7253 {
7254   if (IS_NOT_NULL(ext)) {
7255     if (IS_NOT_NULL(ext->pattern))
7256       xfree((void* )ext->pattern);
7257 
7258 #ifdef USE_CALLOUT
7259     if (IS_NOT_NULL(ext->tag_table))
7260       onig_callout_tag_table_free(ext->tag_table);
7261 
7262     if (IS_NOT_NULL(ext->callout_list))
7263       onig_free_reg_callout_list(ext->callout_num, ext->callout_list);
7264 #endif
7265 
7266     xfree(ext);
7267   }
7268 }
7269 
7270 extern int
onig_ext_set_pattern(regex_t * reg,const UChar * pattern,const UChar * pattern_end)7271 onig_ext_set_pattern(regex_t* reg, const UChar* pattern, const UChar* pattern_end)
7272 {
7273   RegexExt* ext;
7274   UChar* s;
7275 
7276   ext = onig_get_regex_ext(reg);
7277   CHECK_NULL_RETURN_MEMERR(ext);
7278 
7279   s = onigenc_strdup(reg->enc, pattern, pattern_end);
7280   CHECK_NULL_RETURN_MEMERR(s);
7281 
7282   ext->pattern     = s;
7283   ext->pattern_end = s + (pattern_end - pattern);
7284 
7285   return ONIG_NORMAL;
7286 }
7287 
7288 extern void
onig_free_body(regex_t * reg)7289 onig_free_body(regex_t* reg)
7290 {
7291   if (IS_NOT_NULL(reg)) {
7292     ops_free(reg);
7293     if (IS_NOT_NULL(reg->string_pool)) {
7294       xfree(reg->string_pool);
7295       reg->string_pool_end = reg->string_pool = 0;
7296     }
7297     if (IS_NOT_NULL(reg->exact))            xfree(reg->exact);
7298     if (IS_NOT_NULL(reg->repeat_range))     xfree(reg->repeat_range);
7299     if (IS_NOT_NULL(reg->extp)) {
7300       free_regex_ext(reg->extp);
7301       reg->extp = 0;
7302     }
7303 
7304     onig_names_free(reg);
7305   }
7306 }
7307 
7308 extern void
onig_free(regex_t * reg)7309 onig_free(regex_t* reg)
7310 {
7311   if (IS_NOT_NULL(reg)) {
7312     onig_free_body(reg);
7313     xfree(reg);
7314   }
7315 }
7316 
7317 
7318 #ifdef ONIG_DEBUG_PARSE
7319 static void print_tree P_((FILE* f, Node* node));
7320 #endif
7321 
7322 extern int onig_init_for_match_at(regex_t* reg);
7323 
/*
 * Parse the pattern [pattern, pattern_end) into a node tree and run the
 * tuning passes on it.
 *
 * On success (return value 0) the tree is stored in *rroot.  When
 * USE_CALL is defined and the pattern contains calls, `uslist` has been
 * initialized and is left for the caller to finish.
 *
 * On failure the tree is freed, scan_env->mem_env_dynamic is freed,
 * `uslist` is ended if it may have been initialized, *rroot is set to
 * NULL_NODE and the error code is returned.  If `einfo` is not null, its
 * `enc` is always set and its `par`/`par_end` are filled from
 * scan_env->error when that is available.
 */
static int parse_and_tune(regex_t* reg, const UChar* pattern,
  const UChar* pattern_end, ParseEnv *scan_env, Node** rroot,
  OnigErrorInfo* einfo
#ifdef USE_CALL
  , UnsetAddrList* uslist
#endif
)
{
  int r;
  Node* root;

  /* root must be valid for onig_node_free() on every error path */
  root = NULL_NODE;
  if (IS_NOT_NULL(einfo)) {
    einfo->enc = reg->enc;
    einfo->par = (UChar* )NULL;
  }

  r = onig_parse_tree(&root, pattern, pattern_end, reg, scan_env);
  if (r != 0) goto err;

  r = reduce_string_list(root, reg->enc);
  if (r != 0) goto err;

  /* Mixed use of named and unnamed groups: with the
     CAPTURE_ONLY_NAMED_GROUP syntax and without the CAPTURE_GROUP option,
     either disable capturing for the unnamed groups or, when every group
     is named, check the numbered references. */
  if (scan_env->num_named > 0 &&
      IS_SYNTAX_BV(scan_env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
      ! OPTON_CAPTURE_GROUP(reg->options)) {
    if (scan_env->num_named != scan_env->num_mem)
      r = disable_noname_group_capture(&root, reg, scan_env);
    else
      r = numbered_ref_check(root);

    if (r != 0) goto err;
  }

  r = check_backrefs(root, scan_env);
  if (r != 0) goto err;

#ifdef USE_CALL
  if (scan_env->num_call > 0) {
    /* From here on uslist is initialized, so failures must leave through
       err_unset instead of err. */
    r = unset_addr_list_init(uslist, scan_env->num_call);
    if (r != 0) goto err;
    scan_env->unset_addr_list = uslist;
    r = tune_call(root, scan_env, 0);
    if (r != 0) goto err_unset;
    r = tune_call2(root);
    if (r != 0) goto err_unset;
    /* only negative results are treated as errors for this pass */
    r = recursive_call_check_trav(root, scan_env, 0);
    if (r  < 0) goto err_unset;
    r = infinite_recursive_call_check_trav(root, scan_env);
    if (r != 0) goto err_unset;

    tune_called_state(root, 0);
  }

  reg->num_call = scan_env->num_call;
#endif

#ifdef ONIG_DEBUG_PARSE
  fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env->max_parse_depth);
#endif

  r = tune_tree(root, reg, 0, scan_env);
  if (r != 0) {
#ifdef ONIG_DEBUG_PARSE
    fprintf(DBGFP, "TREE (error in tune)\n");
    print_tree(DBGFP, root);
    fprintf(DBGFP, "\n");
#endif
    goto err_unset;
  }

  /* these passes are only run when scan_env->backref_num is non-zero */
  if (scan_env->backref_num != 0) {
    set_parent_node_trav(root, NULL_NODE);
    r = set_empty_repeat_node_trav(root, NULL_NODE, scan_env);
    if (r != 0) goto err_unset;
    set_empty_status_check_trav(root, scan_env);
  }

  /* success: hand the tree over to the caller */
  *rroot = root;
  return r;

 err_unset:
#ifdef USE_CALL
  if (scan_env->num_call > 0) {
    unset_addr_list_end(uslist);
  }
#endif
 err:
  /* report the error position recorded in scan_env, if any */
  if (IS_NOT_NULL(scan_env->error)) {
    if (IS_NOT_NULL(einfo)) {
      einfo->par     = scan_env->error;
      einfo->par_end = scan_env->error_end;
    }
  }

  onig_node_free(root);
  if (IS_NOT_NULL(scan_env->mem_env_dynamic))
    xfree(scan_env->mem_env_dynamic);

  *rroot = NULL_NODE;
  return r;
}
7427 
7428 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)7429 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
7430              OnigErrorInfo* einfo)
7431 {
7432   int r;
7433   Node* root;
7434   ParseEnv scan_env;
7435 #ifdef USE_CALL
7436   UnsetAddrList uslist = {0};
7437 #endif
7438 
7439 #ifdef ONIG_DEBUG
7440   fprintf(DBGFP, "\nPATTERN: /");
7441   print_enc_string(DBGFP, reg->enc, pattern, pattern_end);
7442   fprintf(DBGFP, "/\n");
7443   fprintf(DBGFP, "OPTIONS:");
7444   print_options(DBGFP, reg->options);
7445   fprintf(DBGFP, "\n");
7446 #endif
7447 
7448   if (reg->ops_alloc == 0) {
7449     r = ops_init(reg, OPS_INIT_SIZE);
7450     if (r != 0) {
7451       if (IS_NOT_NULL(einfo)) {
7452         einfo->enc = reg->enc;
7453         einfo->par = (UChar* )NULL;
7454       }
7455       return r;
7456     }
7457   }
7458   else
7459     reg->ops_used = 0;
7460 
7461   r = parse_and_tune(reg, pattern, pattern_end, &scan_env, &root, einfo
7462 #ifdef USE_CALL
7463                      , &uslist
7464 #endif
7465                     );
7466   if (r != 0) return r;
7467 
7468 #ifdef ONIG_DEBUG_PARSE
7469   fprintf(DBGFP, "TREE (after tune)\n");
7470   print_tree(DBGFP, root);
7471   fprintf(DBGFP, "\n");
7472 #endif
7473 
7474   reg->capture_history = scan_env.cap_history;
7475   reg->push_mem_start  = scan_env.backtrack_mem | scan_env.cap_history;
7476 
7477 #ifdef USE_CALLOUT
7478   if (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) {
7479     reg->push_mem_end = reg->push_mem_start;
7480   }
7481   else {
7482     if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start))
7483       reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history;
7484     else
7485       reg->push_mem_end = reg->push_mem_start &
7486                         (scan_env.backrefed_mem | scan_env.cap_history);
7487   }
7488 #else
7489   if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start))
7490     reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history;
7491   else
7492     reg->push_mem_end = reg->push_mem_start &
7493                       (scan_env.backrefed_mem | scan_env.cap_history);
7494 #endif
7495 
7496   clear_optimize_info(reg);
7497 #ifndef ONIG_DONT_OPTIMIZE
7498   r = set_optimize_info_from_tree(root, reg, &scan_env);
7499   if (r != 0)  {
7500 #ifdef USE_CALL
7501     if (scan_env.num_call > 0) {
7502       unset_addr_list_end(&uslist);
7503     }
7504 #endif
7505     goto err;
7506   }
7507 #endif
7508 
7509   if (IS_NOT_NULL(scan_env.mem_env_dynamic)) {
7510     xfree(scan_env.mem_env_dynamic);
7511     scan_env.mem_env_dynamic = (MemEnv* )NULL;
7512   }
7513 
7514   r = compile_tree(root, reg, &scan_env);
7515   if (r == 0) {
7516     if (scan_env.keep_num > 0) {
7517       r = add_op(reg, OP_UPDATE_VAR);
7518       if (r != 0) goto err;
7519 
7520       COP(reg)->update_var.type = UPDATE_VAR_KEEP_FROM_STACK_LAST;
7521       COP(reg)->update_var.id   = 0; /* not used */
7522       COP(reg)->update_var.clear = FALSE;
7523     }
7524 
7525     r = add_op(reg, OP_END);
7526     if (r != 0) goto err;
7527 
7528 #ifdef USE_CALL
7529     if (scan_env.num_call > 0) {
7530       r = fix_unset_addr_list(&uslist, reg);
7531       unset_addr_list_end(&uslist);
7532       if (r != 0) goto err;
7533     }
7534 #endif
7535 
7536     r = ops_resize(reg, reg->ops_used);
7537     if (r != ONIG_NORMAL) goto err;
7538 
7539     set_addr_in_repeat_range(reg);
7540 
7541     if ((reg->push_mem_end != 0)
7542 #ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR
7543         || (reg->num_repeat      != 0)
7544         || (reg->num_empty_check != 0)
7545 #endif
7546 #ifdef USE_CALLOUT
7547         || (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0)
7548 #endif
7549 #ifdef USE_CALL
7550         || scan_env.num_call > 0
7551 #endif
7552         )
7553       reg->stack_pop_level = STACK_POP_LEVEL_ALL;
7554     else {
7555       if (reg->push_mem_start != 0)
7556         reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
7557       else
7558         reg->stack_pop_level = STACK_POP_LEVEL_FREE;
7559     }
7560 
7561     r = ops_make_string_pool(reg);
7562     if (r != 0) goto err;
7563   }
7564 #ifdef USE_CALL
7565   else if (scan_env.num_call > 0) {
7566     unset_addr_list_end(&uslist);
7567   }
7568 #endif
7569   onig_node_free(root);
7570 
7571 #ifdef ONIG_DEBUG_COMPILE
7572   onig_print_names(DBGFP, reg);
7573   onig_print_compiled_byte_code_list(DBGFP, reg);
7574 #endif
7575 
7576 #ifdef USE_DIRECT_THREADED_CODE
7577   /* opcode -> opaddr */
7578   onig_init_for_match_at(reg);
7579 #endif
7580 
7581   return r;
7582 
7583  err:
7584   if (IS_NOT_NULL(scan_env.error)) {
7585     if (IS_NOT_NULL(einfo)) {
7586       einfo->par     = scan_env.error;
7587       einfo->par_end = scan_env.error_end;
7588     }
7589   }
7590 
7591   onig_node_free(root);
7592   if (IS_NOT_NULL(scan_env.mem_env_dynamic))
7593       xfree(scan_env.mem_env_dynamic);
7594   return r;
7595 }
7596 
7597 
7598 static int onig_inited = 0;
7599 
7600 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)7601 onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_flag,
7602               OnigEncoding enc, OnigSyntaxType* syntax)
7603 {
7604   int r;
7605 
7606   xmemset(reg, 0, sizeof(*reg));
7607 
7608   if (onig_inited == 0) {
7609 #if 0
7610     return ONIGERR_LIBRARY_IS_NOT_INITIALIZED;
7611 #else
7612     r = onig_initialize(&enc, 1);
7613     if (r != 0)
7614       return ONIGERR_FAIL_TO_INITIALIZE;
7615 
7616     onig_warning("You didn't call onig_initialize() explicitly");
7617 #endif
7618   }
7619 
7620   if (IS_NULL(reg))
7621     return ONIGERR_INVALID_ARGUMENT;
7622 
7623   if (ONIGENC_IS_UNDEF(enc))
7624     return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
7625 
7626   if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
7627       == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
7628     return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
7629   }
7630 
7631   if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
7632     option |= syntax->options;
7633     option &= ~ONIG_OPTION_SINGLELINE;
7634   }
7635   else
7636     option |= syntax->options;
7637 
7638   if ((option & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) {
7639     case_fold_flag &= ~(INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR |
7640                         ONIGENC_CASE_FOLD_TURKISH_AZERI);
7641     case_fold_flag |= ONIGENC_CASE_FOLD_ASCII_ONLY;
7642   }
7643 
7644   (reg)->enc            = enc;
7645   (reg)->options        = option;
7646   (reg)->syntax         = syntax;
7647   (reg)->optimize       = 0;
7648   (reg)->exact          = (UChar* )NULL;
7649   (reg)->extp           = (RegexExt* )NULL;
7650   (reg)->ops            = (Operation* )NULL;
7651   (reg)->ops_curr       = (Operation* )NULL;
7652   (reg)->ops_used       = 0;
7653   (reg)->ops_alloc      = 0;
7654   (reg)->name_table     = (void* )NULL;
7655   (reg)->case_fold_flag = case_fold_flag;
7656   return 0;
7657 }
7658 
7659 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)7660 onig_new_without_alloc(regex_t* reg,
7661                        const UChar* pattern, const UChar* pattern_end,
7662                        OnigOptionType option, OnigEncoding enc,
7663                        OnigSyntaxType* syntax, OnigErrorInfo* einfo)
7664 {
7665   int r;
7666 
7667   r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
7668   if (r != 0) return r;
7669 
7670   r = onig_compile(reg, pattern, pattern_end, einfo);
7671   return r;
7672 }
7673 
7674 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)7675 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
7676          OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
7677          OnigErrorInfo* einfo)
7678 {
7679   int r;
7680 
7681   *reg = (regex_t* )xmalloc(sizeof(regex_t));
7682   if (IS_NULL(*reg)) return ONIGERR_MEMORY;
7683 
7684   r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
7685   if (r != 0) {
7686     xfree(*reg);
7687     *reg = NULL;
7688     return r;
7689   }
7690 
7691   r = onig_compile(*reg, pattern, pattern_end, einfo);
7692   if (r != 0) {
7693     onig_free(*reg);
7694     *reg = NULL;
7695   }
7696   return r;
7697 }
7698 
7699 extern int
onig_initialize(OnigEncoding encodings[],int n)7700 onig_initialize(OnigEncoding encodings[], int n)
7701 {
7702   int i;
7703   int r;
7704 
7705   if (onig_inited != 0)
7706     return 0;
7707 
7708   onigenc_init();
7709 
7710   onig_inited = 1;
7711 
7712   for (i = 0; i < n; i++) {
7713     OnigEncoding enc = encodings[i];
7714     r = onig_initialize_encoding(enc);
7715     if (r != 0)
7716       return r;
7717   }
7718 
7719   return ONIG_NORMAL;
7720 }
7721 
/* Node of a singly linked list of callbacks to run at library shutdown.
   Nodes are pushed by onig_add_end_call() and consumed (called, then freed)
   by exec_end_call_list(). */
typedef struct EndCallListItem {
  struct EndCallListItem* next;  /* next node, or 0 at the end of the list */
  void (*func)(void);            /* callback to invoke */
} EndCallListItemType;

/* Head of the end-call list; new items are inserted at the front. */
static EndCallListItemType* EndCallTop;
7728 
onig_add_end_call(void (* func)(void))7729 extern void onig_add_end_call(void (*func)(void))
7730 {
7731   EndCallListItemType* item;
7732 
7733   item = (EndCallListItemType* )xmalloc(sizeof(*item));
7734   if (item == 0) return ;
7735 
7736   item->next = EndCallTop;
7737   item->func = func;
7738 
7739   EndCallTop = item;
7740 }
7741 
7742 static void
exec_end_call_list(void)7743 exec_end_call_list(void)
7744 {
7745   EndCallListItemType* prev;
7746   void (*func)(void);
7747 
7748   while (EndCallTop != 0) {
7749     func = EndCallTop->func;
7750     (*func)();
7751 
7752     prev = EndCallTop;
7753     EndCallTop = EndCallTop->next;
7754     xfree(prev);
7755   }
7756 }
7757 
/*
 * Shut the library down.
 * Runs the registered end-call callbacks, frees the global callout names
 * (when built with USE_CALLOUT), finalizes the encoding layer, and clears
 * the initialized flag so a later onig_initialize() starts from scratch.
 * Always returns 0.
 */
extern int
onig_end(void)
{
  /* Call and free every callback registered via onig_add_end_call(). */
  exec_end_call_list();

#ifdef USE_CALLOUT
  onig_global_callout_names_free();
#endif

  onigenc_end();

  onig_inited = 0;

  return 0;
}
7773 
7774 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)7775 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
7776 {
7777   OnigCodePoint n, *data;
7778   OnigCodePoint low, high, x;
7779 
7780   GET_CODE_POINT(n, p);
7781   data = (OnigCodePoint* )p;
7782   data++;
7783 
7784   for (low = 0, high = n; low < high; ) {
7785     x = (low + high) >> 1;
7786     if (code > data[x * 2 + 1])
7787       low = x + 1;
7788     else
7789       high = x;
7790   }
7791 
7792   return ((low < n && code >= data[low * 2]) ? 1 : 0);
7793 }
7794 
7795 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,void * cc_arg)7796 onig_is_code_in_cc_len(int elen, OnigCodePoint code, /* CClassNode* */ void* cc_arg)
7797 {
7798   int found;
7799   CClassNode* cc = (CClassNode* )cc_arg;
7800 
7801   if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
7802     if (IS_NULL(cc->mbuf)) {
7803       found = 0;
7804     }
7805     else {
7806       found = onig_is_in_code_range(cc->mbuf->p, code) != 0;
7807     }
7808   }
7809   else {
7810     found = BITSET_AT(cc->bs, code) != 0;
7811   }
7812 
7813   if (IS_NCCLASS_NOT(cc))
7814     return !found;
7815   else
7816     return found;
7817 }
7818 
7819 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)7820 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
7821 {
7822   int len;
7823 
7824   if (ONIGENC_MBC_MINLEN(enc) > 1) {
7825     len = 2;
7826   }
7827   else {
7828     len = ONIGENC_CODE_TO_MBCLEN(enc, code);
7829     if (len < 0) return 0;
7830   }
7831   return onig_is_code_in_cc_len(len, code, cc);
7832 }
7833 
7834 
/* Repeat-count threshold: a quantifier whose upper bound is infinite or
   greater than this value is treated as "many repeats" by the slow-pattern
   detection code below. */
#define MANY_REPEAT_OF_ANYCHAR   20

/* Result of mostly_just_anychar() for a node. */
typedef enum {
  MJ_NO     = 0,  /* node contains something other than any-char (e.g. a non-empty string or a char class) */
  MJ_YES    = 1,  /* node contains an any-char element */
  MJ_IGNORE = 2,  /* node does not affect the decision (e.g. empty string, look-around, backref, gimmick) */
} MJ_RESULT;
7842 
7843 static MJ_RESULT
mostly_just_anychar(Node * node,int in_reluctant)7844 mostly_just_anychar(Node* node, int in_reluctant)
7845 {
7846   MJ_RESULT r;
7847 
7848   r = MJ_NO;
7849   switch (NODE_TYPE(node)) {
7850   case NODE_LIST:
7851     {
7852       int found = FALSE;
7853       do {
7854         r = mostly_just_anychar(NODE_CAR(node), in_reluctant);
7855         if (r == MJ_NO) break;
7856         if (r == MJ_YES) found = TRUE;
7857       } while (IS_NOT_NULL(node = NODE_CDR(node)));
7858       if (r == MJ_IGNORE) {
7859         if (found == TRUE) r = MJ_YES;
7860       }
7861     }
7862     break;
7863 
7864   case NODE_ALT:
7865     r = MJ_IGNORE;
7866     do {
7867       r = mostly_just_anychar(NODE_CAR(node), in_reluctant);
7868       if (r == MJ_YES) break;
7869     } while (IS_NOT_NULL(node = NODE_CDR(node)));
7870     break;
7871 
7872   case NODE_QUANT:
7873     {
7874       QuantNode* qn = QUANT_(node);
7875 
7876       if (qn->upper == 0)
7877         r = MJ_IGNORE;
7878       else {
7879         if (in_reluctant == FALSE) {
7880           if (qn->greedy != 0 &&
7881               (! IS_INFINITE_REPEAT(qn->upper) &&
7882                qn->upper <= MANY_REPEAT_OF_ANYCHAR)) {
7883             in_reluctant = TRUE;
7884           }
7885         }
7886         r = mostly_just_anychar(NODE_BODY(node), in_reluctant);
7887       }
7888     }
7889     break;
7890 
7891   case NODE_ANCHOR:
7892     switch (ANCHOR_(node)->type) {
7893     case ANCR_PREC_READ:
7894     case ANCR_PREC_READ_NOT:
7895     case ANCR_LOOK_BEHIND:
7896     case ANCR_LOOK_BEHIND_NOT:
7897     case ANCR_TEXT_SEGMENT_BOUNDARY: /* \y */
7898       r = MJ_IGNORE;
7899       break;
7900     default:
7901       break;
7902     }
7903     break;
7904 
7905   case NODE_BAG:
7906     {
7907       BagNode* en = BAG_(node);
7908 
7909       if (en->type == BAG_IF_ELSE) {
7910         if (IS_NOT_NULL(en->te.Then)) {
7911           r = mostly_just_anychar(en->te.Then, in_reluctant);
7912           if (r == MJ_YES) break;
7913         }
7914         if (IS_NOT_NULL(en->te.Else)) {
7915           r = mostly_just_anychar(en->te.Else, in_reluctant);
7916         }
7917       }
7918       else {
7919         r = mostly_just_anychar(NODE_BODY(node), in_reluctant);
7920       }
7921     }
7922     break;
7923 
7924   case NODE_CTYPE:
7925     if (CTYPE_(node)->ctype == CTYPE_ANYCHAR)
7926       r = MJ_YES;
7927     else
7928       r = MJ_NO;
7929     break;
7930 
7931   case NODE_STRING:
7932     if (NODE_STRING_LEN(node) == 0) {
7933       r = MJ_IGNORE;
7934       break;
7935     }
7936     /* fall */
7937   case NODE_CCLASS:
7938     r = MJ_NO;
7939     break;
7940 
7941 #ifdef USE_CALL
7942   case NODE_CALL:
7943     /* ignore call */
7944 #endif
7945   case NODE_BACKREF:
7946   case NODE_GIMMICK:
7947     r = MJ_IGNORE;
7948     break;
7949 
7950   default:
7951     break;
7952   }
7953 
7954   return r;
7955 }
7956 
/* Capacity of the calls[] array that detect_can_be_slow() uses to remember
   which called groups it has already descended into. */
#define MAX_CALLS_IN_DETECT   10

/* Counters filled by detect_can_be_slow() while walking a parse tree. */
typedef struct {
  int prec_read;                  /* look-ahead anchors (positive and negative) */
  int look_behind;                /* look-behind anchors (positive and negative) */
  int backref;                    /* back references without a nest level */
  int backref_with_level;         /* back references with a nest level */
  int call;                       /* call nodes */
  int anychar_reluctant_many;     /* many-repeat quantifiers whose body is MJ_YES for mostly_just_anychar() */
  int empty_check_nest_level;     /* current nesting depth of quantifiers whose body may be empty */
  int max_empty_check_nest_level; /* largest value empty_check_nest_level has reached */
  int heavy_element;              /* count of constructs judged especially expensive */
} SlowElementCount;
7970 
/*
 * Walk the parse tree rooted at node and accumulate, in *ct, counts of the
 * constructs that can make matching slow.
 *
 * node  : subtree to examine.
 * ct    : counters to update (see SlowElementCount).
 * ncall : number of valid entries in calls[].
 * calls : group numbers of the called groups on the current descent path;
 *         used to avoid descending into the same group again.
 *
 * Returns 0, or the first non-zero value returned by a recursive call on a
 * list/alt element or a bag part.
 */
static int
detect_can_be_slow(Node* node, SlowElementCount* ct, int ncall, int calls[])
{
  int r;

  r = 0;
  switch (NODE_TYPE(node)) {
  case NODE_LIST:
  case NODE_ALT:
    /* Visit every element; stop at the first non-zero result. */
    do {
      r = detect_can_be_slow(NODE_CAR(node), ct, ncall, calls);
      if (r != 0) return r;
    } while (IS_NOT_NULL(node = NODE_CDR(node)));
    break;

  case NODE_QUANT:
    {
      /* Only assigned and read when qn->emptiness != BODY_IS_NOT_EMPTY. */
      int prev_heavy_element;
      QuantNode* qn;
      Node* body;

      qn = QUANT_(node);
      body = NODE_BODY(node);

      if (qn->emptiness != BODY_IS_NOT_EMPTY) {
        /* Entering a quantifier that needs an empty check: track the
           nesting depth and remember heavy_element before the descent. */
        prev_heavy_element = ct->heavy_element;
        ct->empty_check_nest_level++;
        if (ct->empty_check_nest_level > ct->max_empty_check_nest_level)
          ct->max_empty_check_nest_level = ct->empty_check_nest_level;
      }
      else if (IS_INFINITE_REPEAT(qn->upper) ||
               qn->upper > MANY_REPEAT_OF_ANYCHAR) {
        /* Many repeats of a non-empty body: count it when the body is
           mostly any-char. The second argument is TRUE for a non-greedy
           quantifier. */
        MJ_RESULT mr = mostly_just_anychar(body, (qn->greedy == 0));
        if (mr == MJ_YES)
          ct->anychar_reluctant_many++;
      }

      r = detect_can_be_slow(body, ct, ncall, calls);

      if (qn->emptiness != BODY_IS_NOT_EMPTY) {
        if (NODE_IS_INPEEK(node)) {
          if (ct->empty_check_nest_level > 2) {
            /* Count this as heavy only if nothing inside the body has
               already raised heavy_element. */
            if (prev_heavy_element == ct->heavy_element)
              ct->heavy_element++;
          }
        }
        ct->empty_check_nest_level--;
      }
    }
    break;

  case NODE_ANCHOR:
    switch (ANCHOR_(node)->type) {
    case ANCR_PREC_READ:
    case ANCR_PREC_READ_NOT:
      ct->prec_read++;
      break;
    case ANCR_LOOK_BEHIND:
    case ANCR_LOOK_BEHIND_NOT:
      ct->look_behind++;
      break;
    default:
      break;
    }

    /* Anchors that carry a body are examined recursively. */
    if (ANCHOR_HAS_BODY(ANCHOR_(node)))
      r = detect_can_be_slow(NODE_BODY(node), ct, ncall, calls);
    break;

  case NODE_BAG:
    {
      BagNode* en = BAG_(node);

      r = detect_can_be_slow(NODE_BODY(node), ct, ncall, calls);
      if (r != 0) return r;

      /* An if-else bag also has optional Then and Else parts. */
      if (en->type == BAG_IF_ELSE) {
        if (IS_NOT_NULL(en->te.Then)) {
          r = detect_can_be_slow(en->te.Then, ct, ncall, calls);
          if (r != 0) return r;
        }
        if (IS_NOT_NULL(en->te.Else)) {
          r = detect_can_be_slow(en->te.Else, ct, ncall, calls);
          if (r != 0) return r;
        }
      }
    }
    break;

  /* NOTE: back references are counted only when USE_BACKREF_WITH_LEVEL
     is defined. */
#ifdef USE_BACKREF_WITH_LEVEL
  case NODE_BACKREF:
    if (NODE_IS_NEST_LEVEL(node))
      ct->backref_with_level++;
    else
      ct->backref++;
    break;
#endif

#ifdef USE_CALL
  case NODE_CALL:
    {
      int i;
      int found;
      int gnum;

      gnum = CALL_(node)->called_gnum;
      ct->call++;

      /* A recursive call inside a look-around and a real repeat is
         weighted as ten heavy elements. */
      if (NODE_IS_RECURSION(node) && NODE_IS_INPEEK(node) &&
          NODE_IS_IN_REAL_REPEAT(node)) {
         ct->heavy_element += 10;
      }

      /* Has this group already been entered on the current path? */
      found = FALSE;
      for (i = 0; i < ncall; i++) {
        if (gnum == calls[i]) {
          found = TRUE;
          break;
        }
      }

      if (! found) {
        if (ncall + 1 < MAX_CALLS_IN_DETECT) {
          /* Record the group and descend into the called body. */
          calls[ncall] = gnum;
          r = detect_can_be_slow(NODE_BODY(node), ct, ncall + 1, calls);
        }
        else {
          /* Too many distinct calls to track: count as heavy instead. */
          ct->heavy_element++;
        }
      }
    }
    break;
#endif

  default:
    break;
  }

  return r;
}
8111 
8112 extern int
onig_detect_can_be_slow_pattern(const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax)8113 onig_detect_can_be_slow_pattern(const UChar* pattern,
8114   const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
8115   OnigSyntaxType* syntax)
8116 {
8117   int r;
8118   regex_t* reg;
8119   Node* root;
8120   ParseEnv scan_env;
8121   SlowElementCount count;
8122   int calls[MAX_CALLS_IN_DETECT];
8123 #ifdef USE_CALL
8124   UnsetAddrList  uslist = {0};
8125 #endif
8126 
8127   reg = (regex_t* )xmalloc(sizeof(regex_t));
8128   if (IS_NULL(reg)) return ONIGERR_MEMORY;
8129 
8130   r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
8131   if (r != 0) {
8132     xfree(reg);
8133     return r;
8134   }
8135 
8136   r = parse_and_tune(reg, pattern, pattern_end, &scan_env, &root, NULL
8137 #ifdef USE_CALL
8138                      , &uslist
8139 #endif
8140                     );
8141   if (r != 0) goto err;
8142 
8143 #ifdef USE_CALL
8144   if (scan_env.num_call > 0) {
8145     unset_addr_list_end(&uslist);
8146   }
8147 #endif
8148 
8149   count.prec_read          = 0;
8150   count.look_behind        = 0;
8151   count.backref            = 0;
8152   count.backref_with_level = 0;
8153   count.call               = 0;
8154   count.anychar_reluctant_many     = 0;
8155   count.empty_check_nest_level     = 0;
8156   count.max_empty_check_nest_level = 0;
8157   count.heavy_element = 0;
8158 
8159   r = detect_can_be_slow(root, &count, 0, calls);
8160   if (r == 0) {
8161     int n = count.prec_read + count.look_behind
8162           + count.backref + count.backref_with_level + count.call
8163           + count.anychar_reluctant_many;
8164     if (count.heavy_element != 0)
8165       n += count.heavy_element * 10;
8166 
8167     r = n;
8168   }
8169 
8170   if (IS_NOT_NULL(scan_env.mem_env_dynamic))
8171     xfree(scan_env.mem_env_dynamic);
8172 
8173  err:
8174   onig_node_free(root);
8175   onig_free(reg);
8176   return r;
8177 }
8178 
8179 
8180 #ifdef ONIG_DEBUG_PARSE
8181 
8182 #ifdef USE_CALL
8183 static void
p_string(FILE * f,int len,UChar * s)8184 p_string(FILE* f, int len, UChar* s)
8185 {
8186   fputs(":", f);
8187   while (len-- > 0) { fputc(*s++, f); }
8188 }
8189 #endif
8190 
/* Debug helper: write indent space characters to f (none if indent <= 0). */
static void
Indent(FILE* f, int indent)
{
  int remaining;

  for (remaining = indent; remaining > 0; remaining--) {
    putc(' ', f);
  }
}
8197 
/*
 * Debug helper (ONIG_DEBUG_PARSE only): print the parse tree rooted at node
 * to f, one node per line, indented by indent columns. Children are printed
 * recursively with the indentation increased by 3.
 *
 * A NULL node, a malformed list/alt chain or an unknown ctype is reported
 * on f and terminates the process through exit(0).
 */
static void
print_indent_tree(FILE* f, Node* node, int indent)
{
  /* Suffix printed for a quantifier, indexed by QuantNode emptiness. */
  static char* emptiness_name[] = { "", " empty", " empty_mem", " empty_rec" };

  int i;
  NodeType type;
  UChar* p;
  int add = 3;  /* extra indentation for child nodes */

  Indent(f, indent);
  if (IS_NULL(node)) {
    fprintf(f, "ERROR: null node!!!\n");
    exit(0);
  }

  type = NODE_TYPE(node);
  switch (type) {
  case NODE_LIST:
  case NODE_ALT:
    if (type == NODE_LIST)
      fprintf(f, "<list:%p>\n", node);
    else
      fprintf(f, "<alt:%p>\n", node);

    /* Print every element; each cdr must be a cons of the same type. */
    print_indent_tree(f, NODE_CAR(node), indent + add);
    while (IS_NOT_NULL(node = NODE_CDR(node))) {
      if (NODE_TYPE(node) != type) {
        fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NODE_TYPE(node));
        exit(0);
      }
      print_indent_tree(f, NODE_CAR(node), indent + add);
    }
    break;

  case NODE_STRING:
    {
      char* str;
      char* mode;

      if (NODE_STRING_IS_CRUDE(node))
        mode = "-crude";
      else if (NODE_IS_IGNORECASE(node))
        mode = "-ignorecase";
      else
        mode = "";

      if (STR_(node)->s == STR_(node)->end)
        str = "empty-string";
      else
        str = "string";

      fprintf(f, "<%s%s:%p>", str, mode, node);
      /* Printable ASCII bytes are written as-is, all others as hex. */
      for (p = STR_(node)->s; p < STR_(node)->end; p++) {
        if (*p >= 0x20 && *p < 0x7f)
          fputc(*p, f);
        else {
          fprintf(f, " 0x%02x", *p);
        }
      }
    }
    break;

  case NODE_CCLASS:
/* Maximum number of multibyte-buffer bytes dumped for a char class. */
#define CCLASS_MBUF_MAX_OUTPUT_NUM   10

    fprintf(f, "<cclass:%p>", node);
    if (IS_NCCLASS_NOT(CCLASS_(node))) fputs(" not", f);
    if (CCLASS_(node)->mbuf) {
      BBuf* bbuf = CCLASS_(node)->mbuf;
      fprintf(f, " mbuf(%u) ", bbuf->used);
      for (i = 0; i < bbuf->used && i < CCLASS_MBUF_MAX_OUTPUT_NUM; i++) {
        if (i > 0) fprintf(f, ",");
        fprintf(f, "%0x", bbuf->p[i]);
      }
      /* Mark truncated output. */
      if (i < bbuf->used) fprintf(f, "...");
    }
    break;

  case NODE_CTYPE:
    fprintf(f, "<ctype:%p> ", node);
    switch (CTYPE_(node)->ctype) {
    case CTYPE_ANYCHAR:
      fprintf(f, "anychar");
      break;

    case ONIGENC_CTYPE_WORD:
      if (CTYPE_(node)->not != 0)
        fputs("not word", f);
      else
        fputs("word",     f);

      if (CTYPE_(node)->ascii_mode != 0)
        fputs(" (ascii)", f);

      break;

    default:
      fprintf(f, "ERROR: undefined ctype.\n");
      exit(0);
    }
    break;

  case NODE_ANCHOR:
    fprintf(f, "<anchor:%p> ", node);
    switch (ANCHOR_(node)->type) {
    case ANCR_BEGIN_BUF:        fputs("begin buf",      f); break;
    case ANCR_END_BUF:          fputs("end buf",        f); break;
    case ANCR_BEGIN_LINE:       fputs("begin line",     f); break;
    case ANCR_END_LINE:         fputs("end line",       f); break;
    case ANCR_SEMI_END_BUF:     fputs("semi end buf",   f); break;
    case ANCR_BEGIN_POSITION:   fputs("begin position", f); break;

    case ANCR_WORD_BOUNDARY:    fputs("word boundary",     f); break;
    case ANCR_NO_WORD_BOUNDARY: fputs("not word boundary", f); break;
#ifdef USE_WORD_BEGIN_END
    case ANCR_WORD_BEGIN:       fputs("word begin", f);     break;
    case ANCR_WORD_END:         fputs("word end", f);       break;
#endif
    case ANCR_TEXT_SEGMENT_BOUNDARY:
      fputs("text-segment boundary", f); break;
    case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
      fputs("no text-segment boundary", f); break;
    /* Look-around anchors have a body, printed as a child subtree. */
    case ANCR_PREC_READ:
      fprintf(f, "prec read\n");
      print_indent_tree(f, NODE_BODY(node), indent + add);
      break;
    case ANCR_PREC_READ_NOT:
      fprintf(f, "prec read not\n");
      print_indent_tree(f, NODE_BODY(node), indent + add);
      break;
    case ANCR_LOOK_BEHIND:
      fprintf(f, "look behind\n");
      print_indent_tree(f, NODE_BODY(node), indent + add);
      break;
    case ANCR_LOOK_BEHIND_NOT:
      fprintf(f, "look behind not\n");
      print_indent_tree(f, NODE_BODY(node), indent + add);
      break;

    default:
      fprintf(f, "ERROR: undefined anchor type.\n");
      break;
    }
    break;

  case NODE_BACKREF:
    {
      /* This p (list of referenced group numbers) shadows the outer UChar* p. */
      int* p;
      BackRefNode* br = BACKREF_(node);
      p = BACKREFS_P(br);
      fprintf(f, "<backref%s:%p>", NODE_IS_CHECKER(node) ? "-checker" : "", node);
      for (i = 0; i < br->back_num; i++) {
        if (i > 0) fputs(", ", f);
        fprintf(f, "%d", p[i]);
      }
#ifdef USE_BACKREF_WITH_LEVEL
      if (NODE_IS_NEST_LEVEL(node)) {
        fprintf(f, ", level: %d", br->nest_level);
      }
#endif
    }
    break;

#ifdef USE_CALL
  case NODE_CALL:
    {
      CallNode* cn = CALL_(node);
      fprintf(f, "<call:%p>", node);
      fprintf(f, " num: %d, name", cn->called_gnum);
      p_string(f, cn->name_end - cn->name, cn->name);
      if (NODE_IS_RECURSION(node)) fprintf(f, ", recursion");
      if (NODE_IS_INPEEK(node))    fprintf(f, ", in-peek");
      if (NODE_IS_IN_REAL_REPEAT(node)) fprintf(f, ", in-real-repeat");
    }
    break;
#endif

  case NODE_QUANT:
    {
      /* Prints {lower,upper}, "?" for a non-greedy quantifier, and flags. */
      fprintf(f, "<quantifier:%p>{%d,%d}%s%s%s", node,
              QUANT_(node)->lower, QUANT_(node)->upper,
              (QUANT_(node)->greedy ? "" : "?"),
              QUANT_(node)->include_referred == 0 ? "" : " referred",
              emptiness_name[QUANT_(node)->emptiness]);
      if (NODE_IS_INPEEK(node)) fprintf(f, ", in-peek");
      fprintf(f, "\n");
      print_indent_tree(f, NODE_BODY(node), indent + add);
    }
    break;

  case NODE_BAG:
    {
      BagNode* bn = BAG_(node);
      fprintf(f, "<bag:%p> ", node);
      if (bn->type == BAG_IF_ELSE) {
        Node* Then;
        Node* Else;

        /* The bag body is printed first, then the Then and Else parts. */
        fprintf(f, "if-else\n");
        print_indent_tree(f, NODE_BODY(node), indent + add);

        Then = bn->te.Then;
        Else = bn->te.Else;
        if (IS_NULL(Then)) {
          Indent(f, indent + add);
          fprintf(f, "THEN empty\n");
        }
        else
          print_indent_tree(f, Then, indent + add);

        if (IS_NULL(Else)) {
          Indent(f, indent + add);
          fprintf(f, "ELSE empty\n");
        }
        else
          print_indent_tree(f, Else, indent + add);
      }
      else {
        switch (bn->type) {
        case BAG_OPTION:
          fprintf(f, "option:%d", bn->o.options);
          break;
        case BAG_MEMORY:
          fprintf(f, "memory:%d", bn->m.regnum);
          if (NODE_IS_CALLED(node)) {
            fprintf(f, ", called");
            if (NODE_IS_RECURSION(node))
              fprintf(f, ", recursion");
          }
          else if (NODE_IS_REFERENCED(node))
            fprintf(f, ", referenced");

          if (NODE_IS_FIXED_ADDR(node))
            fprintf(f, ", fixed-addr");
          if ((bn->m.called_state & IN_PEEK) != 0)
            fprintf(f, ", in-peek");
          break;
        case BAG_STOP_BACKTRACK:
          fprintf(f, "stop-bt");
          break;
        default:
          break;
        }
        fprintf(f, "\n");
        print_indent_tree(f, NODE_BODY(node), indent + add);
      }
    }
    break;

  case NODE_GIMMICK:
    fprintf(f, "<gimmick:%p> ", node);
    switch (GIMMICK_(node)->type) {
    case GIMMICK_FAIL:
      fprintf(f, "fail");
      break;
    case GIMMICK_SAVE:
      fprintf(f, "save:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
      break;
    case GIMMICK_UPDATE_VAR:
      fprintf(f, "update_var:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
      break;
#ifdef USE_CALLOUT
    case GIMMICK_CALLOUT:
      switch (GIMMICK_(node)->detail_type) {
      case ONIG_CALLOUT_OF_CONTENTS:
        fprintf(f, "callout:contents:%d", GIMMICK_(node)->num);
        break;
      case ONIG_CALLOUT_OF_NAME:
        fprintf(f, "callout:name:%d:%d", GIMMICK_(node)->id, GIMMICK_(node)->num);
        break;
      }
#endif
    }
    break;

  default:
    fprintf(f, "print_indent_tree: undefined node type %d\n", NODE_TYPE(node));
    break;
  }

  /* List, alt, quantifier and bag nodes have already ended their own line
     before printing children; every other node type is terminated here. */
  if (type != NODE_LIST && type != NODE_ALT && type != NODE_QUANT &&
      type != NODE_BAG)
    fprintf(f, "\n");
  fflush(f);
}
8484 
/* Debug helper (ONIG_DEBUG_PARSE only): print the whole parse tree rooted
   at node to f, starting at indentation 0. */
static void
print_tree(FILE* f, Node* node)
{
  print_indent_tree(f, node, 0);
}
8490 #endif
8491