1 /**********************************************************************
2   regcomp.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2020  K.Kosako
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regparse.h"
31 
32 #define OPS_INIT_SIZE  8
33 
34 typedef struct {
35   OnigLen min;
36   OnigLen max;
37 } MinMaxLen;
38 
39 typedef struct {
40   OnigLen min;
41   OnigLen max;
42   int min_is_sure;
43 } MinMaxCharLen;
44 
45 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
46 
47 static OnigLen node_min_byte_len(Node* node, ScanEnv* env);
48 
49 #if 0
50 typedef struct {
51   int  n;
52   int  alloc;
53   int* v;
54 } int_stack;
55 
56 static int
57 make_int_stack(int_stack** rs, int init_size)
58 {
59   int_stack* s;
60   int* v;
61 
62   *rs = 0;
63 
64   s = xmalloc(sizeof(*s));
65   if (IS_NULL(s)) return ONIGERR_MEMORY;
66 
67   v = (int* )xmalloc(sizeof(int) * init_size);
68   if (IS_NULL(v)) {
69     xfree(s);
70     return ONIGERR_MEMORY;
71   }
72 
73   s->n = 0;
74   s->alloc = init_size;
75   s->v = v;
76 
77   *rs = s;
78   return ONIG_NORMAL;
79 }
80 
81 static void
82 free_int_stack(int_stack* s)
83 {
84   if (IS_NOT_NULL(s)) {
85     if (IS_NOT_NULL(s->v))
86       xfree(s->v);
87     xfree(s);
88   }
89 }
90 
91 static int
92 int_stack_push(int_stack* s, int v)
93 {
94   if (s->n >= s->alloc) {
95     int new_size = s->alloc * 2;
96     int* nv = (int* )xrealloc(s->v, sizeof(int) * new_size);
97     if (IS_NULL(nv)) return ONIGERR_MEMORY;
98 
99     s->alloc = new_size;
100     s->v = nv;
101   }
102 
103   s->v[s->n] = v;
104   s->n++;
105   return ONIG_NORMAL;
106 }
107 
108 static int
109 int_stack_pop(int_stack* s)
110 {
111   int v;
112 
113 #ifdef ONIG_DEBUG
114   if (s->n <= 0) {
115     fprintf(DBGFP, "int_stack_pop: fail empty. %p\n", s);
116     return 0;
117   }
118 #endif
119 
120   v = s->v[s->n];
121   s->n--;
122   return v;
123 }
124 #endif
125 
126 static int
ops_init(regex_t * reg,int init_alloc_size)127 ops_init(regex_t* reg, int init_alloc_size)
128 {
129   Operation* p;
130   size_t size;
131 
132   if (init_alloc_size > 0) {
133     size = sizeof(Operation) * init_alloc_size;
134     p = (Operation* )xrealloc(reg->ops, size);
135     CHECK_NULL_RETURN_MEMERR(p);
136 #ifdef USE_DIRECT_THREADED_CODE
137     {
138       enum OpCode* cp;
139       size = sizeof(enum OpCode) * init_alloc_size;
140       cp = (enum OpCode* )xrealloc(reg->ocs, size);
141       CHECK_NULL_RETURN_MEMERR(cp);
142       reg->ocs = cp;
143     }
144 #endif
145   }
146   else {
147     p  = (Operation* )0;
148 #ifdef USE_DIRECT_THREADED_CODE
149     reg->ocs = (enum OpCode* )0;
150 #endif
151   }
152 
153   reg->ops = p;
154   reg->ops_curr  = 0; /* !!! not yet done ops_new() */
155   reg->ops_alloc = init_alloc_size;
156   reg->ops_used  = 0;
157 
158   return ONIG_NORMAL;
159 }
160 
161 static int
ops_expand(regex_t * reg,int n)162 ops_expand(regex_t* reg, int n)
163 {
164 #define MIN_OPS_EXPAND_SIZE   4
165 
166 #ifdef USE_DIRECT_THREADED_CODE
167   enum OpCode* cp;
168 #endif
169   Operation* p;
170   size_t size;
171 
172   if (n <= 0) n = MIN_OPS_EXPAND_SIZE;
173 
174   n += reg->ops_alloc;
175 
176   size = sizeof(Operation) * n;
177   p = (Operation* )xrealloc(reg->ops, size);
178   CHECK_NULL_RETURN_MEMERR(p);
179 
180 #ifdef USE_DIRECT_THREADED_CODE
181   size = sizeof(enum OpCode) * n;
182   cp = (enum OpCode* )xrealloc(reg->ocs, size);
183   CHECK_NULL_RETURN_MEMERR(cp);
184   reg->ocs = cp;
185 #endif
186 
187   reg->ops = p;
188   reg->ops_alloc = n;
189   if (reg->ops_used == 0)
190     reg->ops_curr = 0;
191   else
192     reg->ops_curr = reg->ops + (reg->ops_used - 1);
193 
194   return ONIG_NORMAL;
195 }
196 
197 static int
ops_new(regex_t * reg)198 ops_new(regex_t* reg)
199 {
200   int r;
201 
202   if (reg->ops_used >= reg->ops_alloc) {
203     r = ops_expand(reg, reg->ops_alloc);
204     if (r != ONIG_NORMAL) return r;
205   }
206 
207   reg->ops_curr = reg->ops + reg->ops_used;
208   reg->ops_used++;
209 
210   xmemset(reg->ops_curr, 0, sizeof(Operation));
211   return ONIG_NORMAL;
212 }
213 
214 static int
is_in_string_pool(regex_t * reg,UChar * s)215 is_in_string_pool(regex_t* reg, UChar* s)
216 {
217   return (s >= reg->string_pool && s < reg->string_pool_end);
218 }
219 
220 static void
ops_free(regex_t * reg)221 ops_free(regex_t* reg)
222 {
223   int i;
224 
225   if (IS_NULL(reg->ops)) return ;
226 
227   for (i = 0; i < (int )reg->ops_used; i++) {
228     enum OpCode opcode;
229     Operation* op;
230 
231     op = reg->ops + i;
232 
233 #ifdef USE_DIRECT_THREADED_CODE
234     opcode = *(reg->ocs + i);
235 #else
236     opcode = op->opcode;
237 #endif
238 
239     switch (opcode) {
240     case OP_STR_MBN:
241       if (! is_in_string_pool(reg, op->exact_len_n.s))
242         xfree(op->exact_len_n.s);
243       break;
244     case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N:
245       if (! is_in_string_pool(reg, op->exact_n.s))
246         xfree(op->exact_n.s);
247       break;
248     case OP_STR_1: case OP_STR_2: case OP_STR_3: case OP_STR_4:
249     case OP_STR_5: case OP_STR_MB2N1: case OP_STR_MB2N2:
250     case OP_STR_MB2N3:
251       break;
252 
253     case OP_CCLASS_NOT: case OP_CCLASS:
254       xfree(op->cclass.bsp);
255       break;
256 
257     case OP_CCLASS_MB_NOT: case OP_CCLASS_MB:
258       xfree(op->cclass_mb.mb);
259       break;
260     case OP_CCLASS_MIX_NOT: case OP_CCLASS_MIX:
261       xfree(op->cclass_mix.mb);
262       xfree(op->cclass_mix.bsp);
263       break;
264 
265     case OP_BACKREF1: case OP_BACKREF2: case OP_BACKREF_N: case OP_BACKREF_N_IC:
266       break;
267     case OP_BACKREF_MULTI:      case OP_BACKREF_MULTI_IC:
268     case OP_BACKREF_WITH_LEVEL:
269     case OP_BACKREF_WITH_LEVEL_IC:
270     case OP_BACKREF_CHECK:
271     case OP_BACKREF_CHECK_WITH_LEVEL:
272       if (op->backref_general.num != 1)
273         xfree(op->backref_general.ns);
274       break;
275 
276     default:
277       break;
278     }
279   }
280 
281   xfree(reg->ops);
282 #ifdef USE_DIRECT_THREADED_CODE
283   xfree(reg->ocs);
284   reg->ocs = 0;
285 #endif
286 
287   reg->ops = 0;
288   reg->ops_curr  = 0;
289   reg->ops_alloc = 0;
290   reg->ops_used  = 0;
291 }
292 
293 static int
ops_calc_size_of_string_pool(regex_t * reg)294 ops_calc_size_of_string_pool(regex_t* reg)
295 {
296   int i;
297   int total;
298 
299   if (IS_NULL(reg->ops)) return 0;
300 
301   total = 0;
302   for (i = 0; i < (int )reg->ops_used; i++) {
303     enum OpCode opcode;
304     Operation* op;
305 
306     op = reg->ops + i;
307 #ifdef USE_DIRECT_THREADED_CODE
308     opcode = *(reg->ocs + i);
309 #else
310     opcode = op->opcode;
311 #endif
312 
313     switch (opcode) {
314     case OP_STR_MBN:
315       total += op->exact_len_n.len * op->exact_len_n.n;
316       break;
317     case OP_STR_N:
318     case OP_STR_MB2N:
319       total += op->exact_n.n * 2;
320       break;
321     case OP_STR_MB3N:
322       total += op->exact_n.n * 3;
323       break;
324 
325     default:
326       break;
327     }
328   }
329 
330   return total;
331 }
332 
333 static int
ops_make_string_pool(regex_t * reg)334 ops_make_string_pool(regex_t* reg)
335 {
336   int i;
337   int len;
338   int size;
339   UChar* pool;
340   UChar* curr;
341 
342   size = ops_calc_size_of_string_pool(reg);
343   if (size <= 0) {
344     return 0;
345   }
346 
347   curr = pool = (UChar* )xmalloc((size_t )size);
348   CHECK_NULL_RETURN_MEMERR(pool);
349 
350   for (i = 0; i < (int )reg->ops_used; i++) {
351     enum OpCode opcode;
352     Operation* op;
353 
354     op = reg->ops + i;
355 #ifdef USE_DIRECT_THREADED_CODE
356     opcode = *(reg->ocs + i);
357 #else
358     opcode = op->opcode;
359 #endif
360 
361     switch (opcode) {
362     case OP_STR_MBN:
363       len = op->exact_len_n.len * op->exact_len_n.n;
364       xmemcpy(curr, op->exact_len_n.s, len);
365       xfree(op->exact_len_n.s);
366       op->exact_len_n.s = curr;
367       curr += len;
368       break;
369     case OP_STR_N:
370       len = op->exact_n.n;
371     copy:
372       xmemcpy(curr, op->exact_n.s, len);
373       xfree(op->exact_n.s);
374       op->exact_n.s = curr;
375       curr += len;
376       break;
377     case OP_STR_MB2N:
378       len = op->exact_n.n * 2;
379       goto copy;
380       break;
381     case OP_STR_MB3N:
382       len = op->exact_n.n * 3;
383       goto copy;
384       break;
385 
386     default:
387       break;
388     }
389   }
390 
391   reg->string_pool     = pool;
392   reg->string_pool_end = pool + size;
393   return 0;
394 }
395 
396 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)397 onig_get_default_case_fold_flag(void)
398 {
399   return OnigDefaultCaseFoldFlag;
400 }
401 
402 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)403 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
404 {
405   OnigDefaultCaseFoldFlag = case_fold_flag;
406   return 0;
407 }
408 
409 static int
len_multiply_cmp(OnigLen x,int y,OnigLen v)410 len_multiply_cmp(OnigLen x, int y, OnigLen v)
411 {
412   if (x == 0 || y == 0) return -1;
413 
414   if (x < INFINITE_LEN / y) {
415     OnigLen xy = x * (OnigLen )y;
416     if (xy > v) return 1;
417     else {
418       if (xy == v) return 0;
419       else return -1;
420     }
421   }
422   else
423     return v == INFINITE_LEN ? 0 : 1;
424 }
425 
426 extern int
onig_positive_int_multiply(int x,int y)427 onig_positive_int_multiply(int x, int y)
428 {
429   if (x == 0 || y == 0) return 0;
430 
431   if (x < ONIG_INT_MAX / y)
432     return x * y;
433   else
434     return -1;
435 }
436 
437 
438 static void
node_swap(Node * a,Node * b)439 node_swap(Node* a, Node* b)
440 {
441   Node c;
442 
443   c = *a; *a = *b; *b = c;
444 
445   if (NODE_TYPE(a) == NODE_STRING) {
446     StrNode* sn = STR_(a);
447     if (sn->capacity == 0) {
448       int len = (int )(sn->end - sn->s);
449       sn->s   = sn->buf;
450       sn->end = sn->s + len;
451     }
452   }
453 
454   if (NODE_TYPE(b) == NODE_STRING) {
455     StrNode* sn = STR_(b);
456     if (sn->capacity == 0) {
457       int len = (int )(sn->end - sn->s);
458       sn->s   = sn->buf;
459       sn->end = sn->s + len;
460     }
461   }
462 }
463 
464 static int
node_list_len(Node * list)465 node_list_len(Node* list)
466 {
467   int len;
468 
469   len = 1;
470   while (IS_NOT_NULL(NODE_CDR(list))) {
471     list = NODE_CDR(list);
472     len++;
473   }
474 
475   return len;
476 }
477 
478 static Node*
node_list_add(Node * list,Node * x)479 node_list_add(Node* list, Node* x)
480 {
481   Node *n;
482 
483   n = onig_node_new_list(x, NULL);
484   if (IS_NULL(n)) return NULL_NODE;
485 
486   if (IS_NOT_NULL(list)) {
487     while (IS_NOT_NULL(NODE_CDR(list)))
488       list = NODE_CDR(list);
489 
490     NODE_CDR(list) = n;
491   }
492 
493   return n;
494 }
495 
496 static int
node_str_node_cat(Node * node,Node * add)497 node_str_node_cat(Node* node, Node* add)
498 {
499   int r;
500 
501   if (NODE_STATUS(node) != NODE_STATUS(add))
502     return ONIGERR_TYPE_BUG;
503 
504   if (STR_(node)->flag != STR_(add)->flag)
505     return ONIGERR_TYPE_BUG;
506 
507   r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end);
508   if (r != 0) return r;
509 
510   return 0;
511 }
512 
513 static void
node_conv_to_str_node(Node * node,Node * ref_node)514 node_conv_to_str_node(Node* node, Node* ref_node)
515 {
516   xmemset(node, 0, sizeof(*node));
517   NODE_SET_TYPE(node, NODE_STRING);
518   NODE_STATUS(node) = NODE_STATUS(ref_node);
519 
520   STR_(node)->flag     = STR_(ref_node)->flag;
521   STR_(node)->s        = STR_(node)->buf;
522   STR_(node)->end      = STR_(node)->buf;
523   STR_(node)->capacity = 0;
524 }
525 
526 static OnigLen
distance_add(OnigLen d1,OnigLen d2)527 distance_add(OnigLen d1, OnigLen d2)
528 {
529   if (d1 == INFINITE_LEN || d2 == INFINITE_LEN)
530     return INFINITE_LEN;
531   else {
532     if (d1 <= INFINITE_LEN - d2) return d1 + d2;
533     else return INFINITE_LEN;
534   }
535 }
536 
537 static OnigLen
distance_multiply(OnigLen d,int m)538 distance_multiply(OnigLen d, int m)
539 {
540   if (m == 0) return 0;
541 
542   if (d < INFINITE_LEN / m)
543     return d * m;
544   else
545     return INFINITE_LEN;
546 }
547 
548 static int
bitset_is_empty(BitSetRef bs)549 bitset_is_empty(BitSetRef bs)
550 {
551   int i;
552 
553   for (i = 0; i < (int )BITSET_REAL_SIZE; i++) {
554     if (bs[i] != 0) return 0;
555   }
556   return 1;
557 }
558 
559 #ifdef USE_CALL
560 
561 static int
unset_addr_list_init(UnsetAddrList * list,int size)562 unset_addr_list_init(UnsetAddrList* list, int size)
563 {
564   UnsetAddr* p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
565   CHECK_NULL_RETURN_MEMERR(p);
566 
567   list->num   = 0;
568   list->alloc = size;
569   list->us    = p;
570   return 0;
571 }
572 
573 static void
unset_addr_list_end(UnsetAddrList * list)574 unset_addr_list_end(UnsetAddrList* list)
575 {
576   if (IS_NOT_NULL(list->us))
577     xfree(list->us);
578 }
579 
580 static int
unset_addr_list_add(UnsetAddrList * list,int offset,struct _Node * node)581 unset_addr_list_add(UnsetAddrList* list, int offset, struct _Node* node)
582 {
583   UnsetAddr* p;
584   int size;
585 
586   if (list->num >= list->alloc) {
587     size = list->alloc * 2;
588     p = (UnsetAddr* )xrealloc(list->us, sizeof(UnsetAddr) * size);
589     CHECK_NULL_RETURN_MEMERR(p);
590     list->alloc = size;
591     list->us    = p;
592   }
593 
594   list->us[list->num].offset = offset;
595   list->us[list->num].target = node;
596   list->num++;
597   return 0;
598 }
599 #endif /* USE_CALL */
600 
601 enum CharLenReturnType {
602   CHAR_LEN_NORMAL = 0,       /* fixed or variable */
603   CHAR_LEN_TOP_ALT_FIXED = 1
604 };
605 
606 static int
mmcl_fixed(MinMaxCharLen * c)607 mmcl_fixed(MinMaxCharLen* c)
608 {
609   return (c->min == c->max && c->min != INFINITE_LEN);
610 }
611 
612 static void
mmcl_set(MinMaxCharLen * l,OnigLen len)613 mmcl_set(MinMaxCharLen* l, OnigLen len)
614 {
615   l->min = len;
616   l->max = len;
617   l->min_is_sure = TRUE;
618 }
619 
620 static void
mmcl_set_min_max(MinMaxCharLen * l,OnigLen min,OnigLen max,int min_is_sure)621 mmcl_set_min_max(MinMaxCharLen* l, OnigLen min, OnigLen max, int min_is_sure)
622 {
623   l->min = min;
624   l->max = max;
625   l->min_is_sure = min_is_sure;
626 }
627 
628 static void
mmcl_add(MinMaxCharLen * to,MinMaxCharLen * add)629 mmcl_add(MinMaxCharLen* to, MinMaxCharLen* add)
630 {
631   to->min = distance_add(to->min, add->min);
632   to->max = distance_add(to->max, add->max);
633 
634   to->min_is_sure = add->min_is_sure != 0 && to->min_is_sure != 0;
635 }
636 
637 static void
mmcl_multiply(MinMaxCharLen * to,int m)638 mmcl_multiply(MinMaxCharLen* to, int m)
639 {
640   to->min = distance_multiply(to->min, m);
641   to->max = distance_multiply(to->max, m);
642 }
643 
644 static void
mmcl_repeat_range_multiply(MinMaxCharLen * to,int mlow,int mhigh)645 mmcl_repeat_range_multiply(MinMaxCharLen* to, int mlow, int mhigh)
646 {
647   to->min = distance_multiply(to->min, mlow);
648 
649   if (IS_INFINITE_REPEAT(mhigh))
650     to->max = INFINITE_LEN;
651   else
652     to->max = distance_multiply(to->max, mhigh);
653 }
654 
655 static void
mmcl_alt_merge(MinMaxCharLen * to,MinMaxCharLen * alt)656 mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt)
657 {
658   if (to->min > alt->min) {
659     to->min = alt->min;
660     if (alt->min_is_sure != 0)
661       to->min_is_sure = TRUE;
662   }
663 
664   if (to->max < alt->max) to->max = alt->max;
665 }
666 
667 static int
mml_is_equal(MinMaxLen * a,MinMaxLen * b)668 mml_is_equal(MinMaxLen* a, MinMaxLen* b)
669 {
670   return a->min == b->min && a->max == b->max;
671 }
672 
673 static void
mml_set_min_max(MinMaxLen * l,OnigLen min,OnigLen max)674 mml_set_min_max(MinMaxLen* l, OnigLen min, OnigLen max)
675 {
676   l->min = min;
677   l->max = max;
678 }
679 
680 static void
mml_clear(MinMaxLen * l)681 mml_clear(MinMaxLen* l)
682 {
683   l->min = l->max = 0;
684 }
685 
686 static void
mml_copy(MinMaxLen * to,MinMaxLen * from)687 mml_copy(MinMaxLen* to, MinMaxLen* from)
688 {
689   to->min = from->min;
690   to->max = from->max;
691 }
692 
693 static void
mml_add(MinMaxLen * to,MinMaxLen * add)694 mml_add(MinMaxLen* to, MinMaxLen* add)
695 {
696   to->min = distance_add(to->min, add->min);
697   to->max = distance_add(to->max, add->max);
698 }
699 
700 static void
mml_alt_merge(MinMaxLen * to,MinMaxLen * alt)701 mml_alt_merge(MinMaxLen* to, MinMaxLen* alt)
702 {
703   if (to->min > alt->min) to->min = alt->min;
704   if (to->max < alt->max) to->max = alt->max;
705 }
706 
707 /* fixed size pattern node only */
708 static int
node_char_len1(Node * node,regex_t * reg,MinMaxCharLen * ci,ScanEnv * env,int level)709 node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env,
710                int level)
711 {
712   MinMaxCharLen tci;
713   int r = CHAR_LEN_NORMAL;
714 
715   level++;
716 
717   switch (NODE_TYPE(node)) {
718   case NODE_LIST:
719     {
720       int first = TRUE;
721       do {
722         r = node_char_len1(NODE_CAR(node), reg, &tci, env, level);
723         if (r < 0) break;
724         if (first == TRUE) {
725           *ci = tci;
726           first = FALSE;
727         }
728         else
729           mmcl_add(ci, &tci);
730       } while (IS_NOT_NULL(node = NODE_CDR(node)));
731     }
732     break;
733 
734   case NODE_ALT:
735     {
736       int fixed;
737 
738       r = node_char_len1(NODE_CAR(node), reg, ci, env, level);
739       if (r < 0) break;
740 
741       fixed = TRUE;
742       while (IS_NOT_NULL(node = NODE_CDR(node))) {
743         r = node_char_len1(NODE_CAR(node), reg, &tci, env, level);
744         if (r < 0) break;
745         if (! mmcl_fixed(&tci))
746           fixed = FALSE;
747         mmcl_alt_merge(ci, &tci);
748       }
749       if (r < 0) break;
750 
751       r = CHAR_LEN_NORMAL;
752       if (mmcl_fixed(ci)) break;
753 
754       if (fixed == TRUE && level == 1) {
755         r = CHAR_LEN_TOP_ALT_FIXED;
756       }
757     }
758     break;
759 
760   case NODE_STRING:
761     {
762       OnigLen clen;
763       StrNode* sn = STR_(node);
764       UChar *s = sn->s;
765 
766       if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) {
767         /* Such a case is possible.
768            ex. /(?i)(?<=\1)(a)/
769            Backref node refer to capture group, but it doesn't tune yet.
770          */
771         r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
772         break;
773       }
774 
775       clen = 0;
776       while (s < sn->end) {
777         s += enclen(reg->enc, s);
778         clen = distance_add(clen, 1);
779       }
780       mmcl_set(ci, clen);
781     }
782     break;
783 
784   case NODE_QUANT:
785     {
786       QuantNode* qn = QUANT_(node);
787 
788       if (qn->lower == qn->upper) {
789         if (qn->upper == 0) {
790           mmcl_set(ci, 0);
791         }
792         else {
793           r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
794           if (r < 0) break;
795           mmcl_multiply(ci, qn->lower);
796         }
797       }
798       else {
799         r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
800         if (r < 0) break;
801         mmcl_repeat_range_multiply(ci, qn->lower, qn->upper);
802       }
803     }
804     break;
805 
806 #ifdef USE_CALL
807   case NODE_CALL:
808     if (NODE_IS_RECURSION(node))
809       mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE);
810     else
811       r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
812     break;
813 #endif
814 
815   case NODE_CTYPE:
816   case NODE_CCLASS:
817     mmcl_set(ci, 1);
818     break;
819 
820   case NODE_BAG:
821     {
822       BagNode* en = BAG_(node);
823 
824       switch (en->type) {
825       case BAG_MEMORY:
826         if (NODE_IS_FIXED_CLEN(node)) {
827           mmcl_set_min_max(ci, en->min_char_len, en->max_char_len,
828                            NODE_IS_FIXED_CLEN_MIN_SURE(node));
829         }
830         else {
831           r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
832           if (r < 0) break;
833 
834           en->min_char_len = ci->min;
835           en->max_char_len = ci->max;
836           NODE_STATUS_ADD(node, FIXED_CLEN);
837           if (ci->min_is_sure != 0)
838             NODE_STATUS_ADD(node, FIXED_CLEN_MIN_SURE);
839         }
840         /* can't optimize look-behind if capture exists. */
841         ci->min_is_sure = FALSE;
842         break;
843       case BAG_OPTION:
844       case BAG_STOP_BACKTRACK:
845         r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
846         break;
847       case BAG_IF_ELSE:
848         {
849           MinMaxCharLen eci;
850 
851           r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
852           if (r < 0) break;
853 
854           if (IS_NOT_NULL(en->te.Then)) {
855             r = node_char_len1(en->te.Then, reg, &tci, env, level);
856             if (r < 0) break;
857             mmcl_add(ci, &tci);
858           }
859 
860           if (IS_NOT_NULL(en->te.Else)) {
861             r = node_char_len1(en->te.Else, reg, &eci, env, level);
862             if (r < 0) break;
863           }
864           else {
865             mmcl_set(&eci, 0);
866           }
867 
868           mmcl_alt_merge(ci, &eci);
869         }
870         break;
871       default: /* never come here */
872         r = ONIGERR_PARSER_BUG;
873         break;
874       }
875     }
876     break;
877 
878   case NODE_ANCHOR:
879     mmcl_set(ci, 0);
880     /* can't optimize look-behind if anchor exists. */
881     ci->min_is_sure = FALSE;
882     break;
883 
884   case NODE_GIMMICK:
885   zero:
886     mmcl_set(ci, 0);
887     break;
888 
889   case NODE_BACKREF:
890     if (NODE_IS_CHECKER(node))
891       goto zero;
892 
893     if (NODE_IS_RECURSION(node)) {
894 #ifdef USE_BACKREF_WITH_LEVEL
895       if (NODE_IS_NEST_LEVEL(node)) {
896         mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE);
897         break;
898       }
899 #endif
900 
901       mmcl_set_min_max(ci, 0, 0, FALSE);
902       break;
903     }
904 
905     {
906       int i;
907       int* backs;
908       MemEnv* mem_env = SCANENV_MEMENV(env);
909       BackRefNode* br = BACKREF_(node);
910 
911       backs = BACKREFS_P(br);
912       r = node_char_len1(mem_env[backs[0]].mem_node, reg, ci, env, level);
913       if (r < 0) break;
914       if (! mmcl_fixed(ci)) ci->min_is_sure = FALSE;
915 
916       for (i = 1; i < br->back_num; i++) {
917         r = node_char_len1(mem_env[backs[i]].mem_node, reg, &tci, env, level);
918         if (r < 0) break;
919         if (! mmcl_fixed(&tci)) tci.min_is_sure = FALSE;
920         mmcl_alt_merge(ci, &tci);
921       }
922     }
923     break;
924 
925   default: /* never come here */
926     r = ONIGERR_PARSER_BUG;
927     break;
928   }
929 
930   return r;
931 }
932 
933 static int
node_char_len(Node * node,regex_t * reg,MinMaxCharLen * ci,ScanEnv * env)934 node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env)
935 {
936   return node_char_len1(node, reg, ci, env, 0);
937 }
938 
939 
940 static int
add_op(regex_t * reg,int opcode)941 add_op(regex_t* reg, int opcode)
942 {
943   int r;
944 
945   r = ops_new(reg);
946   if (r != ONIG_NORMAL) return r;
947 
948 #ifdef USE_DIRECT_THREADED_CODE
949   *(reg->ocs + (reg->ops_curr - reg->ops)) = opcode;
950 #else
951   reg->ops_curr->opcode = opcode;
952 #endif
953 
954   return 0;
955 }
956 
957 static int compile_length_tree(Node* node, regex_t* reg);
958 static int compile_tree(Node* node, regex_t* reg, ScanEnv* env);
959 
960 
961 #define IS_NEED_STR_LEN_OP(op) \
962    ((op) == OP_STR_N    || (op) == OP_STR_MB2N ||\
963     (op) == OP_STR_MB3N || (op) == OP_STR_MBN)
964 
965 static int
select_str_opcode(int mb_len,int str_len)966 select_str_opcode(int mb_len, int str_len)
967 {
968   int op;
969 
970   switch (mb_len) {
971   case 1:
972     switch (str_len) {
973     case 1:  op = OP_STR_1; break;
974     case 2:  op = OP_STR_2; break;
975     case 3:  op = OP_STR_3; break;
976     case 4:  op = OP_STR_4; break;
977     case 5:  op = OP_STR_5; break;
978     default: op = OP_STR_N; break;
979     }
980     break;
981 
982   case 2:
983     switch (str_len) {
984     case 1:  op = OP_STR_MB2N1; break;
985     case 2:  op = OP_STR_MB2N2; break;
986     case 3:  op = OP_STR_MB2N3; break;
987     default: op = OP_STR_MB2N;  break;
988     }
989     break;
990 
991   case 3:
992     op = OP_STR_MB3N;
993     break;
994 
995   default:
996     op = OP_STR_MBN;
997     break;
998   }
999 
1000   return op;
1001 }
1002 
1003 static int
is_strict_real_node(Node * node)1004 is_strict_real_node(Node* node)
1005 {
1006   switch (NODE_TYPE(node)) {
1007   case NODE_STRING:
1008     {
1009       StrNode* sn = STR_(node);
1010       return (sn->end != sn->s);
1011     }
1012     break;
1013 
1014   case NODE_CCLASS:
1015   case NODE_CTYPE:
1016     return 1;
1017     break;
1018 
1019   default:
1020     return 0;
1021     break;
1022   }
1023 }
1024 
1025 static int
compile_quant_body_with_empty_check(QuantNode * qn,regex_t * reg,ScanEnv * env)1026 compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env)
1027 {
1028   int r;
1029   int saved_num_empty_check;
1030   int emptiness;
1031   Node* body;
1032 
1033   body = NODE_BODY((Node* )qn);
1034   emptiness = qn->emptiness;
1035   saved_num_empty_check = reg->num_empty_check;
1036 
1037   if (emptiness != BODY_IS_NOT_EMPTY) {
1038     r = add_op(reg, OP_EMPTY_CHECK_START);
1039     if (r != 0) return r;
1040     COP(reg)->empty_check_start.mem = reg->num_empty_check; /* NULL CHECK ID */
1041     reg->num_empty_check++;
1042   }
1043 
1044   r = compile_tree(body, reg, env);
1045   if (r != 0) return r;
1046 
1047   if (emptiness != BODY_IS_NOT_EMPTY) {
1048     if (emptiness == BODY_MAY_BE_EMPTY)
1049       r = add_op(reg, OP_EMPTY_CHECK_END);
1050     else if (emptiness == BODY_MAY_BE_EMPTY_MEM) {
1051       if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0)
1052         r = add_op(reg, OP_EMPTY_CHECK_END_MEMST);
1053       else
1054         r = add_op(reg, OP_EMPTY_CHECK_END);
1055     }
1056 #ifdef USE_CALL
1057     else if (emptiness == BODY_MAY_BE_EMPTY_REC)
1058       r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH);
1059 #endif
1060 
1061     if (r != 0) return r;
1062     COP(reg)->empty_check_end.mem = saved_num_empty_check; /* NULL CHECK ID */
1063   }
1064   return r;
1065 }
1066 
1067 #ifdef USE_CALL
1068 static int
compile_call(CallNode * node,regex_t * reg,ScanEnv * env)1069 compile_call(CallNode* node, regex_t* reg, ScanEnv* env)
1070 {
1071   int r;
1072   int offset;
1073 
1074   r = add_op(reg, OP_CALL);
1075   if (r != 0) return r;
1076 
1077   COP(reg)->call.addr = 0; /* dummy addr. */
1078 
1079   offset = COP_CURR_OFFSET_BYTES(reg, call.addr);
1080   r = unset_addr_list_add(env->unset_addr_list, offset, NODE_CALL_BODY(node));
1081   return r;
1082 }
1083 #endif
1084 
1085 static int
compile_tree_n_times(Node * node,int n,regex_t * reg,ScanEnv * env)1086 compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env)
1087 {
1088   int i, r;
1089 
1090   for (i = 0; i < n; i++) {
1091     r = compile_tree(node, reg, env);
1092     if (r != 0) return r;
1093   }
1094   return 0;
1095 }
1096 
1097 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED)1098 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
1099                           regex_t* reg ARG_UNUSED)
1100 {
1101   return 1;
1102 }
1103 
1104 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg)1105 add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg)
1106 {
1107   int op;
1108   int r;
1109   int byte_len;
1110   UChar* p;
1111   UChar* end;
1112 
1113   op = select_str_opcode(mb_len, str_len);
1114   r = add_op(reg, op);
1115   if (r != 0) return r;
1116 
1117   byte_len = mb_len * str_len;
1118   end = s + byte_len;
1119 
1120   if (op == OP_STR_MBN) {
1121     p = onigenc_strdup(reg->enc, s, end);
1122     CHECK_NULL_RETURN_MEMERR(p);
1123 
1124     COP(reg)->exact_len_n.len = mb_len;
1125     COP(reg)->exact_len_n.n   = str_len;
1126     COP(reg)->exact_len_n.s   = p;
1127   }
1128   else if (IS_NEED_STR_LEN_OP(op)) {
1129     p = onigenc_strdup(reg->enc, s, end);
1130     CHECK_NULL_RETURN_MEMERR(p);
1131     COP(reg)->exact_n.n = str_len;
1132     COP(reg)->exact_n.s = p;
1133   }
1134   else {
1135     xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s));
1136     xmemcpy(COP(reg)->exact.s, s, (size_t )byte_len);
1137   }
1138 
1139   return 0;
1140 }
1141 
1142 static int
compile_length_string_node(Node * node,regex_t * reg)1143 compile_length_string_node(Node* node, regex_t* reg)
1144 {
1145   int rlen, r, len, prev_len, slen;
1146   UChar *p, *prev;
1147   StrNode* sn;
1148   OnigEncoding enc = reg->enc;
1149 
1150   sn = STR_(node);
1151   if (sn->end <= sn->s)
1152     return 0;
1153 
1154   p = prev = sn->s;
1155   prev_len = enclen(enc, p);
1156   p += prev_len;
1157   slen = 1;
1158   rlen = 0;
1159 
1160   for (; p < sn->end; ) {
1161     len = enclen(enc, p);
1162     if (len == prev_len) {
1163       slen++;
1164     }
1165     else {
1166       r = add_compile_string_length(prev, prev_len, slen, reg);
1167       rlen += r;
1168       prev = p;
1169       slen = 1;
1170       prev_len = len;
1171     }
1172     p += len;
1173   }
1174 
1175   r = add_compile_string_length(prev, prev_len, slen, reg);
1176   rlen += r;
1177   return rlen;
1178 }
1179 
1180 static int
compile_length_string_crude_node(StrNode * sn,regex_t * reg)1181 compile_length_string_crude_node(StrNode* sn, regex_t* reg)
1182 {
1183   if (sn->end <= sn->s)
1184     return 0;
1185 
1186   return add_compile_string_length(sn->s, 1 /* sb */, (int )(sn->end - sn->s),
1187                                    reg);
1188 }
1189 
1190 static int
compile_string_node(Node * node,regex_t * reg)1191 compile_string_node(Node* node, regex_t* reg)
1192 {
1193   int r, len, prev_len, slen;
1194   UChar *p, *prev, *end;
1195   StrNode* sn;
1196   OnigEncoding enc = reg->enc;
1197 
1198   sn = STR_(node);
1199   if (sn->end <= sn->s)
1200     return 0;
1201 
1202   end = sn->end;
1203 
1204   p = prev = sn->s;
1205   prev_len = enclen(enc, p);
1206   p += prev_len;
1207   slen = 1;
1208 
1209   for (; p < end; ) {
1210     len = enclen(enc, p);
1211     if (len == prev_len) {
1212       slen++;
1213     }
1214     else {
1215       r = add_compile_string(prev, prev_len, slen, reg);
1216       if (r != 0) return r;
1217 
1218       prev  = p;
1219       slen  = 1;
1220       prev_len = len;
1221     }
1222 
1223     p += len;
1224   }
1225 
1226   return add_compile_string(prev, prev_len, slen, reg);
1227 }
1228 
1229 static int
compile_string_crude_node(StrNode * sn,regex_t * reg)1230 compile_string_crude_node(StrNode* sn, regex_t* reg)
1231 {
1232   if (sn->end <= sn->s)
1233     return 0;
1234 
1235   return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg);
1236 }
1237 
1238 static void*
set_multi_byte_cclass(BBuf * mbuf,regex_t * reg)1239 set_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
1240 {
1241   size_t len;
1242   void* p;
1243 
1244   len = (size_t )mbuf->used;
1245   p = xmalloc(len);
1246   if (IS_NULL(p)) return NULL;
1247 
1248   xmemcpy(p, mbuf->p, len);
1249   return p;
1250 }
1251 
1252 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)1253 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
1254 {
1255   return 1;
1256 }
1257 
1258 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)1259 compile_cclass_node(CClassNode* cc, regex_t* reg)
1260 {
1261   int r;
1262 
1263   if (IS_NULL(cc->mbuf)) {
1264     r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_NOT : OP_CCLASS);
1265     if (r != 0) return r;
1266 
1267     COP(reg)->cclass.bsp = xmalloc(SIZE_BITSET);
1268     CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass.bsp);
1269     xmemcpy(COP(reg)->cclass.bsp, cc->bs, SIZE_BITSET);
1270   }
1271   else {
1272     void* p;
1273 
1274     if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
1275       r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MB_NOT : OP_CCLASS_MB);
1276       if (r != 0) return r;
1277 
1278       p = set_multi_byte_cclass(cc->mbuf, reg);
1279       CHECK_NULL_RETURN_MEMERR(p);
1280       COP(reg)->cclass_mb.mb = p;
1281     }
1282     else {
1283       r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MIX_NOT : OP_CCLASS_MIX);
1284       if (r != 0) return r;
1285 
1286       COP(reg)->cclass_mix.bsp = xmalloc(SIZE_BITSET);
1287       CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass_mix.bsp);
1288       xmemcpy(COP(reg)->cclass_mix.bsp, cc->bs, SIZE_BITSET);
1289 
1290       p = set_multi_byte_cclass(cc->mbuf, reg);
1291       CHECK_NULL_RETURN_MEMERR(p);
1292       COP(reg)->cclass_mix.mb = p;
1293     }
1294   }
1295 
1296   return 0;
1297 }
1298 
1299 static void
set_addr_in_repeat_range(regex_t * reg)1300 set_addr_in_repeat_range(regex_t* reg)
1301 {
1302   int i;
1303 
1304   for (i = 0; i < reg->num_repeat; i++) {
1305     RepeatRange* p = reg->repeat_range + i;
1306     int offset = p->u.offset;
1307     p->u.pcode = reg->ops + offset;
1308   }
1309 }
1310 
1311 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper,int ops_index)1312 entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index)
1313 {
1314 #define REPEAT_RANGE_ALLOC  4
1315 
1316   RepeatRange* p;
1317 
1318   if (reg->repeat_range_alloc == 0) {
1319     p = (RepeatRange* )xmalloc(sizeof(RepeatRange) * REPEAT_RANGE_ALLOC);
1320     CHECK_NULL_RETURN_MEMERR(p);
1321     reg->repeat_range = p;
1322     reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
1323   }
1324   else if (reg->repeat_range_alloc <= id) {
1325     int n;
1326     n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
1327     p = (RepeatRange* )xrealloc(reg->repeat_range, sizeof(RepeatRange) * n);
1328     CHECK_NULL_RETURN_MEMERR(p);
1329     reg->repeat_range = p;
1330     reg->repeat_range_alloc = n;
1331   }
1332   else {
1333     p = reg->repeat_range;
1334   }
1335 
1336   p[id].lower    = lower;
1337   p[id].upper    = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper);
1338   p[id].u.offset = ops_index;
1339   return 0;
1340 }
1341 
1342 static int
compile_range_repeat_node(QuantNode * qn,int target_len,int emptiness,regex_t * reg,ScanEnv * env)1343 compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness,
1344                           regex_t* reg, ScanEnv* env)
1345 {
1346   int r;
1347   int num_repeat = reg->num_repeat++;
1348 
1349   r = add_op(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
1350   if (r != 0) return r;
1351 
1352   COP(reg)->repeat.id   = num_repeat;
1353   COP(reg)->repeat.addr = SIZE_INC + target_len + OPSIZE_REPEAT_INC;
1354 
1355   r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper,
1356                          COP_CURR_OFFSET(reg) + OPSIZE_REPEAT);
1357   if (r != 0) return r;
1358 
1359   r = compile_quant_body_with_empty_check(qn, reg, env);
1360   if (r != 0) return r;
1361 
1362   r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
1363   if (r != 0) return r;
1364 
1365   COP(reg)->repeat_inc.id = num_repeat;
1366   return r;
1367 }
1368 
1369 static int
is_anychar_infinite_greedy(QuantNode * qn)1370 is_anychar_infinite_greedy(QuantNode* qn)
1371 {
1372   if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) &&
1373       NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn)))
1374     return 1;
1375   else
1376     return 0;
1377 }
1378 
1379 #define QUANTIFIER_EXPAND_LIMIT_SIZE   10
1380 #define CKN_ON   (ckn > 0)
1381 
1382 static int
compile_length_quantifier_node(QuantNode * qn,regex_t * reg)1383 compile_length_quantifier_node(QuantNode* qn, regex_t* reg)
1384 {
1385   int len, mod_tlen;
1386   int infinite = IS_INFINITE_REPEAT(qn->upper);
1387   enum BodyEmptyType emptiness = qn->emptiness;
1388   int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1389 
1390   if (tlen < 0) return tlen;
1391   if (tlen == 0) return 0;
1392 
1393   /* anychar repeat */
1394   if (is_anychar_infinite_greedy(qn)) {
1395     if (qn->lower <= 1 ||
1396         len_multiply_cmp((OnigLen )tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) {
1397       if (IS_NOT_NULL(qn->next_head_exact))
1398         return OPSIZE_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
1399       else
1400         return OPSIZE_ANYCHAR_STAR + tlen * qn->lower;
1401     }
1402   }
1403 
1404   mod_tlen = tlen;
1405   if (emptiness != BODY_IS_NOT_EMPTY)
1406     mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END;
1407 
1408   if (infinite &&
1409       (qn->lower <= 1 ||
1410        len_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1411     if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1412       len = OPSIZE_JUMP;
1413     }
1414     else {
1415       len = tlen * qn->lower;
1416     }
1417 
1418     if (qn->greedy) {
1419 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1420       if (IS_NOT_NULL(qn->head_exact))
1421         len += OPSIZE_PUSH_OR_JUMP_EXACT1 + mod_tlen + OPSIZE_JUMP;
1422       else
1423 #endif
1424       if (IS_NOT_NULL(qn->next_head_exact))
1425         len += OPSIZE_PUSH_IF_PEEK_NEXT + mod_tlen + OPSIZE_JUMP;
1426       else
1427         len += OPSIZE_PUSH + mod_tlen + OPSIZE_JUMP;
1428     }
1429     else
1430       len += OPSIZE_JUMP + mod_tlen + OPSIZE_PUSH;
1431   }
1432   else if (qn->upper == 0) {
1433     if (qn->include_referred != 0) { /* /(?<n>..){0}/ */
1434       len = OPSIZE_JUMP + tlen;
1435     }
1436     else
1437       len = 0;
1438   }
1439   else if (!infinite && qn->greedy &&
1440            (qn->upper == 1 ||
1441             len_multiply_cmp((OnigLen )tlen + OPSIZE_PUSH, qn->upper,
1442                              QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1443     len = tlen * qn->lower;
1444     len += (OPSIZE_PUSH + tlen) * (qn->upper - qn->lower);
1445   }
1446   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1447     len = OPSIZE_PUSH + OPSIZE_JUMP + tlen;
1448   }
1449   else {
1450     len = OPSIZE_REPEAT_INC + mod_tlen + OPSIZE_REPEAT;
1451   }
1452 
1453   return len;
1454 }
1455 
1456 static int
compile_quantifier_node(QuantNode * qn,regex_t * reg,ScanEnv * env)1457 compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)
1458 {
1459   int i, r, mod_tlen;
1460   int infinite = IS_INFINITE_REPEAT(qn->upper);
1461   enum BodyEmptyType emptiness = qn->emptiness;
1462   int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1463 
1464   if (tlen < 0) return tlen;
1465   if (tlen == 0) return 0;
1466 
1467   if (is_anychar_infinite_greedy(qn) &&
1468       (qn->lower <= 1 ||
1469        len_multiply_cmp((OnigLen )tlen, qn->lower,
1470                         QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1471     r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1472     if (r != 0) return r;
1473     if (IS_NOT_NULL(qn->next_head_exact)) {
1474       r = add_op(reg, NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)) ?
1475                  OP_ANYCHAR_ML_STAR_PEEK_NEXT : OP_ANYCHAR_STAR_PEEK_NEXT);
1476       if (r != 0) return r;
1477 
1478       COP(reg)->anychar_star_peek_next.c = STR_(qn->next_head_exact)->s[0];
1479       return 0;
1480     }
1481     else {
1482       r = add_op(reg, NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)) ?
1483                  OP_ANYCHAR_ML_STAR : OP_ANYCHAR_STAR);
1484       return r;
1485     }
1486   }
1487 
1488   mod_tlen = tlen;
1489   if (emptiness != BODY_IS_NOT_EMPTY)
1490     mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END;
1491 
1492   if (infinite &&
1493       (qn->lower <= 1 ||
1494        len_multiply_cmp((OnigLen )tlen, qn->lower,
1495                         QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1496     int addr;
1497 
1498     if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1499       r = add_op(reg, OP_JUMP);
1500       if (r != 0) return r;
1501       if (qn->greedy) {
1502 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1503         if (IS_NOT_NULL(qn->head_exact))
1504           COP(reg)->jump.addr = OPSIZE_PUSH_OR_JUMP_EXACT1 + SIZE_INC;
1505         else
1506 #endif
1507         if (IS_NOT_NULL(qn->next_head_exact))
1508           COP(reg)->jump.addr = OPSIZE_PUSH_IF_PEEK_NEXT + SIZE_INC;
1509         else
1510           COP(reg)->jump.addr = OPSIZE_PUSH + SIZE_INC;
1511       }
1512       else {
1513         COP(reg)->jump.addr = OPSIZE_JUMP + SIZE_INC;
1514       }
1515     }
1516     else {
1517       r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1518       if (r != 0) return r;
1519     }
1520 
1521     if (qn->greedy) {
1522 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1523       if (IS_NOT_NULL(qn->head_exact)) {
1524         r = add_op(reg, OP_PUSH_OR_JUMP_EXACT1);
1525         if (r != 0) return r;
1526         COP(reg)->push_or_jump_exact1.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1527         COP(reg)->push_or_jump_exact1.c    = STR_(qn->head_exact)->s[0];
1528 
1529         r = compile_quant_body_with_empty_check(qn, reg, env);
1530         if (r != 0) return r;
1531 
1532         addr = -(mod_tlen + (int )OPSIZE_PUSH_OR_JUMP_EXACT1);
1533       }
1534       else
1535 #endif
1536       if (IS_NOT_NULL(qn->next_head_exact)) {
1537         r = add_op(reg, OP_PUSH_IF_PEEK_NEXT);
1538         if (r != 0) return r;
1539         COP(reg)->push_if_peek_next.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1540         COP(reg)->push_if_peek_next.c    = STR_(qn->next_head_exact)->s[0];
1541 
1542         r = compile_quant_body_with_empty_check(qn, reg, env);
1543         if (r != 0) return r;
1544 
1545         addr = -(mod_tlen + (int )OPSIZE_PUSH_IF_PEEK_NEXT);
1546       }
1547       else {
1548         r = add_op(reg, OP_PUSH);
1549         if (r != 0) return r;
1550         COP(reg)->push.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1551 
1552         r = compile_quant_body_with_empty_check(qn, reg, env);
1553         if (r != 0) return r;
1554 
1555         addr = -(mod_tlen + (int )OPSIZE_PUSH);
1556       }
1557 
1558       r = add_op(reg, OP_JUMP);
1559       if (r != 0) return r;
1560       COP(reg)->jump.addr = addr;
1561     }
1562     else {
1563       r = add_op(reg, OP_JUMP);
1564       if (r != 0) return r;
1565       COP(reg)->jump.addr = mod_tlen + SIZE_INC;
1566 
1567       r = compile_quant_body_with_empty_check(qn, reg, env);
1568       if (r != 0) return r;
1569 
1570       r = add_op(reg, OP_PUSH);
1571       if (r != 0) return r;
1572       COP(reg)->push.addr = -mod_tlen;
1573     }
1574   }
1575   else if (qn->upper == 0) {
1576     if (qn->include_referred != 0) { /* /(?<n>..){0}/ */
1577       r = add_op(reg, OP_JUMP);
1578       if (r != 0) return r;
1579       COP(reg)->jump.addr = tlen + SIZE_INC;
1580 
1581       r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1582     }
1583     else {
1584       /* Nothing output */
1585       r = 0;
1586     }
1587   }
1588   else if (! infinite && qn->greedy &&
1589            (qn->upper == 1 ||
1590             len_multiply_cmp((OnigLen )tlen + OPSIZE_PUSH, qn->upper,
1591                              QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1592     int n = qn->upper - qn->lower;
1593 
1594     r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1595     if (r != 0) return r;
1596 
1597     for (i = 0; i < n; i++) {
1598       int v = onig_positive_int_multiply(n - i, tlen + OPSIZE_PUSH);
1599       if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1600 
1601       r = add_op(reg, OP_PUSH);
1602       if (r != 0) return r;
1603       COP(reg)->push.addr = v;
1604 
1605       r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1606       if (r != 0) return r;
1607     }
1608   }
1609   else if (! qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1610     r = add_op(reg, OP_PUSH);
1611     if (r != 0) return r;
1612     COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP;
1613 
1614     r = add_op(reg, OP_JUMP);
1615     if (r != 0) return r;
1616     COP(reg)->jump.addr = tlen + SIZE_INC;
1617 
1618     r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1619   }
1620   else {
1621     r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env);
1622   }
1623   return r;
1624 }
1625 
1626 static int
compile_length_option_node(BagNode * node,regex_t * reg)1627 compile_length_option_node(BagNode* node, regex_t* reg)
1628 {
1629   int tlen;
1630 
1631   tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1632 
1633   return tlen;
1634 }
1635 
1636 static int
compile_option_node(BagNode * node,regex_t * reg,ScanEnv * env)1637 compile_option_node(BagNode* node, regex_t* reg, ScanEnv* env)
1638 {
1639   int r;
1640 
1641   r = compile_tree(NODE_BAG_BODY(node), reg, env);
1642 
1643   return r;
1644 }
1645 
1646 static int
compile_length_bag_node(BagNode * node,regex_t * reg)1647 compile_length_bag_node(BagNode* node, regex_t* reg)
1648 {
1649   int len;
1650   int tlen;
1651 
1652   if (node->type == BAG_OPTION)
1653     return compile_length_option_node(node, reg);
1654 
1655   if (NODE_BAG_BODY(node)) {
1656     tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1657     if (tlen < 0) return tlen;
1658   }
1659   else
1660     tlen = 0;
1661 
1662   switch (node->type) {
1663   case BAG_MEMORY:
1664 #ifdef USE_CALL
1665 
1666     if (node->m.regnum == 0 && NODE_IS_CALLED(node)) {
1667       len = tlen + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN;
1668       return len;
1669     }
1670 
1671     if (NODE_IS_CALLED(node)) {
1672       len = OPSIZE_MEM_START_PUSH + tlen
1673         + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN;
1674       if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1675         len += (NODE_IS_RECURSION(node)
1676                 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH);
1677       else
1678         len += (NODE_IS_RECURSION(node)
1679                 ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END);
1680     }
1681     else if (NODE_IS_RECURSION(node)) {
1682       len = OPSIZE_MEM_START_PUSH;
1683       len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)
1684                      ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_REC);
1685     }
1686     else
1687 #endif
1688     {
1689       if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum))
1690         len = OPSIZE_MEM_START_PUSH;
1691       else
1692         len = OPSIZE_MEM_START;
1693 
1694       len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)
1695                      ? OPSIZE_MEM_END_PUSH : OPSIZE_MEM_END);
1696     }
1697     break;
1698 
1699   case BAG_STOP_BACKTRACK:
1700     if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1701       int v;
1702       QuantNode* qn;
1703 
1704       qn = QUANT_(NODE_BAG_BODY(node));
1705       tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1706       if (tlen < 0) return tlen;
1707 
1708       v = onig_positive_int_multiply(qn->lower, tlen);
1709       if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1710       len = v + OPSIZE_PUSH + tlen + OPSIZE_POP + OPSIZE_JUMP;
1711     }
1712     else {
1713       len = OPSIZE_MARK + tlen + OPSIZE_CUT_TO_MARK;
1714     }
1715     break;
1716 
1717   case BAG_IF_ELSE:
1718     {
1719       Node* cond = NODE_BAG_BODY(node);
1720       Node* Then = node->te.Then;
1721       Node* Else = node->te.Else;
1722 
1723       len = compile_length_tree(cond, reg);
1724       if (len < 0) return len;
1725       len += OPSIZE_PUSH + OPSIZE_MARK + OPSIZE_CUT_TO_MARK;
1726 
1727       if (IS_NOT_NULL(Then)) {
1728         tlen = compile_length_tree(Then, reg);
1729         if (tlen < 0) return tlen;
1730         len += tlen;
1731       }
1732 
1733       len += OPSIZE_JUMP + OPSIZE_CUT_TO_MARK;
1734 
1735       if (IS_NOT_NULL(Else)) {
1736         tlen = compile_length_tree(Else, reg);
1737         if (tlen < 0) return tlen;
1738         len += tlen;
1739       }
1740     }
1741     break;
1742 
1743   case BAG_OPTION:
1744     /* never come here, but set for escape warning */
1745     len = 0;
1746     break;
1747   }
1748 
1749   return len;
1750 }
1751 
1752 static int
compile_bag_memory_node(BagNode * node,regex_t * reg,ScanEnv * env)1753 compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env)
1754 {
1755   int r;
1756 
1757 #ifdef USE_CALL
1758   if (NODE_IS_CALLED(node)) {
1759     int len;
1760 
1761     r = add_op(reg, OP_CALL);
1762     if (r != 0) return r;
1763 
1764     node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + OPSIZE_JUMP;
1765     NODE_STATUS_ADD(node, FIXED_ADDR);
1766     COP(reg)->call.addr = (int )node->m.called_addr;
1767 
1768     if (node->m.regnum == 0) {
1769       len = compile_length_tree(NODE_BAG_BODY(node), reg);
1770       len += OPSIZE_RETURN;
1771 
1772       r = add_op(reg, OP_JUMP);
1773       if (r != 0) return r;
1774       COP(reg)->jump.addr = len + SIZE_INC;
1775 
1776       r = compile_tree(NODE_BAG_BODY(node), reg, env);
1777       if (r != 0) return r;
1778 
1779       r = add_op(reg, OP_RETURN);
1780       return r;
1781     }
1782     else {
1783       len = compile_length_tree(NODE_BAG_BODY(node), reg);
1784       len += (OPSIZE_MEM_START_PUSH + OPSIZE_RETURN);
1785       if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1786         len += (NODE_IS_RECURSION(node)
1787                 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH);
1788       else
1789         len += (NODE_IS_RECURSION(node) ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END);
1790 
1791       r = add_op(reg, OP_JUMP);
1792       if (r != 0) return r;
1793       COP(reg)->jump.addr = len + SIZE_INC;
1794     }
1795   }
1796 #endif
1797 
1798   if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum))
1799     r = add_op(reg, OP_MEM_START_PUSH);
1800   else
1801     r = add_op(reg, OP_MEM_START);
1802   if (r != 0) return r;
1803   COP(reg)->memory_start.num = node->m.regnum;
1804 
1805   r = compile_tree(NODE_BAG_BODY(node), reg, env);
1806   if (r != 0) return r;
1807 
1808 #ifdef USE_CALL
1809   if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1810     r = add_op(reg, (NODE_IS_RECURSION(node)
1811                      ? OP_MEM_END_PUSH_REC : OP_MEM_END_PUSH));
1812   else
1813     r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEM_END_REC : OP_MEM_END));
1814   if (r != 0) return r;
1815   COP(reg)->memory_end.num = node->m.regnum;
1816 
1817   if (NODE_IS_CALLED(node)) {
1818     if (r != 0) return r;
1819     r = add_op(reg, OP_RETURN);
1820   }
1821 #else
1822   if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1823     r = add_op(reg, OP_MEM_END_PUSH);
1824   else
1825     r = add_op(reg, OP_MEM_END);
1826   if (r != 0) return r;
1827   COP(reg)->memory_end.num = node->m.regnum;
1828 #endif
1829 
1830   return r;
1831 }
1832 
1833 static int
compile_bag_node(BagNode * node,regex_t * reg,ScanEnv * env)1834 compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)
1835 {
1836   int r, len;
1837 
1838   switch (node->type) {
1839   case BAG_MEMORY:
1840     r = compile_bag_memory_node(node, reg, env);
1841     break;
1842 
1843   case BAG_OPTION:
1844     r = compile_option_node(node, reg, env);
1845     break;
1846 
1847   case BAG_STOP_BACKTRACK:
1848     if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1849       QuantNode* qn = QUANT_(NODE_BAG_BODY(node));
1850       r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1851       if (r != 0) return r;
1852 
1853       len = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1854       if (len < 0) return len;
1855 
1856       r = add_op(reg, OP_PUSH);
1857       if (r != 0) return r;
1858       COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP + OPSIZE_JUMP;
1859 
1860       r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1861       if (r != 0) return r;
1862       r = add_op(reg, OP_POP);
1863       if (r != 0) return r;
1864 
1865       r = add_op(reg, OP_JUMP);
1866       if (r != 0) return r;
1867       COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP);
1868     }
1869     else {
1870       MemNumType mid;
1871 
1872       ID_ENTRY(env, mid);
1873       r = add_op(reg, OP_MARK);
1874       if (r != 0) return r;
1875       COP(reg)->mark.id = mid;
1876       COP(reg)->mark.save_pos = 0;
1877 
1878       r = compile_tree(NODE_BAG_BODY(node), reg, env);
1879       if (r != 0) return r;
1880       r = add_op(reg, OP_CUT_TO_MARK);
1881       if (r != 0) return r;
1882       COP(reg)->cut_to_mark.id = mid;
1883       COP(reg)->cut_to_mark.restore_pos = 0;
1884     }
1885     break;
1886 
1887   case BAG_IF_ELSE:
1888     {
1889       int cond_len, then_len, else_len, jump_len;
1890       MemNumType mid;
1891       Node* cond = NODE_BAG_BODY(node);
1892       Node* Then = node->te.Then;
1893       Node* Else = node->te.Else;
1894 
1895       ID_ENTRY(env, mid);
1896 
1897       r = add_op(reg, OP_MARK);
1898       if (r != 0) return r;
1899       COP(reg)->mark.id = mid;
1900       COP(reg)->mark.save_pos = 0;
1901 
1902       cond_len = compile_length_tree(cond, reg);
1903       if (cond_len < 0) return cond_len;
1904       if (IS_NOT_NULL(Then)) {
1905         then_len = compile_length_tree(Then, reg);
1906         if (then_len < 0) return then_len;
1907       }
1908       else
1909         then_len = 0;
1910 
1911       jump_len = cond_len + then_len + OPSIZE_CUT_TO_MARK + OPSIZE_JUMP;
1912 
1913       r = add_op(reg, OP_PUSH);
1914       if (r != 0) return r;
1915       COP(reg)->push.addr = SIZE_INC + jump_len;
1916 
1917       r = compile_tree(cond, reg, env);
1918       if (r != 0) return r;
1919       r = add_op(reg, OP_CUT_TO_MARK);
1920       if (r != 0) return r;
1921       COP(reg)->cut_to_mark.id = mid;
1922       COP(reg)->cut_to_mark.restore_pos = 0;
1923 
1924       if (IS_NOT_NULL(Then)) {
1925         r = compile_tree(Then, reg, env);
1926         if (r != 0) return r;
1927       }
1928 
1929       if (IS_NOT_NULL(Else)) {
1930         else_len = compile_length_tree(Else, reg);
1931         if (else_len < 0) return else_len;
1932       }
1933       else
1934         else_len = 0;
1935 
1936       r = add_op(reg, OP_JUMP);
1937       if (r != 0) return r;
1938       COP(reg)->jump.addr = OPSIZE_CUT_TO_MARK + else_len + SIZE_INC;
1939 
1940       r = add_op(reg, OP_CUT_TO_MARK);
1941       if (r != 0) return r;
1942       COP(reg)->cut_to_mark.id = mid;
1943       COP(reg)->cut_to_mark.restore_pos = 0;
1944 
1945       if (IS_NOT_NULL(Else)) {
1946         r = compile_tree(Else, reg, env);
1947       }
1948     }
1949     break;
1950   }
1951 
1952   return r;
1953 }
1954 
1955 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1956 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1957 {
1958   int len;
1959   int tlen = 0;
1960 
1961   if (IS_NOT_NULL(NODE_ANCHOR_BODY(node))) {
1962     tlen = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1963     if (tlen < 0) return tlen;
1964   }
1965 
1966   switch (node->type) {
1967   case ANCR_PREC_READ:
1968     len = OPSIZE_MARK + tlen + OPSIZE_CUT_TO_MARK;
1969     break;
1970   case ANCR_PREC_READ_NOT:
1971     len = OPSIZE_PUSH + OPSIZE_MARK + tlen + OPSIZE_POP_TO_MARK + OPSIZE_POP + OPSIZE_FAIL;
1972     break;
1973   case ANCR_LOOK_BEHIND:
1974     if (node->char_min_len == node->char_max_len)
1975       len = OPSIZE_MARK + OPSIZE_STEP_BACK_START + tlen + OPSIZE_CUT_TO_MARK;
1976     else {
1977       len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_UPDATE_VAR + OPSIZE_FAIL + OPSIZE_JUMP + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_CUT_TO_MARK + OPSIZE_UPDATE_VAR;
1978 
1979       if (IS_NOT_NULL(node->lead_node)) {
1980         int llen = compile_length_tree(node->lead_node, reg);
1981         if (llen < 0) return llen;
1982 
1983         len += OPSIZE_MOVE + llen;
1984       }
1985     }
1986     break;
1987   case ANCR_LOOK_BEHIND_NOT:
1988     if (node->char_min_len == node->char_max_len)
1989       len = OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + tlen + OPSIZE_POP_TO_MARK + OPSIZE_FAIL + OPSIZE_POP;
1990     else {
1991       len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_POP;
1992 
1993       if (IS_NOT_NULL(node->lead_node)) {
1994         int llen = compile_length_tree(node->lead_node, reg);
1995         if (llen < 0) return llen;
1996 
1997         len += OPSIZE_MOVE + llen;
1998       }
1999     }
2000     break;
2001 
2002   case ANCR_WORD_BOUNDARY:
2003   case ANCR_NO_WORD_BOUNDARY:
2004 #ifdef USE_WORD_BEGIN_END
2005   case ANCR_WORD_BEGIN:
2006   case ANCR_WORD_END:
2007 #endif
2008     len = OPSIZE_WORD_BOUNDARY;
2009     break;
2010 
2011   case ANCR_TEXT_SEGMENT_BOUNDARY:
2012   case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
2013     len = SIZE_OPCODE;
2014     break;
2015 
2016   default:
2017     len = SIZE_OPCODE;
2018     break;
2019   }
2020 
2021   return len;
2022 }
2023 
2024 static int
compile_anchor_look_behind_node(AnchorNode * node,regex_t * reg,ScanEnv * env)2025 compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ScanEnv* env)
2026 {
2027   int r;
2028 
2029   if (node->char_min_len == node->char_max_len) {
2030     MemNumType mid;
2031 
2032     ID_ENTRY(env, mid);
2033     r = add_op(reg, OP_MARK);
2034     if (r != 0) return r;
2035     COP(reg)->mark.id = mid;
2036     COP(reg)->mark.save_pos = FALSE;
2037 
2038     r = add_op(reg, OP_STEP_BACK_START);
2039     if (r != 0) return r;
2040     COP(reg)->step_back_start.initial   = node->char_min_len;
2041     COP(reg)->step_back_start.remaining = 0;
2042     COP(reg)->step_back_start.addr      = 1;
2043 
2044     r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2045     if (r != 0) return r;
2046 
2047     r = add_op(reg, OP_CUT_TO_MARK);
2048     if (r != 0) return r;
2049     COP(reg)->cut_to_mark.id = mid;
2050     COP(reg)->cut_to_mark.restore_pos = FALSE;
2051   }
2052   else {
2053     MemNumType mid1, mid2;
2054     OnigLen diff;
2055 
2056     if (IS_NOT_NULL(node->lead_node)) {
2057       MinMaxCharLen ci;
2058 
2059       r = node_char_len(node->lead_node, reg, &ci, env);
2060       if (r < 0) return r;
2061       r = add_op(reg, OP_MOVE);
2062       if (r != 0) return r;
2063       COP(reg)->move.n = -((RelPositionType )ci.min);
2064       r = compile_tree(node->lead_node, reg, env);
2065       if (r != 0) return r;
2066     }
2067 
2068     ID_ENTRY(env, mid1);
2069     r = add_op(reg, OP_SAVE_VAL);
2070     if (r != 0) return r;
2071     COP(reg)->save_val.type = SAVE_RIGHT_RANGE;
2072     COP(reg)->save_val.id   = mid1;
2073 
2074     r = add_op(reg, OP_UPDATE_VAR);
2075     if (r != 0) return r;
2076     COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_TO_S;
2077 
2078     ID_ENTRY(env, mid2);
2079     r = add_op(reg, OP_MARK);
2080     if (r != 0) return r;
2081     COP(reg)->mark.id = mid2;
2082     COP(reg)->mark.save_pos = FALSE;
2083 
2084     r = add_op(reg, OP_PUSH);
2085     if (r != 0) return r;
2086     COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP;
2087 
2088     r = add_op(reg, OP_JUMP);
2089     if (r != 0) return r;
2090     COP(reg)->jump.addr = SIZE_INC + OPSIZE_UPDATE_VAR + OPSIZE_FAIL;
2091 
2092     r = add_op(reg, OP_UPDATE_VAR);
2093     if (r != 0) return r;
2094     COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2095     COP(reg)->update_var.id    = mid1;
2096     COP(reg)->update_var.clear = FALSE;
2097     r = add_op(reg, OP_FAIL);
2098     if (r != 0) return r;
2099 
2100     r = add_op(reg, OP_STEP_BACK_START);
2101     if (r != 0) return r;
2102 
2103     if (node->char_max_len != INFINITE_LEN)
2104       diff = node->char_max_len - node->char_min_len;
2105     else
2106       diff = INFINITE_LEN;
2107 
2108     COP(reg)->step_back_start.initial   = node->char_min_len;
2109     COP(reg)->step_back_start.remaining = diff;
2110     COP(reg)->step_back_start.addr      = 2;
2111 
2112     r = add_op(reg, OP_STEP_BACK_NEXT);
2113     if (r != 0) return r;
2114 
2115     r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2116     if (r != 0) return r;
2117 
2118     r = add_op(reg, OP_CHECK_POSITION);
2119     if (r != 0) return r;
2120     COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE;
2121 
2122     r = add_op(reg, OP_CUT_TO_MARK);
2123     if (r != 0) return r;
2124     COP(reg)->cut_to_mark.id = mid2;
2125     COP(reg)->cut_to_mark.restore_pos = FALSE;
2126 
2127     r = add_op(reg, OP_UPDATE_VAR);
2128     if (r != 0) return r;
2129     COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2130     COP(reg)->update_var.id    = mid1;
2131     COP(reg)->update_var.clear = TRUE;
2132   }
2133 
2134   return r;
2135 }
2136 
2137 static int
compile_anchor_look_behind_not_node(AnchorNode * node,regex_t * reg,ScanEnv * env)2138 compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg,
2139                                     ScanEnv* env)
2140 {
2141   int r;
2142   int len;
2143 
2144   len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
2145 
2146   if (node->char_min_len == node->char_max_len) {
2147     MemNumType mid;
2148 
2149     ID_ENTRY(env, mid);
2150     r = add_op(reg, OP_MARK);
2151     if (r != 0) return r;
2152     COP(reg)->mark.id = mid;
2153     COP(reg)->mark.save_pos = FALSE;
2154 
2155     r = add_op(reg, OP_PUSH);
2156     if (r != 0) return r;
2157     COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + len + OPSIZE_POP_TO_MARK + OPSIZE_FAIL;
2158 
2159     r = add_op(reg, OP_STEP_BACK_START);
2160     if (r != 0) return r;
2161     COP(reg)->step_back_start.initial   = node->char_min_len;
2162     COP(reg)->step_back_start.remaining = 0;
2163     COP(reg)->step_back_start.addr      = 1;
2164 
2165     r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2166     if (r != 0) return r;
2167 
2168     r = add_op(reg, OP_POP_TO_MARK);
2169     if (r != 0) return r;
2170     COP(reg)->pop_to_mark.id = mid;
2171     r = add_op(reg, OP_FAIL);
2172     if (r != 0) return r;
2173     r = add_op(reg, OP_POP);
2174   }
2175   else {
2176     MemNumType mid1, mid2;
2177     OnigLen diff;
2178 
2179     ID_ENTRY(env, mid1);
2180     r = add_op(reg, OP_SAVE_VAL);
2181     if (r != 0) return r;
2182     COP(reg)->save_val.type = SAVE_RIGHT_RANGE;
2183     COP(reg)->save_val.id   = mid1;
2184 
2185     r = add_op(reg, OP_UPDATE_VAR);
2186     if (r != 0) return r;
2187     COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_TO_S;
2188 
2189     ID_ENTRY(env, mid2);
2190     r = add_op(reg, OP_MARK);
2191     if (r != 0) return r;
2192     COP(reg)->mark.id = mid2;
2193     COP(reg)->mark.save_pos = FALSE;
2194 
2195     r = add_op(reg, OP_PUSH);
2196     if (r != 0) return r;
2197     COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + len + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL;
2198 
2199     if (IS_NOT_NULL(node->lead_node)) {
2200       int clen;
2201       MinMaxCharLen ci;
2202 
2203       clen = compile_length_tree(node->lead_node, reg);
2204       COP(reg)->push.addr += OPSIZE_MOVE + clen;
2205 
2206       r = node_char_len(node->lead_node, reg, &ci, env);
2207       if (r < 0) return r;
2208       r = add_op(reg, OP_MOVE);
2209       if (r != 0) return r;
2210       COP(reg)->move.n = -((RelPositionType )ci.min);
2211 
2212       r = compile_tree(node->lead_node, reg, env);
2213       if (r != 0) return r;
2214     }
2215 
2216     r = add_op(reg, OP_STEP_BACK_START);
2217     if (r != 0) return r;
2218 
2219     if (node->char_max_len != INFINITE_LEN)
2220       diff = node->char_max_len - node->char_min_len;
2221     else
2222       diff = INFINITE_LEN;
2223 
2224     COP(reg)->step_back_start.initial   = node->char_min_len;
2225     COP(reg)->step_back_start.remaining = diff;
2226     COP(reg)->step_back_start.addr      = 2;
2227 
2228     r = add_op(reg, OP_STEP_BACK_NEXT);
2229     if (r != 0) return r;
2230 
2231     r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2232     if (r != 0) return r;
2233 
2234     r = add_op(reg, OP_CHECK_POSITION);
2235     if (r != 0) return r;
2236     COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE;
2237 
2238     r = add_op(reg, OP_POP_TO_MARK);
2239     if (r != 0) return r;
2240     COP(reg)->pop_to_mark.id = mid2;
2241 
2242     r = add_op(reg, OP_UPDATE_VAR);
2243     if (r != 0) return r;
2244     COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2245     COP(reg)->update_var.id   = mid1;
2246     COP(reg)->update_var.clear = FALSE;
2247 
2248     r = add_op(reg, OP_POP); /* pop save val */
2249     if (r != 0) return r;
2250     r = add_op(reg, OP_FAIL);
2251     if (r != 0) return r;
2252 
2253     r = add_op(reg, OP_UPDATE_VAR);
2254     if (r != 0) return r;
2255     COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2256     COP(reg)->update_var.id   = mid1;
2257     COP(reg)->update_var.clear = FALSE;
2258 
2259     r = add_op(reg, OP_POP); /* pop mark */
2260     if (r != 0) return r;
2261     r = add_op(reg, OP_POP); /* pop save val */
2262   }
2263 
2264   return r;
2265 }
2266 
2267 static int
compile_anchor_node(AnchorNode * node,regex_t * reg,ScanEnv * env)2268 compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env)
2269 {
2270   int r, len;
2271   enum OpCode op;
2272   MemNumType mid;
2273 
2274   switch (node->type) {
2275   case ANCR_BEGIN_BUF:      r = add_op(reg, OP_BEGIN_BUF);      break;
2276   case ANCR_END_BUF:        r = add_op(reg, OP_END_BUF);        break;
2277   case ANCR_BEGIN_LINE:     r = add_op(reg, OP_BEGIN_LINE);     break;
2278   case ANCR_END_LINE:       r = add_op(reg, OP_END_LINE);       break;
2279   case ANCR_SEMI_END_BUF:   r = add_op(reg, OP_SEMI_END_BUF);   break;
2280   case ANCR_BEGIN_POSITION:
2281     r = add_op(reg, OP_CHECK_POSITION);
2282     if (r != 0) return r;
2283     COP(reg)->check_position.type = CHECK_POSITION_SEARCH_START;
2284     break;
2285 
2286   case ANCR_WORD_BOUNDARY:
2287     op = OP_WORD_BOUNDARY;
2288   word:
2289     r = add_op(reg, op);
2290     if (r != 0) return r;
2291     COP(reg)->word_boundary.mode = (ModeType )node->ascii_mode;
2292     break;
2293 
2294   case ANCR_NO_WORD_BOUNDARY:
2295     op = OP_NO_WORD_BOUNDARY; goto word;
2296     break;
2297 #ifdef USE_WORD_BEGIN_END
2298   case ANCR_WORD_BEGIN:
2299     op = OP_WORD_BEGIN; goto word;
2300     break;
2301   case ANCR_WORD_END:
2302     op = OP_WORD_END; goto word;
2303     break;
2304 #endif
2305 
2306   case ANCR_TEXT_SEGMENT_BOUNDARY:
2307   case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
2308     {
2309       enum TextSegmentBoundaryType type;
2310 
2311       r = add_op(reg, OP_TEXT_SEGMENT_BOUNDARY);
2312       if (r != 0) return r;
2313 
2314       type = EXTENDED_GRAPHEME_CLUSTER_BOUNDARY;
2315 #ifdef USE_UNICODE_WORD_BREAK
2316       if (NODE_IS_TEXT_SEGMENT_WORD(node))
2317         type = WORD_BOUNDARY;
2318 #endif
2319 
2320       COP(reg)->text_segment_boundary.type = type;
2321       COP(reg)->text_segment_boundary.not =
2322         (node->type == ANCR_NO_TEXT_SEGMENT_BOUNDARY ? 1 : 0);
2323     }
2324     break;
2325 
2326   case ANCR_PREC_READ:
2327     {
2328       ID_ENTRY(env, mid);
2329       r = add_op(reg, OP_MARK);
2330       if (r != 0) return r;
2331       COP(reg)->mark.id = mid;
2332       COP(reg)->mark.save_pos = TRUE;
2333 
2334       r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2335       if (r != 0) return r;
2336 
2337       r = add_op(reg, OP_CUT_TO_MARK);
2338       if (r != 0) return r;
2339       COP(reg)->cut_to_mark.id = mid;
2340       COP(reg)->cut_to_mark.restore_pos = TRUE;
2341     }
2342     break;
2343 
2344   case ANCR_PREC_READ_NOT:
2345     {
2346       len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
2347       if (len < 0) return len;
2348 
2349       ID_ENTRY(env, mid);
2350       r = add_op(reg, OP_PUSH);
2351       if (r != 0) return r;
2352       COP(reg)->push.addr = SIZE_INC + OPSIZE_MARK + len +
2353                             OPSIZE_POP_TO_MARK + OPSIZE_POP + OPSIZE_FAIL;
2354 
2355       r = add_op(reg, OP_MARK);
2356       if (r != 0) return r;
2357       COP(reg)->mark.id = mid;
2358       COP(reg)->mark.save_pos = FALSE;
2359 
2360       r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2361       if (r != 0) return r;
2362 
2363       r = add_op(reg, OP_POP_TO_MARK);
2364       if (r != 0) return r;
2365       COP(reg)->pop_to_mark.id = mid;
2366 
2367       r = add_op(reg, OP_POP);
2368       if (r != 0) return r;
2369       r = add_op(reg, OP_FAIL);
2370     }
2371     break;
2372 
2373   case ANCR_LOOK_BEHIND:
2374     r = compile_anchor_look_behind_node(node, reg, env);
2375     break;
2376 
2377   case ANCR_LOOK_BEHIND_NOT:
2378     r = compile_anchor_look_behind_not_node(node, reg, env);
2379     break;
2380 
2381   default:
2382     return ONIGERR_TYPE_BUG;
2383     break;
2384   }
2385 
2386   return r;
2387 }
2388 
2389 static int
compile_gimmick_node(GimmickNode * node,regex_t * reg)2390 compile_gimmick_node(GimmickNode* node, regex_t* reg)
2391 {
2392   int r = 0;
2393 
2394   switch (node->type) {
2395   case GIMMICK_FAIL:
2396     r = add_op(reg, OP_FAIL);
2397     break;
2398 
2399   case GIMMICK_SAVE:
2400     r = add_op(reg, OP_SAVE_VAL);
2401     if (r != 0) return r;
2402     COP(reg)->save_val.type = node->detail_type;
2403     COP(reg)->save_val.id   = node->id;
2404     break;
2405 
2406   case GIMMICK_UPDATE_VAR:
2407     r = add_op(reg, OP_UPDATE_VAR);
2408     if (r != 0) return r;
2409     COP(reg)->update_var.type = node->detail_type;
2410     COP(reg)->update_var.id   = node->id;
2411     COP(reg)->update_var.clear = FALSE;
2412     break;
2413 
2414 #ifdef USE_CALLOUT
2415   case GIMMICK_CALLOUT:
2416     switch (node->detail_type) {
2417     case ONIG_CALLOUT_OF_CONTENTS:
2418     case ONIG_CALLOUT_OF_NAME:
2419       {
2420         if (node->detail_type == ONIG_CALLOUT_OF_NAME) {
2421           r = add_op(reg, OP_CALLOUT_NAME);
2422           if (r != 0) return r;
2423           COP(reg)->callout_name.id  = node->id;
2424           COP(reg)->callout_name.num = node->num;
2425         }
2426         else {
2427           r = add_op(reg, OP_CALLOUT_CONTENTS);
2428           if (r != 0) return r;
2429           COP(reg)->callout_contents.num = node->num;
2430         }
2431       }
2432       break;
2433 
2434     default:
2435       r = ONIGERR_TYPE_BUG;
2436       break;
2437     }
2438 #endif
2439   }
2440 
2441   return r;
2442 }
2443 
2444 static int
compile_length_gimmick_node(GimmickNode * node,regex_t * reg)2445 compile_length_gimmick_node(GimmickNode* node, regex_t* reg)
2446 {
2447   int len;
2448 
2449   switch (node->type) {
2450   case GIMMICK_FAIL:
2451     len = OPSIZE_FAIL;
2452     break;
2453 
2454   case GIMMICK_SAVE:
2455     len = OPSIZE_SAVE_VAL;
2456     break;
2457 
2458   case GIMMICK_UPDATE_VAR:
2459     len = OPSIZE_UPDATE_VAR;
2460     break;
2461 
2462 #ifdef USE_CALLOUT
2463   case GIMMICK_CALLOUT:
2464     switch (node->detail_type) {
2465     case ONIG_CALLOUT_OF_CONTENTS:
2466       len = OPSIZE_CALLOUT_CONTENTS;
2467       break;
2468     case ONIG_CALLOUT_OF_NAME:
2469       len = OPSIZE_CALLOUT_NAME;
2470       break;
2471 
2472     default:
2473       len = ONIGERR_TYPE_BUG;
2474       break;
2475     }
2476     break;
2477 #endif
2478   }
2479 
2480   return len;
2481 }
2482 
2483 static int
compile_length_tree(Node * node,regex_t * reg)2484 compile_length_tree(Node* node, regex_t* reg)
2485 {
2486   int len, r;
2487 
2488   switch (NODE_TYPE(node)) {
2489   case NODE_LIST:
2490     len = 0;
2491     do {
2492       r = compile_length_tree(NODE_CAR(node), reg);
2493       if (r < 0) return r;
2494       len += r;
2495     } while (IS_NOT_NULL(node = NODE_CDR(node)));
2496     r = len;
2497     break;
2498 
2499   case NODE_ALT:
2500     {
2501       int n;
2502 
2503       n = r = 0;
2504       do {
2505         r += compile_length_tree(NODE_CAR(node), reg);
2506         n++;
2507       } while (IS_NOT_NULL(node = NODE_CDR(node)));
2508       r += (OPSIZE_PUSH + OPSIZE_JUMP) * (n - 1);
2509     }
2510     break;
2511 
2512   case NODE_STRING:
2513     if (NODE_STRING_IS_CRUDE(node))
2514       r = compile_length_string_crude_node(STR_(node), reg);
2515     else
2516       r = compile_length_string_node(node, reg);
2517     break;
2518 
2519   case NODE_CCLASS:
2520     r = compile_length_cclass_node(CCLASS_(node), reg);
2521     break;
2522 
2523   case NODE_CTYPE:
2524     r = SIZE_OPCODE;
2525     break;
2526 
2527   case NODE_BACKREF:
2528     r = OPSIZE_BACKREF;
2529     break;
2530 
2531 #ifdef USE_CALL
2532   case NODE_CALL:
2533     r = OPSIZE_CALL;
2534     break;
2535 #endif
2536 
2537   case NODE_QUANT:
2538     r = compile_length_quantifier_node(QUANT_(node), reg);
2539     break;
2540 
2541   case NODE_BAG:
2542     r = compile_length_bag_node(BAG_(node), reg);
2543     break;
2544 
2545   case NODE_ANCHOR:
2546     r = compile_length_anchor_node(ANCHOR_(node), reg);
2547     break;
2548 
2549   case NODE_GIMMICK:
2550     r = compile_length_gimmick_node(GIMMICK_(node), reg);
2551     break;
2552 
2553   default:
2554     return ONIGERR_TYPE_BUG;
2555     break;
2556   }
2557 
2558   return r;
2559 }
2560 
2561 static int
compile_tree(Node * node,regex_t * reg,ScanEnv * env)2562 compile_tree(Node* node, regex_t* reg, ScanEnv* env)
2563 {
2564   int n, len, pos, r = 0;
2565 
2566   switch (NODE_TYPE(node)) {
2567   case NODE_LIST:
2568     do {
2569       r = compile_tree(NODE_CAR(node), reg, env);
2570     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2571     break;
2572 
2573   case NODE_ALT:
2574     {
2575       Node* x = node;
2576       len = 0;
2577       do {
2578         len += compile_length_tree(NODE_CAR(x), reg);
2579         if (IS_NOT_NULL(NODE_CDR(x))) {
2580           len += OPSIZE_PUSH + OPSIZE_JUMP;
2581         }
2582       } while (IS_NOT_NULL(x = NODE_CDR(x)));
2583       pos = COP_CURR_OFFSET(reg) + 1 + len;  /* goal position */
2584 
2585       do {
2586         len = compile_length_tree(NODE_CAR(node), reg);
2587         if (IS_NOT_NULL(NODE_CDR(node))) {
2588           enum OpCode push = NODE_IS_SUPER(node) ? OP_PUSH_SUPER : OP_PUSH;
2589           r = add_op(reg, push);
2590           if (r != 0) break;
2591           COP(reg)->push.addr = SIZE_INC + len + OPSIZE_JUMP;
2592         }
2593         r = compile_tree(NODE_CAR(node), reg, env);
2594         if (r != 0) break;
2595         if (IS_NOT_NULL(NODE_CDR(node))) {
2596           len = pos - (COP_CURR_OFFSET(reg) + 1);
2597           r = add_op(reg, OP_JUMP);
2598           if (r != 0) break;
2599           COP(reg)->jump.addr = len;
2600         }
2601       } while (IS_NOT_NULL(node = NODE_CDR(node)));
2602     }
2603     break;
2604 
2605   case NODE_STRING:
2606     if (NODE_STRING_IS_CRUDE(node))
2607       r = compile_string_crude_node(STR_(node), reg);
2608     else
2609       r = compile_string_node(node, reg);
2610     break;
2611 
2612   case NODE_CCLASS:
2613     r = compile_cclass_node(CCLASS_(node), reg);
2614     break;
2615 
2616   case NODE_CTYPE:
2617     {
2618       int op;
2619 
2620       switch (CTYPE_(node)->ctype) {
2621       case CTYPE_ANYCHAR:
2622         r = add_op(reg, NODE_IS_MULTILINE(node) ? OP_ANYCHAR_ML : OP_ANYCHAR);
2623         break;
2624 
2625       case ONIGENC_CTYPE_WORD:
2626         if (CTYPE_(node)->ascii_mode == 0) {
2627           op = CTYPE_(node)->not != 0 ? OP_NO_WORD : OP_WORD;
2628         }
2629         else {
2630           op = CTYPE_(node)->not != 0 ? OP_NO_WORD_ASCII : OP_WORD_ASCII;
2631         }
2632         r = add_op(reg, op);
2633         break;
2634 
2635       default:
2636         return ONIGERR_TYPE_BUG;
2637         break;
2638       }
2639     }
2640     break;
2641 
2642   case NODE_BACKREF:
2643     {
2644       BackRefNode* br = BACKREF_(node);
2645 
2646       if (NODE_IS_CHECKER(node)) {
2647 #ifdef USE_BACKREF_WITH_LEVEL
2648         if (NODE_IS_NEST_LEVEL(node)) {
2649           r = add_op(reg, OP_BACKREF_CHECK_WITH_LEVEL);
2650           if (r != 0) return r;
2651           COP(reg)->backref_general.nest_level = br->nest_level;
2652         }
2653         else
2654 #endif
2655           {
2656             r = add_op(reg, OP_BACKREF_CHECK);
2657             if (r != 0) return r;
2658           }
2659         goto add_bacref_mems;
2660       }
2661       else {
2662 #ifdef USE_BACKREF_WITH_LEVEL
2663         if (NODE_IS_NEST_LEVEL(node)) {
2664           if (NODE_IS_IGNORECASE(node))
2665             r = add_op(reg, OP_BACKREF_WITH_LEVEL_IC);
2666           else
2667             r = add_op(reg, OP_BACKREF_WITH_LEVEL);
2668 
2669           if (r != 0) return r;
2670           COP(reg)->backref_general.nest_level = br->nest_level;
2671           goto add_bacref_mems;
2672         }
2673         else
2674 #endif
2675         if (br->back_num == 1) {
2676           n = br->back_static[0];
2677           if (NODE_IS_IGNORECASE(node)) {
2678             r = add_op(reg, OP_BACKREF_N_IC);
2679             if (r != 0) return r;
2680             COP(reg)->backref_n.n1 = n;
2681           }
2682           else {
2683             switch (n) {
2684             case 1:  r = add_op(reg, OP_BACKREF1); break;
2685             case 2:  r = add_op(reg, OP_BACKREF2); break;
2686             default:
2687               r = add_op(reg, OP_BACKREF_N);
2688               if (r != 0) return r;
2689               COP(reg)->backref_n.n1 = n;
2690               break;
2691             }
2692           }
2693         }
2694         else {
2695           int num;
2696           int* p;
2697 
2698           r = add_op(reg, NODE_IS_IGNORECASE(node) ?
2699                      OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI);
2700           if (r != 0) return r;
2701 
2702         add_bacref_mems:
2703           num = br->back_num;
2704           COP(reg)->backref_general.num = num;
2705           if (num == 1) {
2706             COP(reg)->backref_general.n1 = br->back_static[0];
2707           }
2708           else {
2709             int i, j;
2710             MemNumType* ns;
2711 
2712             ns = xmalloc(sizeof(MemNumType) * num);
2713             CHECK_NULL_RETURN_MEMERR(ns);
2714             COP(reg)->backref_general.ns = ns;
2715             p = BACKREFS_P(br);
2716             for (i = num - 1, j = 0; i >= 0; i--, j++) {
2717               ns[j] = p[i];
2718             }
2719           }
2720         }
2721       }
2722     }
2723     break;
2724 
2725 #ifdef USE_CALL
2726   case NODE_CALL:
2727     r = compile_call(CALL_(node), reg, env);
2728     break;
2729 #endif
2730 
2731   case NODE_QUANT:
2732     r = compile_quantifier_node(QUANT_(node), reg, env);
2733     break;
2734 
2735   case NODE_BAG:
2736     r = compile_bag_node(BAG_(node), reg, env);
2737     break;
2738 
2739   case NODE_ANCHOR:
2740     r = compile_anchor_node(ANCHOR_(node), reg, env);
2741     break;
2742 
2743   case NODE_GIMMICK:
2744     r = compile_gimmick_node(GIMMICK_(node), reg);
2745     break;
2746 
2747   default:
2748 #ifdef ONIG_DEBUG
2749     fprintf(DBGFP, "compile_tree: undefined node type %d\n", NODE_TYPE(node));
2750 #endif
2751     break;
2752   }
2753 
2754   return r;
2755 }
2756 
2757 static int
make_named_capture_number_map(Node ** plink,GroupNumMap * map,int * counter)2758 make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter)
2759 {
2760   int r = 0;
2761   Node* node = *plink;
2762 
2763   switch (NODE_TYPE(node)) {
2764   case NODE_LIST:
2765   case NODE_ALT:
2766     do {
2767       r = make_named_capture_number_map(&(NODE_CAR(node)), map, counter);
2768     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2769     break;
2770 
2771   case NODE_QUANT:
2772     {
2773       Node** ptarget = &(NODE_BODY(node));
2774       Node*  old = *ptarget;
2775       r = make_named_capture_number_map(ptarget, map, counter);
2776       if (r != 0) return r;
2777       if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) {
2778         r = onig_reduce_nested_quantifier(node);
2779       }
2780     }
2781     break;
2782 
2783   case NODE_BAG:
2784     {
2785       BagNode* en = BAG_(node);
2786       if (en->type == BAG_MEMORY) {
2787         if (NODE_IS_NAMED_GROUP(node)) {
2788           (*counter)++;
2789           map[en->m.regnum].new_val = *counter;
2790           en->m.regnum = *counter;
2791           r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter);
2792         }
2793         else {
2794           *plink = NODE_BODY(node);
2795           NODE_BODY(node) = NULL_NODE;
2796           onig_node_free(node);
2797           r = make_named_capture_number_map(plink, map, counter);
2798         }
2799       }
2800       else if (en->type == BAG_IF_ELSE) {
2801         r = make_named_capture_number_map(&(NODE_BAG_BODY(en)), map, counter);
2802         if (r != 0) return r;
2803         if (IS_NOT_NULL(en->te.Then)) {
2804           r = make_named_capture_number_map(&(en->te.Then), map, counter);
2805           if (r != 0) return r;
2806         }
2807         if (IS_NOT_NULL(en->te.Else)) {
2808           r = make_named_capture_number_map(&(en->te.Else), map, counter);
2809           if (r != 0) return r;
2810         }
2811       }
2812       else
2813         r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter);
2814     }
2815     break;
2816 
2817   case NODE_ANCHOR:
2818     if (IS_NOT_NULL(NODE_BODY(node)))
2819       r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter);
2820     break;
2821 
2822   default:
2823     break;
2824   }
2825 
2826   return r;
2827 }
2828 
2829 static int
renumber_backref_node(Node * node,GroupNumMap * map)2830 renumber_backref_node(Node* node, GroupNumMap* map)
2831 {
2832   int i, pos, n, old_num;
2833   int *backs;
2834   BackRefNode* bn = BACKREF_(node);
2835 
2836   if (! NODE_IS_BY_NAME(node))
2837     return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2838 
2839   old_num = bn->back_num;
2840   if (IS_NULL(bn->back_dynamic))
2841     backs = bn->back_static;
2842   else
2843     backs = bn->back_dynamic;
2844 
2845   for (i = 0, pos = 0; i < old_num; i++) {
2846     n = map[backs[i]].new_val;
2847     if (n > 0) {
2848       backs[pos] = n;
2849       pos++;
2850     }
2851   }
2852 
2853   bn->back_num = pos;
2854   return 0;
2855 }
2856 
2857 static int
renumber_backref_traverse(Node * node,GroupNumMap * map)2858 renumber_backref_traverse(Node* node, GroupNumMap* map)
2859 {
2860   int r = 0;
2861 
2862   switch (NODE_TYPE(node)) {
2863   case NODE_LIST:
2864   case NODE_ALT:
2865     do {
2866       r = renumber_backref_traverse(NODE_CAR(node), map);
2867     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2868     break;
2869 
2870   case NODE_QUANT:
2871     r = renumber_backref_traverse(NODE_BODY(node), map);
2872     break;
2873 
2874   case NODE_BAG:
2875     {
2876       BagNode* en = BAG_(node);
2877 
2878       r = renumber_backref_traverse(NODE_BODY(node), map);
2879       if (r != 0) return r;
2880 
2881       if (en->type == BAG_IF_ELSE) {
2882         if (IS_NOT_NULL(en->te.Then)) {
2883           r = renumber_backref_traverse(en->te.Then, map);
2884           if (r != 0) return r;
2885         }
2886         if (IS_NOT_NULL(en->te.Else)) {
2887           r = renumber_backref_traverse(en->te.Else, map);
2888           if (r != 0) return r;
2889         }
2890       }
2891     }
2892     break;
2893 
2894   case NODE_BACKREF:
2895     r = renumber_backref_node(node, map);
2896     break;
2897 
2898   case NODE_ANCHOR:
2899     if (IS_NOT_NULL(NODE_BODY(node)))
2900       r = renumber_backref_traverse(NODE_BODY(node), map);
2901     break;
2902 
2903   default:
2904     break;
2905   }
2906 
2907   return r;
2908 }
2909 
2910 static int
numbered_ref_check(Node * node)2911 numbered_ref_check(Node* node)
2912 {
2913   int r = 0;
2914 
2915   switch (NODE_TYPE(node)) {
2916   case NODE_LIST:
2917   case NODE_ALT:
2918     do {
2919       r = numbered_ref_check(NODE_CAR(node));
2920     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2921     break;
2922 
2923   case NODE_ANCHOR:
2924     if (IS_NULL(NODE_BODY(node)))
2925       break;
2926     /* fall */
2927   case NODE_QUANT:
2928     r = numbered_ref_check(NODE_BODY(node));
2929     break;
2930 
2931   case NODE_BAG:
2932     {
2933       BagNode* en = BAG_(node);
2934 
2935       r = numbered_ref_check(NODE_BODY(node));
2936       if (r != 0) return r;
2937 
2938       if (en->type == BAG_IF_ELSE) {
2939         if (IS_NOT_NULL(en->te.Then)) {
2940           r = numbered_ref_check(en->te.Then);
2941           if (r != 0) return r;
2942         }
2943         if (IS_NOT_NULL(en->te.Else)) {
2944           r = numbered_ref_check(en->te.Else);
2945           if (r != 0) return r;
2946         }
2947       }
2948     }
2949 
2950     break;
2951 
2952   case NODE_BACKREF:
2953     if (! NODE_IS_BY_NAME(node))
2954       return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2955     break;
2956 
2957   default:
2958     break;
2959   }
2960 
2961   return r;
2962 }
2963 
2964 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)2965 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
2966 {
2967   int r, i, pos, counter;
2968   MemStatusType loc;
2969   GroupNumMap* map;
2970 
2971   map = (GroupNumMap* )xalloca(sizeof(GroupNumMap) * (env->num_mem + 1));
2972   CHECK_NULL_RETURN_MEMERR(map);
2973   for (i = 1; i <= env->num_mem; i++) {
2974     map[i].new_val = 0;
2975   }
2976   counter = 0;
2977   r = make_named_capture_number_map(root, map, &counter);
2978   if (r != 0) return r;
2979 
2980   r = renumber_backref_traverse(*root, map);
2981   if (r != 0) return r;
2982 
2983   for (i = 1, pos = 1; i <= env->num_mem; i++) {
2984     if (map[i].new_val > 0) {
2985       SCANENV_MEMENV(env)[pos] = SCANENV_MEMENV(env)[i];
2986       pos++;
2987     }
2988   }
2989 
2990   loc = env->cap_history;
2991   MEM_STATUS_CLEAR(env->cap_history);
2992   for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
2993     if (MEM_STATUS_AT(loc, i)) {
2994       MEM_STATUS_ON_SIMPLE(env->cap_history, map[i].new_val);
2995     }
2996   }
2997 
2998   env->num_mem = env->num_named;
2999   reg->num_mem = env->num_named;
3000 
3001   return onig_renumber_name_table(reg, map);
3002 }
3003 
3004 #ifdef USE_CALL
3005 static int
fix_unset_addr_list(UnsetAddrList * uslist,regex_t * reg)3006 fix_unset_addr_list(UnsetAddrList* uslist, regex_t* reg)
3007 {
3008   int i, offset;
3009   BagNode* en;
3010   AbsAddrType addr;
3011   AbsAddrType* paddr;
3012 
3013   for (i = 0; i < uslist->num; i++) {
3014     if (! NODE_IS_FIXED_ADDR(uslist->us[i].target)) {
3015       if (NODE_IS_CALLED(uslist->us[i].target))
3016         return ONIGERR_PARSER_BUG;
3017       else {
3018         /* CASE: called node doesn't have called address.
3019            ex. /((|a\g<1>)(.){0}){0}\g<3>/
3020            group-1 doesn't called, but compiled into bytecodes,
3021            because group-3 is referred from outside.
3022         */
3023         continue;
3024       }
3025     }
3026 
3027     en = BAG_(uslist->us[i].target);
3028     addr   = en->m.called_addr;
3029     offset = uslist->us[i].offset;
3030 
3031     paddr = (AbsAddrType* )((char* )reg->ops + offset);
3032     *paddr = addr;
3033   }
3034   return 0;
3035 }
3036 #endif
3037 
3038 /* x is not included y ==>  1 : 0 */
3039 static int
is_exclusive(Node * x,Node * y,regex_t * reg)3040 is_exclusive(Node* x, Node* y, regex_t* reg)
3041 {
3042   int i, len;
3043   OnigCodePoint code;
3044   UChar *p;
3045   NodeType ytype;
3046 
3047  retry:
3048   ytype = NODE_TYPE(y);
3049   switch (NODE_TYPE(x)) {
3050   case NODE_CTYPE:
3051     {
3052       if (CTYPE_(x)->ctype == CTYPE_ANYCHAR ||
3053           CTYPE_(y)->ctype == CTYPE_ANYCHAR)
3054         break;
3055 
3056       switch (ytype) {
3057       case NODE_CTYPE:
3058         if (CTYPE_(y)->ctype == CTYPE_(x)->ctype &&
3059             CTYPE_(y)->not   != CTYPE_(x)->not &&
3060             CTYPE_(y)->ascii_mode == CTYPE_(x)->ascii_mode)
3061           return 1;
3062         else
3063           return 0;
3064         break;
3065 
3066       case NODE_CCLASS:
3067       swap:
3068         {
3069           Node* tmp;
3070           tmp = x; x = y; y = tmp;
3071           goto retry;
3072         }
3073         break;
3074 
3075       case NODE_STRING:
3076         goto swap;
3077         break;
3078 
3079       default:
3080         break;
3081       }
3082     }
3083     break;
3084 
3085   case NODE_CCLASS:
3086     {
3087       int range;
3088       CClassNode* xc = CCLASS_(x);
3089 
3090       switch (ytype) {
3091       case NODE_CTYPE:
3092         switch (CTYPE_(y)->ctype) {
3093         case CTYPE_ANYCHAR:
3094           return 0;
3095           break;
3096 
3097         case ONIGENC_CTYPE_WORD:
3098           if (CTYPE_(y)->not == 0) {
3099             if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
3100               range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
3101               for (i = 0; i < range; i++) {
3102                 if (BITSET_AT(xc->bs, i)) {
3103                   if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
3104                 }
3105               }
3106               return 1;
3107             }
3108             return 0;
3109           }
3110           else {
3111             if (IS_NOT_NULL(xc->mbuf)) return 0;
3112             if (IS_NCCLASS_NOT(xc)) return 0;
3113 
3114             range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
3115             for (i = 0; i < range; i++) {
3116               if (! ONIGENC_IS_CODE_WORD(reg->enc, i)) {
3117                 if (BITSET_AT(xc->bs, i))
3118                   return 0;
3119               }
3120             }
3121             for (i = range; i < SINGLE_BYTE_SIZE; i++) {
3122               if (BITSET_AT(xc->bs, i)) return 0;
3123             }
3124             return 1;
3125           }
3126           break;
3127 
3128         default:
3129           break;
3130         }
3131         break;
3132 
3133       case NODE_CCLASS:
3134         {
3135           int v;
3136           CClassNode* yc = CCLASS_(y);
3137 
3138           for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
3139             v = BITSET_AT(xc->bs, i);
3140             if ((v != 0 && !IS_NCCLASS_NOT(xc)) || (v == 0 && IS_NCCLASS_NOT(xc))) {
3141               v = BITSET_AT(yc->bs, i);
3142               if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
3143                   (v == 0 && IS_NCCLASS_NOT(yc)))
3144                 return 0;
3145             }
3146           }
3147           if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
3148               (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
3149             return 1;
3150           return 0;
3151         }
3152         break;
3153 
3154       case NODE_STRING:
3155         goto swap;
3156         break;
3157 
3158       default:
3159         break;
3160       }
3161     }
3162     break;
3163 
3164   case NODE_STRING:
3165     {
3166       StrNode* xs = STR_(x);
3167 
3168       if (NODE_STRING_LEN(x) == 0)
3169         break;
3170 
3171       switch (ytype) {
3172       case NODE_CTYPE:
3173         switch (CTYPE_(y)->ctype) {
3174         case CTYPE_ANYCHAR:
3175           break;
3176 
3177         case ONIGENC_CTYPE_WORD:
3178           if (CTYPE_(y)->ascii_mode == 0) {
3179             if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
3180               return CTYPE_(y)->not;
3181             else
3182               return !(CTYPE_(y)->not);
3183           }
3184           else {
3185             if (ONIGENC_IS_MBC_WORD_ASCII(reg->enc, xs->s, xs->end))
3186               return CTYPE_(y)->not;
3187             else
3188               return !(CTYPE_(y)->not);
3189           }
3190           break;
3191         default:
3192           break;
3193         }
3194         break;
3195 
3196       case NODE_CCLASS:
3197         {
3198           CClassNode* cc = CCLASS_(y);
3199 
3200           code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
3201                                      xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
3202           return onig_is_code_in_cc(reg->enc, code, cc) == 0;
3203         }
3204         break;
3205 
3206       case NODE_STRING:
3207         {
3208           UChar *q;
3209           StrNode* ys = STR_(y);
3210 
3211           len = NODE_STRING_LEN(x);
3212           if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y);
3213 
3214           for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
3215             if (*p != *q) return 1;
3216           }
3217         }
3218         break;
3219 
3220       default:
3221         break;
3222       }
3223     }
3224     break;
3225 
3226   default:
3227     break;
3228   }
3229 
3230   return 0;
3231 }
3232 
3233 static Node*
get_tree_head_literal(Node * node,int exact,regex_t * reg)3234 get_tree_head_literal(Node* node, int exact, regex_t* reg)
3235 {
3236   Node* n = NULL_NODE;
3237 
3238   switch (NODE_TYPE(node)) {
3239   case NODE_BACKREF:
3240   case NODE_ALT:
3241 #ifdef USE_CALL
3242   case NODE_CALL:
3243 #endif
3244     break;
3245 
3246   case NODE_CTYPE:
3247     if (CTYPE_(node)->ctype == CTYPE_ANYCHAR)
3248       break;
3249     /* fall */
3250   case NODE_CCLASS:
3251     if (exact == 0) {
3252       n = node;
3253     }
3254     break;
3255 
3256   case NODE_LIST:
3257     n = get_tree_head_literal(NODE_CAR(node), exact, reg);
3258     break;
3259 
3260   case NODE_STRING:
3261     {
3262       StrNode* sn = STR_(node);
3263 
3264       if (sn->end <= sn->s)
3265         break;
3266 
3267       if (exact == 0 ||
3268           ! NODE_IS_IGNORECASE(node) || NODE_STRING_IS_CRUDE(node)) {
3269         n = node;
3270       }
3271     }
3272     break;
3273 
3274   case NODE_QUANT:
3275     {
3276       QuantNode* qn = QUANT_(node);
3277       if (qn->lower > 0) {
3278         if (IS_NOT_NULL(qn->head_exact))
3279           n = qn->head_exact;
3280         else
3281           n = get_tree_head_literal(NODE_BODY(node), exact, reg);
3282       }
3283     }
3284     break;
3285 
3286   case NODE_BAG:
3287     {
3288       BagNode* en = BAG_(node);
3289       switch (en->type) {
3290       case BAG_OPTION:
3291       case BAG_MEMORY:
3292       case BAG_STOP_BACKTRACK:
3293       case BAG_IF_ELSE:
3294         n = get_tree_head_literal(NODE_BODY(node), exact, reg);
3295         break;
3296       }
3297     }
3298     break;
3299 
3300   case NODE_ANCHOR:
3301     if (ANCHOR_(node)->type == ANCR_PREC_READ)
3302       n = get_tree_head_literal(NODE_BODY(node), exact, reg);
3303     break;
3304 
3305   case NODE_GIMMICK:
3306   default:
3307     break;
3308   }
3309 
3310   return n;
3311 }
3312 
3313 enum GetValue {
3314   GET_VALUE_NONE   = -1,
3315   GET_VALUE_IGNORE =  0,
3316   GET_VALUE_FOUND  =  1
3317 };
3318 
3319 static int
get_tree_tail_literal(Node * node,Node ** rnode,regex_t * reg)3320 get_tree_tail_literal(Node* node, Node** rnode, regex_t* reg)
3321 {
3322   int r;
3323 
3324   switch (NODE_TYPE(node)) {
3325   case NODE_LIST:
3326     if (IS_NULL(NODE_CDR(node))) {
3327       r = get_tree_tail_literal(NODE_CAR(node), rnode, reg);
3328     }
3329     else {
3330       r = get_tree_tail_literal(NODE_CDR(node), rnode, reg);
3331       if (r == GET_VALUE_IGNORE) {
3332         r = get_tree_tail_literal(NODE_CAR(node), rnode, reg);
3333       }
3334     }
3335     break;
3336 
3337 #ifdef USE_CALL
3338   case NODE_CALL:
3339     r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3340     break;
3341 #endif
3342 
3343   case NODE_CTYPE:
3344     if (CTYPE_(node)->ctype == CTYPE_ANYCHAR) {
3345       r = GET_VALUE_NONE;
3346       break;
3347     }
3348     /* fall */
3349   case NODE_CCLASS:
3350     *rnode = node;
3351     r = GET_VALUE_FOUND;
3352     break;
3353 
3354   case NODE_STRING:
3355     {
3356       StrNode* sn = STR_(node);
3357 
3358       if (sn->end <= sn->s) {
3359         r = GET_VALUE_IGNORE;
3360         break;
3361       }
3362 
3363       if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) {
3364         r = GET_VALUE_NONE;
3365         break;
3366       }
3367 
3368       *rnode = node;
3369       r = GET_VALUE_FOUND;
3370     }
3371     break;
3372 
3373   case NODE_QUANT:
3374     {
3375       QuantNode* qn = QUANT_(node);
3376       if (qn->lower != 0) {
3377         r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3378       }
3379       else
3380         r = GET_VALUE_NONE;
3381     }
3382     break;
3383 
3384   case NODE_BAG:
3385     {
3386       BagNode* en = BAG_(node);
3387 
3388       if (en->type == BAG_MEMORY) {
3389         if (NODE_IS_MARK1(node))
3390           r = GET_VALUE_NONE;
3391         else {
3392           NODE_STATUS_ADD(node, MARK1);
3393           r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3394           NODE_STATUS_REMOVE(node, MARK1);
3395         }
3396       }
3397       else {
3398         r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3399       }
3400     }
3401     break;
3402 
3403   case NODE_ANCHOR:
3404   case NODE_GIMMICK:
3405     r = GET_VALUE_IGNORE;
3406     break;
3407 
3408   case NODE_ALT:
3409   case NODE_BACKREF:
3410   default:
3411     r = GET_VALUE_NONE;
3412     break;
3413   }
3414 
3415   return r;
3416 }
3417 
3418 static int
check_called_node_in_look_behind(Node * node,int not)3419 check_called_node_in_look_behind(Node* node, int not)
3420 {
3421   int r;
3422 
3423   r = 0;
3424 
3425   switch (NODE_TYPE(node)) {
3426   case NODE_LIST:
3427   case NODE_ALT:
3428     do {
3429       r = check_called_node_in_look_behind(NODE_CAR(node), not);
3430     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3431     break;
3432 
3433   case NODE_QUANT:
3434     r = check_called_node_in_look_behind(NODE_BODY(node), not);
3435     break;
3436 
3437   case NODE_BAG:
3438     {
3439       BagNode* en = BAG_(node);
3440 
3441       if (en->type == BAG_MEMORY) {
3442         if (NODE_IS_MARK1(node))
3443           return 0;
3444         else {
3445           NODE_STATUS_ADD(node, MARK1);
3446           r = check_called_node_in_look_behind(NODE_BODY(node), not);
3447           NODE_STATUS_REMOVE(node, MARK1);
3448         }
3449       }
3450       else {
3451         r = check_called_node_in_look_behind(NODE_BODY(node), not);
3452         if (r == 0 && en->type == BAG_IF_ELSE) {
3453           if (IS_NOT_NULL(en->te.Then)) {
3454             r = check_called_node_in_look_behind(en->te.Then, not);
3455             if (r != 0) break;
3456           }
3457           if (IS_NOT_NULL(en->te.Else)) {
3458             r = check_called_node_in_look_behind(en->te.Else, not);
3459           }
3460         }
3461       }
3462     }
3463     break;
3464 
3465   case NODE_ANCHOR:
3466     if (IS_NOT_NULL(NODE_BODY(node)))
3467       r = check_called_node_in_look_behind(NODE_BODY(node), not);
3468     break;
3469 
3470   case NODE_GIMMICK:
3471     if (NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) != 0)
3472       return 1;
3473     break;
3474 
3475   default:
3476     break;
3477   }
3478 
3479   return r;
3480 }
3481 
3482 /* allowed node types in look-behind */
3483 #define ALLOWED_TYPE_IN_LB \
3484   ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \
3485   | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \
3486   | NODE_BIT_CALL | NODE_BIT_BACKREF | NODE_BIT_GIMMICK)
3487 
3488 #define ALLOWED_BAG_IN_LB       ( 1<<BAG_MEMORY | 1<<BAG_OPTION | 1<<BAG_STOP_BACKTRACK | 1<<BAG_IF_ELSE )
3489 #define ALLOWED_BAG_IN_LB_NOT   ( 1<<BAG_OPTION | 1<<BAG_STOP_BACKTRACK | 1<<BAG_IF_ELSE )
3490 
3491 #define ALLOWED_ANCHOR_IN_LB \
3492   ( ANCR_LOOK_BEHIND | ANCR_BEGIN_LINE | ANCR_END_LINE | ANCR_BEGIN_BUF \
3493   | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY | ANCR_NO_WORD_BOUNDARY \
3494   | ANCR_WORD_BEGIN | ANCR_WORD_END \
3495   | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
3496 
3497 #define ALLOWED_ANCHOR_IN_LB_NOT \
3498   ( ANCR_LOOK_BEHIND | ANCR_LOOK_BEHIND_NOT | ANCR_BEGIN_LINE \
3499   | ANCR_END_LINE | ANCR_BEGIN_BUF | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY \
3500   | ANCR_NO_WORD_BOUNDARY | ANCR_WORD_BEGIN | ANCR_WORD_END \
3501   | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
3502 
3503 
3504 static int
check_node_in_look_behind(Node * node,int not,int * used)3505 check_node_in_look_behind(Node* node, int not, int* used)
3506 {
3507   static unsigned int
3508     bag_mask[2] = { ALLOWED_BAG_IN_LB, ALLOWED_BAG_IN_LB_NOT };
3509 
3510   static unsigned int
3511     anchor_mask[2] = { ALLOWED_ANCHOR_IN_LB, ALLOWED_ANCHOR_IN_LB_NOT };
3512 
3513   NodeType type;
3514   int r = 0;
3515 
3516   type = NODE_TYPE(node);
3517   if ((NODE_TYPE2BIT(type) & ALLOWED_TYPE_IN_LB) == 0)
3518     return 1;
3519 
3520   switch (type) {
3521   case NODE_LIST:
3522   case NODE_ALT:
3523     do {
3524       r = check_node_in_look_behind(NODE_CAR(node), not, used);
3525     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3526     break;
3527 
3528   case NODE_QUANT:
3529     r = check_node_in_look_behind(NODE_BODY(node), not, used);
3530     break;
3531 
3532   case NODE_BAG:
3533     {
3534       BagNode* en = BAG_(node);
3535       if (((1<<en->type) & bag_mask[not]) == 0)
3536         return 1;
3537 
3538       r = check_node_in_look_behind(NODE_BODY(node), not, used);
3539       if (r != 0) break;
3540 
3541       if (en->type == BAG_MEMORY) {
3542         if (NODE_IS_BACKREF(node) || NODE_IS_CALLED(node)) *used = TRUE;
3543       }
3544       else if (en->type == BAG_IF_ELSE) {
3545         if (IS_NOT_NULL(en->te.Then)) {
3546           r = check_node_in_look_behind(en->te.Then, not, used);
3547           if (r != 0) break;
3548         }
3549         if (IS_NOT_NULL(en->te.Else)) {
3550           r = check_node_in_look_behind(en->te.Else, not, used);
3551         }
3552       }
3553     }
3554     break;
3555 
3556   case NODE_ANCHOR:
3557     type = ANCHOR_(node)->type;
3558     if ((type & anchor_mask[not]) == 0)
3559       return 1;
3560 
3561     if (IS_NOT_NULL(NODE_BODY(node)))
3562       r = check_node_in_look_behind(NODE_BODY(node), not, used);
3563     break;
3564 
3565   case NODE_GIMMICK:
3566     if (NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) != 0)
3567       return 1;
3568     break;
3569 
3570   case NODE_CALL:
3571     r = check_called_node_in_look_behind(NODE_BODY(node), not);
3572     break;
3573 
3574   default:
3575     break;
3576   }
3577   return r;
3578 }
3579 
3580 static OnigLen
node_min_byte_len(Node * node,ScanEnv * env)3581 node_min_byte_len(Node* node, ScanEnv* env)
3582 {
3583   OnigLen len;
3584   OnigLen tmin;
3585 
3586   len = 0;
3587   switch (NODE_TYPE(node)) {
3588   case NODE_BACKREF:
3589     if (! NODE_IS_CHECKER(node)) {
3590       int i;
3591       int* backs;
3592       MemEnv* mem_env = SCANENV_MEMENV(env);
3593       BackRefNode* br = BACKREF_(node);
3594       if (NODE_IS_RECURSION(node)) break;
3595 
3596       backs = BACKREFS_P(br);
3597       len = node_min_byte_len(mem_env[backs[0]].mem_node, env);
3598       for (i = 1; i < br->back_num; i++) {
3599         tmin = node_min_byte_len(mem_env[backs[i]].mem_node, env);
3600         if (len > tmin) len = tmin;
3601       }
3602     }
3603     break;
3604 
3605 #ifdef USE_CALL
3606   case NODE_CALL:
3607     {
3608       Node* t = NODE_BODY(node);
3609       if (NODE_IS_RECURSION(node)) {
3610         if (NODE_IS_FIXED_MIN(t))
3611           len = BAG_(t)->min_len;
3612       }
3613       else
3614         len = node_min_byte_len(t, env);
3615     }
3616     break;
3617 #endif
3618 
3619   case NODE_LIST:
3620     do {
3621       tmin = node_min_byte_len(NODE_CAR(node), env);
3622       len = distance_add(len, tmin);
3623     } while (IS_NOT_NULL(node = NODE_CDR(node)));
3624     break;
3625 
3626   case NODE_ALT:
3627     {
3628       Node *x, *y;
3629       y = node;
3630       do {
3631         x = NODE_CAR(y);
3632         tmin = node_min_byte_len(x, env);
3633         if (y == node) len = tmin;
3634         else if (len > tmin) len = tmin;
3635       } while (IS_NOT_NULL(y = NODE_CDR(y)));
3636     }
3637     break;
3638 
3639   case NODE_STRING:
3640     {
3641       StrNode* sn = STR_(node);
3642       len = (int )(sn->end - sn->s);
3643     }
3644     break;
3645 
3646   case NODE_CTYPE:
3647   case NODE_CCLASS:
3648     len = ONIGENC_MBC_MINLEN(env->enc);
3649     break;
3650 
3651   case NODE_QUANT:
3652     {
3653       QuantNode* qn = QUANT_(node);
3654 
3655       if (qn->lower > 0) {
3656         len = node_min_byte_len(NODE_BODY(node), env);
3657         len = distance_multiply(len, qn->lower);
3658       }
3659     }
3660     break;
3661 
3662   case NODE_BAG:
3663     {
3664       BagNode* en = BAG_(node);
3665       switch (en->type) {
3666       case BAG_MEMORY:
3667         if (NODE_IS_FIXED_MIN(node))
3668           len = en->min_len;
3669         else {
3670           if (NODE_IS_MARK1(node))
3671             len = 0;  /* recursive */
3672           else {
3673             NODE_STATUS_ADD(node, MARK1);
3674             len = node_min_byte_len(NODE_BODY(node), env);
3675             NODE_STATUS_REMOVE(node, MARK1);
3676 
3677             en->min_len = len;
3678             NODE_STATUS_ADD(node, FIXED_MIN);
3679           }
3680         }
3681         break;
3682 
3683       case BAG_OPTION:
3684       case BAG_STOP_BACKTRACK:
3685         len = node_min_byte_len(NODE_BODY(node), env);
3686         break;
3687       case BAG_IF_ELSE:
3688         {
3689           OnigLen elen;
3690 
3691           len = node_min_byte_len(NODE_BODY(node), env);
3692           if (IS_NOT_NULL(en->te.Then))
3693             len += node_min_byte_len(en->te.Then, env);
3694           if (IS_NOT_NULL(en->te.Else))
3695             elen = node_min_byte_len(en->te.Else, env);
3696           else elen = 0;
3697 
3698           if (elen < len) len = elen;
3699         }
3700         break;
3701       }
3702     }
3703     break;
3704 
3705   case NODE_GIMMICK:
3706     {
3707       GimmickNode* g = GIMMICK_(node);
3708       if (g->type == GIMMICK_FAIL) {
3709         len = INFINITE_LEN;
3710         break;
3711       }
3712     }
3713     /* fall */
3714   case NODE_ANCHOR:
3715   default:
3716     break;
3717   }
3718 
3719   return len;
3720 }
3721 
3722 static OnigLen
node_max_byte_len(Node * node,ScanEnv * env)3723 node_max_byte_len(Node* node, ScanEnv* env)
3724 {
3725   OnigLen len;
3726   OnigLen tmax;
3727 
3728   len = 0;
3729   switch (NODE_TYPE(node)) {
3730   case NODE_LIST:
3731     do {
3732       tmax = node_max_byte_len(NODE_CAR(node), env);
3733       len = distance_add(len, tmax);
3734     } while (IS_NOT_NULL(node = NODE_CDR(node)));
3735     break;
3736 
3737   case NODE_ALT:
3738     do {
3739       tmax = node_max_byte_len(NODE_CAR(node), env);
3740       if (len < tmax) len = tmax;
3741     } while (IS_NOT_NULL(node = NODE_CDR(node)));
3742     break;
3743 
3744   case NODE_STRING:
3745     {
3746       StrNode* sn = STR_(node);
3747       len = (OnigLen )(sn->end - sn->s);
3748     }
3749     break;
3750 
3751   case NODE_CTYPE:
3752   case NODE_CCLASS:
3753     len = ONIGENC_MBC_MAXLEN_DIST(env->enc);
3754     break;
3755 
3756   case NODE_BACKREF:
3757     if (! NODE_IS_CHECKER(node)) {
3758       int i;
3759       int* backs;
3760       MemEnv* mem_env = SCANENV_MEMENV(env);
3761       BackRefNode* br = BACKREF_(node);
3762       if (NODE_IS_RECURSION(node)) {
3763 #ifdef USE_BACKREF_WITH_LEVEL
3764         if (NODE_IS_NEST_LEVEL(node)) {
3765           len = INFINITE_LEN;
3766         }
3767 #endif
3768         break;
3769       }
3770       backs = BACKREFS_P(br);
3771       for (i = 0; i < br->back_num; i++) {
3772         tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env);
3773         if (len < tmax) len = tmax;
3774       }
3775     }
3776     break;
3777 
3778 #ifdef USE_CALL
3779   case NODE_CALL:
3780     if (! NODE_IS_RECURSION(node))
3781       len = node_max_byte_len(NODE_BODY(node), env);
3782     else
3783       len = INFINITE_LEN;
3784     break;
3785 #endif
3786 
3787   case NODE_QUANT:
3788     {
3789       QuantNode* qn = QUANT_(node);
3790 
3791       if (qn->upper != 0) {
3792         len = node_max_byte_len(NODE_BODY(node), env);
3793         if (len != 0) {
3794           if (! IS_INFINITE_REPEAT(qn->upper))
3795             len = distance_multiply(len, qn->upper);
3796           else
3797             len = INFINITE_LEN;
3798         }
3799       }
3800     }
3801     break;
3802 
3803   case NODE_BAG:
3804     {
3805       BagNode* en = BAG_(node);
3806       switch (en->type) {
3807       case BAG_MEMORY:
3808         if (NODE_IS_FIXED_MAX(node))
3809           len = en->max_len;
3810         else {
3811           if (NODE_IS_MARK1(node))
3812             len = INFINITE_LEN;
3813           else {
3814             NODE_STATUS_ADD(node, MARK1);
3815             len = node_max_byte_len(NODE_BODY(node), env);
3816             NODE_STATUS_REMOVE(node, MARK1);
3817 
3818             en->max_len = len;
3819             NODE_STATUS_ADD(node, FIXED_MAX);
3820           }
3821         }
3822         break;
3823 
3824       case BAG_OPTION:
3825       case BAG_STOP_BACKTRACK:
3826         len = node_max_byte_len(NODE_BODY(node), env);
3827         break;
3828       case BAG_IF_ELSE:
3829         {
3830           OnigLen tlen, elen;
3831 
3832           len = node_max_byte_len(NODE_BODY(node), env);
3833           if (IS_NOT_NULL(en->te.Then)) {
3834             tlen = node_max_byte_len(en->te.Then, env);
3835             len = distance_add(len, tlen);
3836           }
3837           if (IS_NOT_NULL(en->te.Else))
3838             elen = node_max_byte_len(en->te.Else, env);
3839           else elen = 0;
3840 
3841           if (elen > len) len = elen;
3842         }
3843         break;
3844       }
3845     }
3846     break;
3847 
3848   case NODE_ANCHOR:
3849   case NODE_GIMMICK:
3850   default:
3851     break;
3852   }
3853 
3854   return len;
3855 }
3856 
3857 static int
check_backrefs(Node * node,ScanEnv * env)3858 check_backrefs(Node* node, ScanEnv* env)
3859 {
3860   int r;
3861 
3862   switch (NODE_TYPE(node)) {
3863   case NODE_LIST:
3864   case NODE_ALT:
3865     do {
3866       r = check_backrefs(NODE_CAR(node), env);
3867     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3868     break;
3869 
3870   case NODE_ANCHOR:
3871     if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3872       r = 0;
3873       break;
3874     }
3875     /* fall */
3876   case NODE_QUANT:
3877     r = check_backrefs(NODE_BODY(node), env);
3878     break;
3879 
3880   case NODE_BAG:
3881     r = check_backrefs(NODE_BODY(node), env);
3882     {
3883       BagNode* en = BAG_(node);
3884 
3885       if (en->type == BAG_IF_ELSE) {
3886         if (r != 0) return r;
3887         if (IS_NOT_NULL(en->te.Then)) {
3888           r = check_backrefs(en->te.Then, env);
3889           if (r != 0) return r;
3890         }
3891         if (IS_NOT_NULL(en->te.Else)) {
3892           r = check_backrefs(en->te.Else, env);
3893         }
3894       }
3895     }
3896     break;
3897 
3898   case NODE_BACKREF:
3899     {
3900       int i;
3901       BackRefNode* br = BACKREF_(node);
3902       int* backs = BACKREFS_P(br);
3903       MemEnv* mem_env = SCANENV_MEMENV(env);
3904 
3905       for (i = 0; i < br->back_num; i++) {
3906         if (backs[i] > env->num_mem)
3907           return ONIGERR_INVALID_BACKREF;
3908 
3909         NODE_STATUS_ADD(mem_env[backs[i]].mem_node, BACKREF);
3910       }
3911       r = 0;
3912     }
3913     break;
3914 
3915   default:
3916     r = 0;
3917     break;
3918   }
3919 
3920   return r;
3921 }
3922 
3923 static int
set_empty_repeat_node_trav(Node * node,Node * empty,ScanEnv * env)3924 set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env)
3925 {
3926   int r;
3927 
3928   switch (NODE_TYPE(node)) {
3929   case NODE_LIST:
3930   case NODE_ALT:
3931     do {
3932       r = set_empty_repeat_node_trav(NODE_CAR(node), empty, env);
3933     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3934     break;
3935 
3936   case NODE_ANCHOR:
3937     {
3938       AnchorNode* an = ANCHOR_(node);
3939 
3940       if (! ANCHOR_HAS_BODY(an)) {
3941         r = 0;
3942         break;
3943       }
3944 
3945       switch (an->type) {
3946       case ANCR_PREC_READ:
3947       case ANCR_LOOK_BEHIND:
3948         empty = NULL_NODE;
3949         break;
3950       default:
3951         break;
3952       }
3953       r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3954     }
3955     break;
3956 
3957   case NODE_QUANT:
3958     {
3959       QuantNode* qn = QUANT_(node);
3960 
3961       if (qn->emptiness != BODY_IS_NOT_EMPTY) empty = node;
3962       r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3963     }
3964     break;
3965 
3966   case NODE_BAG:
3967     if (IS_NOT_NULL(NODE_BODY(node))) {
3968       r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3969       if (r != 0) return r;
3970     }
3971     {
3972       BagNode* en = BAG_(node);
3973 
3974       if (en->type == BAG_MEMORY) {
3975         if (NODE_IS_BACKREF(node)) {
3976           if (IS_NOT_NULL(empty))
3977             SCANENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty;
3978         }
3979       }
3980       else if (en->type == BAG_IF_ELSE) {
3981         if (IS_NOT_NULL(en->te.Then)) {
3982           r = set_empty_repeat_node_trav(en->te.Then, empty, env);
3983           if (r != 0) return r;
3984         }
3985         if (IS_NOT_NULL(en->te.Else)) {
3986           r = set_empty_repeat_node_trav(en->te.Else, empty, env);
3987         }
3988       }
3989     }
3990     break;
3991 
3992   default:
3993     r = 0;
3994     break;
3995   }
3996 
3997   return r;
3998 }
3999 
4000 static int
is_ancestor_node(Node * node,Node * me)4001 is_ancestor_node(Node* node, Node* me)
4002 {
4003   Node* parent;
4004 
4005   while ((parent = NODE_PARENT(me)) != NULL_NODE) {
4006     if (parent == node) return 1;
4007     me = parent;
4008   }
4009   return 0;
4010 }
4011 
4012 static void
set_empty_status_check_trav(Node * node,ScanEnv * env)4013 set_empty_status_check_trav(Node* node, ScanEnv* env)
4014 {
4015   switch (NODE_TYPE(node)) {
4016   case NODE_LIST:
4017   case NODE_ALT:
4018     do {
4019       set_empty_status_check_trav(NODE_CAR(node), env);
4020     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4021     break;
4022 
4023   case NODE_ANCHOR:
4024     {
4025       AnchorNode* an = ANCHOR_(node);
4026 
4027       if (! ANCHOR_HAS_BODY(an)) break;
4028       set_empty_status_check_trav(NODE_BODY(node), env);
4029     }
4030     break;
4031 
4032   case NODE_QUANT:
4033     set_empty_status_check_trav(NODE_BODY(node), env);
4034     break;
4035 
4036   case NODE_BAG:
4037     if (IS_NOT_NULL(NODE_BODY(node)))
4038       set_empty_status_check_trav(NODE_BODY(node), env);
4039     {
4040       BagNode* en = BAG_(node);
4041 
4042       if (en->type == BAG_IF_ELSE) {
4043         if (IS_NOT_NULL(en->te.Then)) {
4044           set_empty_status_check_trav(en->te.Then, env);
4045         }
4046         if (IS_NOT_NULL(en->te.Else)) {
4047           set_empty_status_check_trav(en->te.Else, env);
4048         }
4049       }
4050     }
4051     break;
4052 
4053   case NODE_BACKREF:
4054     {
4055       int i;
4056       int* backs;
4057       MemEnv* mem_env = SCANENV_MEMENV(env);
4058       BackRefNode* br = BACKREF_(node);
4059       backs = BACKREFS_P(br);
4060       for (i = 0; i < br->back_num; i++) {
4061         Node* ernode = mem_env[backs[i]].empty_repeat_node;
4062         if (IS_NOT_NULL(ernode)) {
4063           if (! is_ancestor_node(ernode, node)) {
4064             MEM_STATUS_LIMIT_ON(env->reg->empty_status_mem, backs[i]);
4065             NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK);
4066             NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK);
4067           }
4068         }
4069       }
4070     }
4071     break;
4072 
4073   default:
4074     break;
4075   }
4076 }
4077 
4078 static void
set_parent_node_trav(Node * node,Node * parent)4079 set_parent_node_trav(Node* node, Node* parent)
4080 {
4081   NODE_PARENT(node) = parent;
4082 
4083   switch (NODE_TYPE(node)) {
4084   case NODE_LIST:
4085   case NODE_ALT:
4086     do {
4087       set_parent_node_trav(NODE_CAR(node), node);
4088     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4089     break;
4090 
4091   case NODE_ANCHOR:
4092     if (! ANCHOR_HAS_BODY(ANCHOR_(node))) break;
4093     set_parent_node_trav(NODE_BODY(node), node);
4094     break;
4095 
4096   case NODE_QUANT:
4097     set_parent_node_trav(NODE_BODY(node), node);
4098     break;
4099 
4100   case NODE_BAG:
4101     if (IS_NOT_NULL(NODE_BODY(node)))
4102       set_parent_node_trav(NODE_BODY(node), node);
4103     {
4104       BagNode* en = BAG_(node);
4105 
4106       if (en->type == BAG_IF_ELSE) {
4107         if (IS_NOT_NULL(en->te.Then))
4108           set_parent_node_trav(en->te.Then, node);
4109         if (IS_NOT_NULL(en->te.Else)) {
4110           set_parent_node_trav(en->te.Else, node);
4111         }
4112       }
4113     }
4114     break;
4115 
4116   default:
4117     break;
4118   }
4119 }
4120 
4121 
4122 #ifdef USE_CALL
4123 
4124 #define RECURSION_EXIST        (1<<0)
4125 #define RECURSION_MUST         (1<<1)
4126 #define RECURSION_INFINITE     (1<<2)
4127 
4128 static int
infinite_recursive_call_check(Node * node,ScanEnv * env,int head)4129 infinite_recursive_call_check(Node* node, ScanEnv* env, int head)
4130 {
4131   int ret;
4132   int r = 0;
4133 
4134   switch (NODE_TYPE(node)) {
4135   case NODE_LIST:
4136     {
4137       Node *x;
4138       OnigLen min;
4139 
4140       x = node;
4141       do {
4142         ret = infinite_recursive_call_check(NODE_CAR(x), env, head);
4143         if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
4144         r |= ret;
4145         if (head != 0) {
4146           min = node_min_byte_len(NODE_CAR(x), env);
4147           if (min != 0) head = 0;
4148         }
4149       } while (IS_NOT_NULL(x = NODE_CDR(x)));
4150     }
4151     break;
4152 
4153   case NODE_ALT:
4154     {
4155       int must;
4156 
4157       must = RECURSION_MUST;
4158       do {
4159         ret = infinite_recursive_call_check(NODE_CAR(node), env, head);
4160         if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
4161 
4162         r    |= (ret & RECURSION_EXIST);
4163         must &= ret;
4164       } while (IS_NOT_NULL(node = NODE_CDR(node)));
4165       r |= must;
4166     }
4167     break;
4168 
4169   case NODE_QUANT:
4170     r = infinite_recursive_call_check(NODE_BODY(node), env, head);
4171     if (r < 0) return r;
4172     if ((r & RECURSION_MUST) != 0) {
4173       if (QUANT_(node)->lower == 0)
4174         r &= ~RECURSION_MUST;
4175     }
4176     break;
4177 
4178   case NODE_ANCHOR:
4179     if (! ANCHOR_HAS_BODY(ANCHOR_(node)))
4180       break;
4181     /* fall */
4182   case NODE_CALL:
4183     r = infinite_recursive_call_check(NODE_BODY(node), env, head);
4184     break;
4185 
4186   case NODE_BAG:
4187     {
4188       BagNode* en = BAG_(node);
4189 
4190       if (en->type == BAG_MEMORY) {
4191         if (NODE_IS_MARK2(node))
4192           return 0;
4193         else if (NODE_IS_MARK1(node))
4194           return (head == 0 ? RECURSION_EXIST | RECURSION_MUST
4195                   : RECURSION_EXIST | RECURSION_MUST | RECURSION_INFINITE);
4196         else {
4197           NODE_STATUS_ADD(node, MARK2);
4198           r = infinite_recursive_call_check(NODE_BODY(node), env, head);
4199           NODE_STATUS_REMOVE(node, MARK2);
4200         }
4201       }
4202       else if (en->type == BAG_IF_ELSE) {
4203         int eret;
4204 
4205         ret = infinite_recursive_call_check(NODE_BODY(node), env, head);
4206         if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
4207         r |= ret;
4208         if (IS_NOT_NULL(en->te.Then)) {
4209           OnigLen min;
4210           if (head != 0) {
4211             min = node_min_byte_len(NODE_BODY(node), env);
4212           }
4213           else min = 0;
4214 
4215           ret = infinite_recursive_call_check(en->te.Then, env, min != 0 ? 0:head);
4216           if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
4217           r |= ret;
4218         }
4219         if (IS_NOT_NULL(en->te.Else)) {
4220           eret = infinite_recursive_call_check(en->te.Else, env, head);
4221           if (eret < 0 || (eret & RECURSION_INFINITE) != 0) return eret;
4222           r |= (eret & RECURSION_EXIST);
4223           if ((eret & RECURSION_MUST) == 0)
4224             r &= ~RECURSION_MUST;
4225         }
4226         else {
4227           r &= ~RECURSION_MUST;
4228         }
4229       }
4230       else {
4231         r = infinite_recursive_call_check(NODE_BODY(node), env, head);
4232       }
4233     }
4234     break;
4235 
4236   default:
4237     break;
4238   }
4239 
4240   return r;
4241 }
4242 
4243 static int
infinite_recursive_call_check_trav(Node * node,ScanEnv * env)4244 infinite_recursive_call_check_trav(Node* node, ScanEnv* env)
4245 {
4246   int r;
4247 
4248   switch (NODE_TYPE(node)) {
4249   case NODE_LIST:
4250   case NODE_ALT:
4251     do {
4252       r = infinite_recursive_call_check_trav(NODE_CAR(node), env);
4253     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4254     break;
4255 
4256   case NODE_ANCHOR:
4257     if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
4258       r = 0;
4259       break;
4260     }
4261     /* fall */
4262   case NODE_QUANT:
4263     r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
4264     break;
4265 
4266   case NODE_BAG:
4267     {
4268       BagNode* en = BAG_(node);
4269 
4270       if (en->type == BAG_MEMORY) {
4271         if (NODE_IS_RECURSION(node) && NODE_IS_CALLED(node)) {
4272           int ret;
4273 
4274           NODE_STATUS_ADD(node, MARK1);
4275 
4276           ret = infinite_recursive_call_check(NODE_BODY(node), env, 1);
4277           if (ret < 0) return ret;
4278           else if ((ret & (RECURSION_MUST | RECURSION_INFINITE)) != 0)
4279             return ONIGERR_NEVER_ENDING_RECURSION;
4280 
4281           NODE_STATUS_REMOVE(node, MARK1);
4282         }
4283       }
4284       else if (en->type == BAG_IF_ELSE) {
4285         if (IS_NOT_NULL(en->te.Then)) {
4286           r = infinite_recursive_call_check_trav(en->te.Then, env);
4287           if (r != 0) return r;
4288         }
4289         if (IS_NOT_NULL(en->te.Else)) {
4290           r = infinite_recursive_call_check_trav(en->te.Else, env);
4291           if (r != 0) return r;
4292         }
4293       }
4294     }
4295 
4296     r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
4297     break;
4298 
4299   default:
4300     r = 0;
4301     break;
4302   }
4303 
4304   return r;
4305 }
4306 
4307 static int
recursive_call_check(Node * node)4308 recursive_call_check(Node* node)
4309 {
4310   int r;
4311 
4312   switch (NODE_TYPE(node)) {
4313   case NODE_LIST:
4314   case NODE_ALT:
4315     r = 0;
4316     do {
4317       r |= recursive_call_check(NODE_CAR(node));
4318     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4319     break;
4320 
4321   case NODE_ANCHOR:
4322     if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
4323       r = 0;
4324       break;
4325     }
4326     /* fall */
4327   case NODE_QUANT:
4328     r = recursive_call_check(NODE_BODY(node));
4329     break;
4330 
4331   case NODE_CALL:
4332     r = recursive_call_check(NODE_BODY(node));
4333     if (r != 0) {
4334       if (NODE_IS_MARK1(NODE_BODY(node)))
4335         NODE_STATUS_ADD(node, RECURSION);
4336     }
4337     break;
4338 
4339   case NODE_BAG:
4340     {
4341       BagNode* en = BAG_(node);
4342 
4343       if (en->type == BAG_MEMORY) {
4344         if (NODE_IS_MARK2(node))
4345           return 0;
4346         else if (NODE_IS_MARK1(node))
4347           return 1; /* recursion */
4348         else {
4349           NODE_STATUS_ADD(node, MARK2);
4350           r = recursive_call_check(NODE_BODY(node));
4351           NODE_STATUS_REMOVE(node, MARK2);
4352         }
4353       }
4354       else if (en->type == BAG_IF_ELSE) {
4355         r = 0;
4356         if (IS_NOT_NULL(en->te.Then)) {
4357           r |= recursive_call_check(en->te.Then);
4358         }
4359         if (IS_NOT_NULL(en->te.Else)) {
4360           r |= recursive_call_check(en->te.Else);
4361         }
4362         r |= recursive_call_check(NODE_BODY(node));
4363       }
4364       else {
4365         r = recursive_call_check(NODE_BODY(node));
4366       }
4367     }
4368     break;
4369 
4370   default:
4371     r = 0;
4372     break;
4373   }
4374 
4375   return r;
4376 }
4377 
4378 #define IN_RECURSION         (1<<0)
4379 #define FOUND_CALLED_NODE    1
4380 
4381 static int
recursive_call_check_trav(Node * node,ScanEnv * env,int state)4382 recursive_call_check_trav(Node* node, ScanEnv* env, int state)
4383 {
4384   int r = 0;
4385 
4386   switch (NODE_TYPE(node)) {
4387   case NODE_LIST:
4388   case NODE_ALT:
4389     {
4390       int ret;
4391       do {
4392         ret = recursive_call_check_trav(NODE_CAR(node), env, state);
4393         if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
4394         else if (ret < 0) return ret;
4395       } while (IS_NOT_NULL(node = NODE_CDR(node)));
4396     }
4397     break;
4398 
4399   case NODE_QUANT:
4400     r = recursive_call_check_trav(NODE_BODY(node), env, state);
4401     if (QUANT_(node)->upper == 0) {
4402       if (r == FOUND_CALLED_NODE)
4403         QUANT_(node)->include_referred = 1;
4404     }
4405     break;
4406 
4407   case NODE_ANCHOR:
4408     {
4409       AnchorNode* an = ANCHOR_(node);
4410       if (ANCHOR_HAS_BODY(an))
4411         r = recursive_call_check_trav(NODE_ANCHOR_BODY(an), env, state);
4412     }
4413     break;
4414 
4415   case NODE_BAG:
4416     {
4417       int ret;
4418       int state1;
4419       BagNode* en = BAG_(node);
4420 
4421       if (en->type == BAG_MEMORY) {
4422         if (NODE_IS_CALLED(node) || (state & IN_RECURSION) != 0) {
4423           if (! NODE_IS_RECURSION(node)) {
4424             NODE_STATUS_ADD(node, MARK1);
4425             r = recursive_call_check(NODE_BODY(node));
4426             if (r != 0) {
4427               NODE_STATUS_ADD(node, RECURSION);
4428               MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
4429             }
4430             NODE_STATUS_REMOVE(node, MARK1);
4431           }
4432 
4433           if (NODE_IS_CALLED(node))
4434             r = FOUND_CALLED_NODE;
4435         }
4436       }
4437 
4438       state1 = state;
4439       if (NODE_IS_RECURSION(node))
4440         state1 |= IN_RECURSION;
4441 
4442       ret = recursive_call_check_trav(NODE_BODY(node), env, state1);
4443       if (ret == FOUND_CALLED_NODE)
4444         r = FOUND_CALLED_NODE;
4445 
4446       if (en->type == BAG_IF_ELSE) {
4447         if (IS_NOT_NULL(en->te.Then)) {
4448           ret = recursive_call_check_trav(en->te.Then, env, state1);
4449           if (ret == FOUND_CALLED_NODE)
4450             r = FOUND_CALLED_NODE;
4451         }
4452         if (IS_NOT_NULL(en->te.Else)) {
4453           ret = recursive_call_check_trav(en->te.Else, env, state1);
4454           if (ret == FOUND_CALLED_NODE)
4455             r = FOUND_CALLED_NODE;
4456         }
4457       }
4458     }
4459     break;
4460 
4461   default:
4462     break;
4463   }
4464 
4465   return r;
4466 }
4467 
4468 #endif
4469 
4470 static void
remove_from_list(Node * prev,Node * a)4471 remove_from_list(Node* prev, Node* a)
4472 {
4473   if (NODE_CDR(prev) != a) return ;
4474 
4475   NODE_CDR(prev) = NODE_CDR(a);
4476   NODE_CDR(a) = NULL_NODE;
4477 }
4478 
4479 static int
reduce_string_list(Node * node)4480 reduce_string_list(Node* node)
4481 {
4482   int r = 0;
4483 
4484   switch (NODE_TYPE(node)) {
4485   case NODE_LIST:
4486     {
4487       Node* prev;
4488       Node* curr;
4489       Node* prev_node;
4490       Node* next_node;
4491 
4492       prev = NULL_NODE;
4493       do {
4494         next_node = NODE_CDR(node);
4495         curr = NODE_CAR(node);
4496         if (NODE_TYPE(curr) == NODE_STRING) {
4497           if (IS_NULL(prev)
4498               || STR_(curr)->flag  != STR_(prev)->flag
4499               || NODE_STATUS(curr) != NODE_STATUS(prev)) {
4500             prev = curr;
4501             prev_node = node;
4502           }
4503           else {
4504             r = node_str_node_cat(prev, curr);
4505             if (r != 0) return r;
4506             remove_from_list(prev_node, node);
4507             onig_node_free(node);
4508           }
4509         }
4510         else {
4511           prev = NULL_NODE;
4512           prev_node = node;
4513         }
4514 
4515         node = next_node;
4516       } while (r == 0 && IS_NOT_NULL(node));
4517     }
4518     break;
4519 
4520   case NODE_ALT:
4521     do {
4522       r = reduce_string_list(NODE_CAR(node));
4523     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4524     break;
4525 
4526   case NODE_ANCHOR:
4527     if (IS_NULL(NODE_BODY(node)))
4528       break;
4529     /* fall */
4530   case NODE_QUANT:
4531     r = reduce_string_list(NODE_BODY(node));
4532     break;
4533 
4534   case NODE_BAG:
4535     {
4536       BagNode* en = BAG_(node);
4537 
4538       r = reduce_string_list(NODE_BODY(node));
4539       if (r != 0) return r;
4540 
4541       if (en->type == BAG_IF_ELSE) {
4542         if (IS_NOT_NULL(en->te.Then)) {
4543           r = reduce_string_list(en->te.Then);
4544           if (r != 0) return r;
4545         }
4546         if (IS_NOT_NULL(en->te.Else)) {
4547           r = reduce_string_list(en->te.Else);
4548           if (r != 0) return r;
4549         }
4550       }
4551     }
4552     break;
4553 
4554   default:
4555     break;
4556   }
4557 
4558   return r;
4559 }
4560 
4561 
4562 #define IN_ALT          (1<<0)
4563 #define IN_NOT          (1<<1)
4564 #define IN_REAL_REPEAT  (1<<2)
4565 #define IN_VAR_REPEAT   (1<<3)
4566 #define IN_ZERO_REPEAT  (1<<4)
4567 #define IN_MULTI_ENTRY  (1<<5)
4568 #define IN_LOOK_BEHIND  (1<<6)
4569 
4570 
4571 /* divide different length alternatives in look-behind.
4572   (?<=A|B) ==> (?<=A)|(?<=B)
4573   (?<!A|B) ==> (?<!A)(?<!B)
4574 */
4575 static int
divide_look_behind_alternatives(Node * node)4576 divide_look_behind_alternatives(Node* node)
4577 {
4578   int r;
4579   int anc_type;
4580   Node *head, *np, *insert_node;
4581   AnchorNode* an;
4582 
4583   an = ANCHOR_(node);
4584   anc_type = an->type;
4585 
4586   head = NODE_ANCHOR_BODY(an);
4587   np = NODE_CAR(head);
4588   node_swap(node, head);
4589   NODE_CAR(node) = head;
4590   NODE_BODY(head) = np;
4591 
4592   np = node;
4593   while (IS_NOT_NULL(np = NODE_CDR(np))) {
4594     r = onig_node_copy(&insert_node, head);
4595     if (r != 0) return r;
4596     CHECK_NULL_RETURN_MEMERR(insert_node);
4597     NODE_BODY(insert_node) = NODE_CAR(np);
4598     NODE_CAR(np) = insert_node;
4599   }
4600 
4601   if (anc_type == ANCR_LOOK_BEHIND_NOT) {
4602     np = node;
4603     do {
4604       NODE_SET_TYPE(np, NODE_LIST);  /* alt -> list */
4605     } while (IS_NOT_NULL(np = NODE_CDR(np)));
4606   }
4607   return 0;
4608 }
4609 
4610 static int
node_reduce_in_look_behind(Node * node)4611 node_reduce_in_look_behind(Node* node)
4612 {
4613   NodeType type;
4614   Node* body;
4615 
4616   if (NODE_TYPE(node) != NODE_QUANT) return 0;
4617 
4618   body = NODE_BODY(node);
4619   type = NODE_TYPE(body);
4620   if (type == NODE_STRING || type == NODE_CTYPE ||
4621       type == NODE_CCLASS || type == NODE_BACKREF) {
4622     QuantNode* qn = QUANT_(node);
4623     qn->upper = qn->lower;
4624     if (qn->upper == 0)
4625       return 1; /* removed */
4626   }
4627 
4628   return 0;
4629 }
4630 
4631 static int
list_reduce_in_look_behind(Node * node)4632 list_reduce_in_look_behind(Node* node)
4633 {
4634   int r;
4635 
4636   switch (NODE_TYPE(node)) {
4637   case NODE_QUANT:
4638     r = node_reduce_in_look_behind(node);
4639     if (r > 0) r = 0;
4640     break;
4641 
4642   case NODE_LIST:
4643     do {
4644       r = node_reduce_in_look_behind(NODE_CAR(node));
4645       if (r <= 0) break;
4646     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4647     break;
4648 
4649   default:
4650     r = 0;
4651     break;
4652   }
4653 
4654   return r;
4655 }
4656 
4657 static int
alt_reduce_in_look_behind(Node * node,regex_t * reg,ScanEnv * env)4658 alt_reduce_in_look_behind(Node* node, regex_t* reg, ScanEnv* env)
4659 {
4660   int r;
4661 
4662   switch (NODE_TYPE(node)) {
4663   case NODE_ALT:
4664     do {
4665       r = list_reduce_in_look_behind(NODE_CAR(node));
4666     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4667     break;
4668 
4669   default:
4670     r = list_reduce_in_look_behind(node);
4671     break;
4672   }
4673 
4674   return r;
4675 }
4676 
4677 static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env);
4678 
4679 static int
tune_look_behind(Node * node,regex_t * reg,int state,ScanEnv * env)4680 tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env)
4681 {
4682   int r;
4683   int state1;
4684   int used;
4685   MinMaxCharLen ci;
4686   Node* body;
4687   AnchorNode* an = ANCHOR_(node);
4688 
4689   used = FALSE;
4690   r = check_node_in_look_behind(NODE_ANCHOR_BODY(an),
4691                                 an->type == ANCR_LOOK_BEHIND_NOT ? 1 : 0,
4692                                 &used);
4693   if (r < 0) return r;
4694   if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4695 
4696   if (an->type == ANCR_LOOK_BEHIND_NOT)
4697     state1 = state | IN_NOT | IN_LOOK_BEHIND;
4698   else
4699     state1 = state | IN_LOOK_BEHIND;
4700 
4701   body = NODE_ANCHOR_BODY(an);
4702   /* Execute tune_tree(body) before call node_char_len().
4703      Because case-fold expansion must be done before node_char_len().
4704    */
4705   r = tune_tree(body, reg, state1, env);
4706   if (r != 0) return r;
4707 
4708   r = alt_reduce_in_look_behind(body, reg, env);
4709   if (r != 0) return r;
4710 
4711   r = node_char_len(body, reg, &ci, env);
4712   if (r >= 0) {
4713     /* #177: overflow in onigenc_step_back() */
4714     if ((ci.max != INFINITE_LEN && ci.max > LOOK_BEHIND_MAX_CHAR_LEN)
4715       || ci.min > LOOK_BEHIND_MAX_CHAR_LEN) {
4716       return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4717     }
4718 
4719     if (ci.min == 0 && ci.min_is_sure != 0 && used == FALSE) {
4720       if (an->type == ANCR_LOOK_BEHIND_NOT)
4721         r = onig_node_reset_fail(node);
4722       else
4723         r = onig_node_reset_empty(node);
4724 
4725       return r;
4726     }
4727 
4728     if (r == CHAR_LEN_TOP_ALT_FIXED) {
4729       if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND)) {
4730         r = divide_look_behind_alternatives(node);
4731         if (r == 0)
4732           r = tune_tree(node, reg, state, env);
4733       }
4734       else if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND))
4735         goto normal;
4736       else
4737         r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4738     }
4739     else { /* CHAR_LEN_NORMAL */
4740     normal:
4741       if (ci.min == INFINITE_LEN) {
4742         r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4743       }
4744       else {
4745         if (ci.min != ci.max &&
4746             ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND)) {
4747           r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4748         }
4749         else {
4750           Node* tail;
4751 
4752           /* check lead_node is already set by double call after
4753              divide_look_behind_alternatives() */
4754           if (IS_NULL(an->lead_node)) {
4755             an->char_min_len = ci.min;
4756             an->char_max_len = ci.max;
4757             r = get_tree_tail_literal(body, &tail, reg);
4758             if (r == GET_VALUE_FOUND) {
4759               r = onig_node_copy(&(an->lead_node), tail);
4760               if (r != 0) return r;
4761             }
4762           }
4763           r = ONIG_NORMAL;
4764         }
4765       }
4766     }
4767   }
4768 
4769   return r;
4770 }
4771 
4772 static int
tune_next(Node * node,Node * next_node,regex_t * reg)4773 tune_next(Node* node, Node* next_node, regex_t* reg)
4774 {
4775   NodeType type;
4776 
4777  retry:
4778   type = NODE_TYPE(node);
4779   if (type == NODE_QUANT) {
4780     QuantNode* qn = QUANT_(node);
4781     if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) {
4782 #ifdef USE_QUANT_PEEK_NEXT
4783       Node* n = get_tree_head_literal(next_node, 1, reg);
4784       /* '\0': for UTF-16BE etc... */
4785       if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') {
4786         qn->next_head_exact = n;
4787       }
4788 #endif
4789       /* automatic posseivation a*b ==> (?>a*)b */
4790       if (qn->lower <= 1) {
4791         if (is_strict_real_node(NODE_BODY(node))) {
4792           Node *x, *y;
4793           x = get_tree_head_literal(NODE_BODY(node), 0, reg);
4794           if (IS_NOT_NULL(x)) {
4795             y = get_tree_head_literal(next_node,  0, reg);
4796             if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) {
4797               Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK);
4798               CHECK_NULL_RETURN_MEMERR(en);
4799               NODE_STATUS_ADD(en, STRICT_REAL_REPEAT);
4800               node_swap(node, en);
4801               NODE_BODY(node) = en;
4802             }
4803           }
4804         }
4805       }
4806     }
4807   }
4808   else if (type == NODE_BAG) {
4809     BagNode* en = BAG_(node);
4810     if (en->type == BAG_MEMORY) {
4811       node = NODE_BODY(node);
4812       goto retry;
4813     }
4814   }
4815   return 0;
4816 }
4817 
4818 
4819 static int
is_all_code_len_1_items(int n,OnigCaseFoldCodeItem items[])4820 is_all_code_len_1_items(int n, OnigCaseFoldCodeItem items[])
4821 {
4822   int i;
4823 
4824   for (i = 0; i < n; i++) {
4825     OnigCaseFoldCodeItem* item = items + i;
4826     if (item->code_len != 1) return 0;
4827   }
4828 
4829   return 1;
4830 }
4831 
4832 static int
get_min_max_byte_len_case_fold_items(int n,OnigCaseFoldCodeItem items[],OnigLen * rmin,OnigLen * rmax)4833 get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[],
4834                                      OnigLen* rmin, OnigLen* rmax)
4835 {
4836   int i;
4837   OnigLen len, minlen, maxlen;
4838 
4839   minlen = INFINITE_LEN;
4840   maxlen = 0;
4841   for (i = 0; i < n; i++) {
4842     OnigCaseFoldCodeItem* item = items + i;
4843 
4844     len = item->byte_len;
4845     if (len < minlen) minlen = len;
4846     if (len > maxlen) maxlen = len;
4847   }
4848 
4849   *rmin = minlen;
4850   *rmax = maxlen;
4851   return 0;
4852 }
4853 
4854 static int
make_code_list_to_string(Node ** rnode,OnigEncoding enc,int n,OnigCodePoint codes[])4855 make_code_list_to_string(Node** rnode, OnigEncoding enc,
4856                          int n, OnigCodePoint codes[])
4857 {
4858   int r, i, len;
4859   Node* node;
4860   UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4861 
4862   *rnode = NULL_NODE;
4863   node = onig_node_new_str(NULL, NULL);
4864   CHECK_NULL_RETURN_MEMERR(node);
4865 
4866   for (i = 0; i < n; i++) {
4867     len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf);
4868     if (len < 0) {
4869       r = len;
4870       goto err;
4871     }
4872 
4873     r = onig_node_str_cat(node, buf, buf + len);
4874     if (r != 0) goto err;
4875   }
4876 
4877   *rnode = node;
4878   return 0;
4879 
4880  err:
4881   onig_node_free(node);
4882   return r;
4883 }
4884 
4885 static int
unravel_cf_node_add(Node ** rlist,Node * add)4886 unravel_cf_node_add(Node** rlist, Node* add)
4887 {
4888   Node *list;
4889 
4890   list = *rlist;
4891   if (IS_NULL(list)) {
4892     list = onig_node_new_list(add, NULL);
4893     CHECK_NULL_RETURN_MEMERR(list);
4894     *rlist = list;
4895   }
4896   else {
4897     Node* r = node_list_add(list, add);
4898     CHECK_NULL_RETURN_MEMERR(r);
4899   }
4900 
4901   return 0;
4902 }
4903 
4904 static int
unravel_cf_string_add(Node ** rlist,Node ** rsn,UChar * s,UChar * end,unsigned int flag)4905 unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end,
4906                       unsigned int flag)
4907 {
4908   int r;
4909   Node *sn, *list;
4910 
4911   list = *rlist;
4912   sn   = *rsn;
4913 
4914   if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) {
4915     r = onig_node_str_cat(sn, s, end);
4916   }
4917   else {
4918     sn = onig_node_new_str(s, end);
4919     CHECK_NULL_RETURN_MEMERR(sn);
4920 
4921     STR_(sn)->flag = flag;
4922     r = unravel_cf_node_add(&list, sn);
4923   }
4924 
4925   if (r == 0) {
4926     *rlist = list;
4927     *rsn = sn;
4928   }
4929   return r;
4930 }
4931 
4932 static int
unravel_cf_string_alt_or_cc_add(Node ** rlist,int n,OnigCaseFoldCodeItem items[],OnigEncoding enc,OnigCaseFoldType case_fold_flag,UChar * s,UChar * end)4933 unravel_cf_string_alt_or_cc_add(Node** rlist, int n,
4934             OnigCaseFoldCodeItem items[], OnigEncoding enc,
4935             OnigCaseFoldType case_fold_flag, UChar* s, UChar* end)
4936 {
4937   int r, i;
4938   Node* node;
4939 
4940   if (is_all_code_len_1_items(n, items)) {
4941     OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */
4942 
4943     codes[0] = ONIGENC_MBC_TO_CODE(enc, s, end);
4944     for (i = 0; i < n; i++) {
4945       OnigCaseFoldCodeItem* item = items + i;
4946       codes[i+1] = item->code[0];
4947     }
4948     r = onig_new_cclass_with_code_list(&node, enc, n + 1, codes);
4949     if (r != 0) return r;
4950   }
4951   else {
4952     Node *snode, *alt, *curr;
4953 
4954     snode = onig_node_new_str(s, end);
4955     CHECK_NULL_RETURN_MEMERR(snode);
4956     node = curr = onig_node_new_alt(snode, NULL_NODE);
4957     if (IS_NULL(curr)) {
4958       onig_node_free(snode);
4959       return ONIGERR_MEMORY;
4960     }
4961 
4962     r = 0;
4963     for (i = 0; i < n; i++) {
4964       OnigCaseFoldCodeItem* item = items + i;
4965       r = make_code_list_to_string(&snode, enc, item->code_len, item->code);
4966       if (r != 0) {
4967         onig_node_free(node);
4968         return r;
4969       }
4970 
4971       alt = onig_node_new_alt(snode, NULL_NODE);
4972       if (IS_NULL(alt)) {
4973         onig_node_free(snode);
4974         onig_node_free(node);
4975         return ONIGERR_MEMORY;
4976       }
4977 
4978       NODE_CDR(curr) = alt;
4979       curr = alt;
4980     }
4981   }
4982 
4983   r = unravel_cf_node_add(rlist, node);
4984   if (r != 0) onig_node_free(node);
4985   return r;
4986 }
4987 
4988 static int
unravel_cf_look_behind_add(Node ** rlist,Node ** rsn,int n,OnigCaseFoldCodeItem items[],OnigEncoding enc,UChar * s,OnigLen one_len)4989 unravel_cf_look_behind_add(Node** rlist, Node** rsn,
4990                 int n, OnigCaseFoldCodeItem items[], OnigEncoding enc,
4991                 UChar* s, OnigLen one_len)
4992 {
4993   int r, i, found;
4994 
4995   found = 0;
4996   for (i = 0; i < n; i++) {
4997     OnigCaseFoldCodeItem* item = items + i;
4998     if (item->byte_len == one_len) {
4999       if (item->code_len == 1) {
5000         found = 1;
5001       }
5002     }
5003   }
5004 
5005   if (found == 0) {
5006     r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */);
5007   }
5008   else {
5009     Node* node;
5010     OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */
5011 
5012     found = 0;
5013     codes[found++] = ONIGENC_MBC_TO_CODE(enc, s, s + one_len);
5014     for (i = 0; i < n; i++) {
5015       OnigCaseFoldCodeItem* item = items + i;
5016       if (item->byte_len == one_len) {
5017         if (item->code_len == 1) {
5018           codes[found++] = item->code[0];
5019         }
5020       }
5021     }
5022     r = onig_new_cclass_with_code_list(&node, enc, found, codes);
5023     if (r != 0) return r;
5024 
5025     r = unravel_cf_node_add(rlist, node);
5026     if (r != 0) onig_node_free(node);
5027 
5028     *rsn = NULL_NODE;
5029   }
5030 
5031   return r;
5032 }
5033 
5034 static int
unravel_case_fold_string(Node * node,regex_t * reg,int state)5035 unravel_case_fold_string(Node* node, regex_t* reg, int state)
5036 {
5037   int r, n, in_look_behind;
5038   OnigLen min_len, max_len, one_len;
5039   UChar *start, *end, *p, *q;
5040   StrNode* snode;
5041   Node *sn, *list;
5042   OnigEncoding enc;
5043   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
5044 
5045   if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0;
5046 
5047   NODE_STATUS_REMOVE(node, IGNORECASE);
5048   snode = STR_(node);
5049   start = snode->s;
5050   end   = snode->end;
5051   if (start >= end) return 0;
5052 
5053   in_look_behind = (state & IN_LOOK_BEHIND) != 0;
5054   enc = reg->enc;
5055 
5056   list = sn = NULL_NODE;
5057   p = start;
5058   while (p < end) {
5059     n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end,
5060                                            items);
5061     if (n < 0) {
5062       r = n;
5063       goto err;
5064     }
5065 
5066     one_len = (OnigLen )enclen(enc, p);
5067     if (n == 0) {
5068       q = p + one_len;
5069       r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */);
5070       if (r != 0) goto err;
5071     }
5072     else {
5073       if (in_look_behind != 0) {
5074         q = p + one_len;
5075         if (items[0].byte_len != one_len) {
5076           r = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, q,
5077                                                  items);
5078           if (r < 0) goto err;
5079           n = r;
5080         }
5081         r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len);
5082         if (r != 0) goto err;
5083       }
5084       else {
5085         get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len);
5086         if (min_len != max_len) {
5087           r = ONIGERR_PARSER_BUG;
5088           goto err;
5089         }
5090 
5091         q = p + max_len;
5092         r = unravel_cf_string_alt_or_cc_add(&list, n, items, enc,
5093                                             reg->case_fold_flag, p, q);
5094         if (r != 0) goto err;
5095         sn = NULL_NODE;
5096       }
5097     }
5098 
5099     p = q;
5100   }
5101 
5102   if (IS_NOT_NULL(list)) {
5103     if (node_list_len(list) == 1) {
5104       node_swap(node, NODE_CAR(list));
5105     }
5106     else {
5107       node_swap(node, list);
5108     }
5109     onig_node_free(list);
5110   }
5111   else {
5112     node_swap(node, sn);
5113     onig_node_free(sn);
5114   }
5115   return 0;
5116 
5117  err:
5118   if (IS_NOT_NULL(list))
5119     onig_node_free(list);
5120   else if (IS_NOT_NULL(sn))
5121     onig_node_free(sn);
5122 
5123   return r;
5124 }
5125 
5126 #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
5127 static enum BodyEmptyType
quantifiers_memory_node_info(Node * node)5128 quantifiers_memory_node_info(Node* node)
5129 {
5130   int r = BODY_MAY_BE_EMPTY;
5131 
5132   switch (NODE_TYPE(node)) {
5133   case NODE_LIST:
5134   case NODE_ALT:
5135     {
5136       int v;
5137       do {
5138         v = quantifiers_memory_node_info(NODE_CAR(node));
5139         if (v > r) r = v;
5140       } while (IS_NOT_NULL(node = NODE_CDR(node)));
5141     }
5142     break;
5143 
5144 #ifdef USE_CALL
5145   case NODE_CALL:
5146     if (NODE_IS_RECURSION(node)) {
5147       return BODY_MAY_BE_EMPTY_REC; /* tiny version */
5148     }
5149     else
5150       r = quantifiers_memory_node_info(NODE_BODY(node));
5151     break;
5152 #endif
5153 
5154   case NODE_QUANT:
5155     {
5156       QuantNode* qn = QUANT_(node);
5157       if (qn->upper != 0) {
5158         r = quantifiers_memory_node_info(NODE_BODY(node));
5159       }
5160     }
5161     break;
5162 
5163   case NODE_BAG:
5164     {
5165       BagNode* en = BAG_(node);
5166       switch (en->type) {
5167       case BAG_MEMORY:
5168         if (NODE_IS_RECURSION(node)) {
5169           return BODY_MAY_BE_EMPTY_REC;
5170         }
5171         return BODY_MAY_BE_EMPTY_MEM;
5172         break;
5173 
5174       case BAG_OPTION:
5175       case BAG_STOP_BACKTRACK:
5176         r = quantifiers_memory_node_info(NODE_BODY(node));
5177         break;
5178       case BAG_IF_ELSE:
5179         {
5180           int v;
5181           r = quantifiers_memory_node_info(NODE_BODY(node));
5182           if (IS_NOT_NULL(en->te.Then)) {
5183             v = quantifiers_memory_node_info(en->te.Then);
5184             if (v > r) r = v;
5185           }
5186           if (IS_NOT_NULL(en->te.Else)) {
5187             v = quantifiers_memory_node_info(en->te.Else);
5188             if (v > r) r = v;
5189           }
5190         }
5191         break;
5192       }
5193     }
5194     break;
5195 
5196   case NODE_BACKREF:
5197   case NODE_STRING:
5198   case NODE_CTYPE:
5199   case NODE_CCLASS:
5200   case NODE_ANCHOR:
5201   case NODE_GIMMICK:
5202   default:
5203     break;
5204   }
5205 
5206   return r;
5207 }
5208 #endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */
5209 
5210 
5211 #ifdef USE_CALL
5212 
5213 #ifdef __GNUC__
5214 __inline
5215 #endif
5216 static int
tune_call_node_call(CallNode * cn,ScanEnv * env,int state)5217 tune_call_node_call(CallNode* cn, ScanEnv* env, int state)
5218 {
5219   MemEnv* mem_env = SCANENV_MEMENV(env);
5220 
5221   if (cn->by_number != 0) {
5222     int gnum = cn->group_num;
5223 
5224     if (env->num_named > 0 &&
5225         IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5226         ! OPTON_CAPTURE_GROUP(env->options)) {
5227       return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
5228     }
5229 
5230     if (gnum > env->num_mem) {
5231       onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_GROUP_REFERENCE,
5232                                      cn->name, cn->name_end);
5233       return ONIGERR_UNDEFINED_GROUP_REFERENCE;
5234     }
5235 
5236   set_call_attr:
5237     NODE_CALL_BODY(cn) = mem_env[cn->group_num].mem_node;
5238     if (IS_NULL(NODE_CALL_BODY(cn))) {
5239       onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
5240                                      cn->name, cn->name_end);
5241       return ONIGERR_UNDEFINED_NAME_REFERENCE;
5242     }
5243   }
5244   else {
5245     int *refs;
5246 
5247     int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs);
5248     if (n <= 0) {
5249       onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
5250                                      cn->name, cn->name_end);
5251       return ONIGERR_UNDEFINED_NAME_REFERENCE;
5252     }
5253     else if (n > 1) {
5254       onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL,
5255                                      cn->name, cn->name_end);
5256       return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
5257     }
5258     else {
5259       cn->group_num = refs[0];
5260       goto set_call_attr;
5261     }
5262   }
5263 
5264   return 0;
5265 }
5266 
5267 static void
tune_call2_call(Node * node)5268 tune_call2_call(Node* node)
5269 {
5270   switch (NODE_TYPE(node)) {
5271   case NODE_LIST:
5272   case NODE_ALT:
5273     do {
5274       tune_call2_call(NODE_CAR(node));
5275     } while (IS_NOT_NULL(node = NODE_CDR(node)));
5276     break;
5277 
5278   case NODE_QUANT:
5279     tune_call2_call(NODE_BODY(node));
5280     break;
5281 
5282   case NODE_ANCHOR:
5283     if (ANCHOR_HAS_BODY(ANCHOR_(node)))
5284       tune_call2_call(NODE_BODY(node));
5285     break;
5286 
5287   case NODE_BAG:
5288     {
5289       BagNode* en = BAG_(node);
5290 
5291       if (en->type == BAG_MEMORY) {
5292         if (! NODE_IS_MARK1(node)) {
5293           NODE_STATUS_ADD(node, MARK1);
5294           tune_call2_call(NODE_BODY(node));
5295           NODE_STATUS_REMOVE(node, MARK1);
5296         }
5297       }
5298       else if (en->type == BAG_IF_ELSE) {
5299         tune_call2_call(NODE_BODY(node));
5300         if (IS_NOT_NULL(en->te.Then))
5301           tune_call2_call(en->te.Then);
5302         if (IS_NOT_NULL(en->te.Else))
5303           tune_call2_call(en->te.Else);
5304       }
5305       else {
5306         tune_call2_call(NODE_BODY(node));
5307       }
5308     }
5309     break;
5310 
5311   case NODE_CALL:
5312     if (! NODE_IS_MARK1(node)) {
5313       NODE_STATUS_ADD(node, MARK1);
5314       {
5315         CallNode* cn = CALL_(node);
5316         Node* called = NODE_CALL_BODY(cn);
5317 
5318         cn->entry_count++;
5319 
5320         NODE_STATUS_ADD(called, CALLED);
5321         BAG_(called)->m.entry_count++;
5322         tune_call2_call(called);
5323       }
5324       NODE_STATUS_REMOVE(node, MARK1);
5325     }
5326     break;
5327 
5328   default:
5329     break;
5330   }
5331 }
5332 
5333 static int
tune_call(Node * node,ScanEnv * env,int state)5334 tune_call(Node* node, ScanEnv* env, int state)
5335 {
5336   int r;
5337 
5338   switch (NODE_TYPE(node)) {
5339   case NODE_LIST:
5340   case NODE_ALT:
5341     do {
5342       r = tune_call(NODE_CAR(node), env, state);
5343     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5344     break;
5345 
5346   case NODE_QUANT:
5347     if (QUANT_(node)->upper == 0)
5348       state |= IN_ZERO_REPEAT;
5349 
5350     r = tune_call(NODE_BODY(node), env, state);
5351     break;
5352 
5353   case NODE_ANCHOR:
5354     if (ANCHOR_HAS_BODY(ANCHOR_(node)))
5355       r = tune_call(NODE_BODY(node), env, state);
5356     else
5357       r = 0;
5358     break;
5359 
5360   case NODE_BAG:
5361     {
5362       BagNode* en = BAG_(node);
5363 
5364       if (en->type == BAG_MEMORY) {
5365         if ((state & IN_ZERO_REPEAT) != 0) {
5366           NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
5367           BAG_(node)->m.entry_count--;
5368         }
5369         r = tune_call(NODE_BODY(node), env, state);
5370       }
5371       else if (en->type == BAG_IF_ELSE) {
5372         r = tune_call(NODE_BODY(node), env, state);
5373         if (r != 0) return r;
5374         if (IS_NOT_NULL(en->te.Then)) {
5375           r = tune_call(en->te.Then, env, state);
5376           if (r != 0) return r;
5377         }
5378         if (IS_NOT_NULL(en->te.Else))
5379           r = tune_call(en->te.Else, env, state);
5380       }
5381       else
5382         r = tune_call(NODE_BODY(node), env, state);
5383     }
5384     break;
5385 
5386   case NODE_CALL:
5387     if ((state & IN_ZERO_REPEAT) != 0) {
5388       NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
5389       CALL_(node)->entry_count--;
5390     }
5391 
5392     r = tune_call_node_call(CALL_(node), env, state);
5393     break;
5394 
5395   default:
5396     r = 0;
5397     break;
5398   }
5399 
5400   return r;
5401 }
5402 
5403 static int
tune_call2(Node * node)5404 tune_call2(Node* node)
5405 {
5406   int r = 0;
5407 
5408   switch (NODE_TYPE(node)) {
5409   case NODE_LIST:
5410   case NODE_ALT:
5411     do {
5412       r = tune_call2(NODE_CAR(node));
5413     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5414     break;
5415 
5416   case NODE_QUANT:
5417     if (QUANT_(node)->upper != 0)
5418       r = tune_call2(NODE_BODY(node));
5419     break;
5420 
5421   case NODE_ANCHOR:
5422     if (ANCHOR_HAS_BODY(ANCHOR_(node)))
5423       r = tune_call2(NODE_BODY(node));
5424     break;
5425 
5426   case NODE_BAG:
5427     if (! NODE_IS_IN_ZERO_REPEAT(node))
5428       r = tune_call2(NODE_BODY(node));
5429 
5430     {
5431       BagNode* en = BAG_(node);
5432 
5433       if (r != 0) return r;
5434       if (en->type == BAG_IF_ELSE) {
5435         if (IS_NOT_NULL(en->te.Then)) {
5436           r = tune_call2(en->te.Then);
5437           if (r != 0) return r;
5438         }
5439         if (IS_NOT_NULL(en->te.Else))
5440           r = tune_call2(en->te.Else);
5441       }
5442     }
5443     break;
5444 
5445   case NODE_CALL:
5446     if (! NODE_IS_IN_ZERO_REPEAT(node)) {
5447       tune_call2_call(node);
5448     }
5449     break;
5450 
5451   default:
5452     break;
5453   }
5454 
5455   return r;
5456 }
5457 
5458 
5459 static void
tune_called_state_call(Node * node,int state)5460 tune_called_state_call(Node* node, int state)
5461 {
5462   switch (NODE_TYPE(node)) {
5463   case NODE_ALT:
5464     state |= IN_ALT;
5465     /* fall */
5466   case NODE_LIST:
5467     do {
5468       tune_called_state_call(NODE_CAR(node), state);
5469     } while (IS_NOT_NULL(node = NODE_CDR(node)));
5470     break;
5471 
5472   case NODE_QUANT:
5473     {
5474       QuantNode* qn = QUANT_(node);
5475 
5476       if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5477         state |= IN_REAL_REPEAT;
5478       if (qn->lower != qn->upper)
5479         state |= IN_VAR_REPEAT;
5480 
5481       tune_called_state_call(NODE_QUANT_BODY(qn), state);
5482     }
5483     break;
5484 
5485   case NODE_ANCHOR:
5486     {
5487       AnchorNode* an = ANCHOR_(node);
5488 
5489       switch (an->type) {
5490       case ANCR_PREC_READ_NOT:
5491       case ANCR_LOOK_BEHIND_NOT:
5492         state |= IN_NOT;
5493         /* fall */
5494       case ANCR_PREC_READ:
5495       case ANCR_LOOK_BEHIND:
5496         tune_called_state_call(NODE_ANCHOR_BODY(an), state);
5497         break;
5498       default:
5499         break;
5500       }
5501     }
5502     break;
5503 
5504   case NODE_BAG:
5505     {
5506       BagNode* en = BAG_(node);
5507 
5508       if (en->type == BAG_MEMORY) {
5509         if (NODE_IS_MARK1(node)) {
5510           if ((~en->m.called_state & state) != 0) {
5511             en->m.called_state |= state;
5512             tune_called_state_call(NODE_BODY(node), state);
5513           }
5514         }
5515         else {
5516           NODE_STATUS_ADD(node, MARK1);
5517           en->m.called_state |= state;
5518           tune_called_state_call(NODE_BODY(node), state);
5519           NODE_STATUS_REMOVE(node, MARK1);
5520         }
5521       }
5522       else if (en->type == BAG_IF_ELSE) {
5523         state |= IN_ALT;
5524         tune_called_state_call(NODE_BODY(node), state);
5525         if (IS_NOT_NULL(en->te.Then)) {
5526           tune_called_state_call(en->te.Then, state);
5527         }
5528         if (IS_NOT_NULL(en->te.Else))
5529           tune_called_state_call(en->te.Else, state);
5530       }
5531       else {
5532         tune_called_state_call(NODE_BODY(node), state);
5533       }
5534     }
5535     break;
5536 
5537   case NODE_CALL:
5538     tune_called_state_call(NODE_BODY(node), state);
5539     break;
5540 
5541   default:
5542     break;
5543   }
5544 }
5545 
5546 static void
tune_called_state(Node * node,int state)5547 tune_called_state(Node* node, int state)
5548 {
5549   switch (NODE_TYPE(node)) {
5550   case NODE_ALT:
5551     state |= IN_ALT;
5552     /* fall */
5553   case NODE_LIST:
5554     do {
5555       tune_called_state(NODE_CAR(node), state);
5556     } while (IS_NOT_NULL(node = NODE_CDR(node)));
5557     break;
5558 
5559 #ifdef USE_CALL
5560   case NODE_CALL:
5561     tune_called_state_call(node, state);
5562     break;
5563 #endif
5564 
5565   case NODE_BAG:
5566     {
5567       BagNode* en = BAG_(node);
5568 
5569       switch (en->type) {
5570       case BAG_MEMORY:
5571         if (en->m.entry_count > 1)
5572           state |= IN_MULTI_ENTRY;
5573 
5574         en->m.called_state |= state;
5575         /* fall */
5576       case BAG_OPTION:
5577       case BAG_STOP_BACKTRACK:
5578         tune_called_state(NODE_BODY(node), state);
5579         break;
5580       case BAG_IF_ELSE:
5581         state |= IN_ALT;
5582         tune_called_state(NODE_BODY(node), state);
5583         if (IS_NOT_NULL(en->te.Then))
5584           tune_called_state(en->te.Then, state);
5585         if (IS_NOT_NULL(en->te.Else))
5586           tune_called_state(en->te.Else, state);
5587         break;
5588       }
5589     }
5590     break;
5591 
5592   case NODE_QUANT:
5593     {
5594       QuantNode* qn = QUANT_(node);
5595 
5596       if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5597         state |= IN_REAL_REPEAT;
5598       if (qn->lower != qn->upper)
5599         state |= IN_VAR_REPEAT;
5600 
5601       tune_called_state(NODE_QUANT_BODY(qn), state);
5602     }
5603     break;
5604 
5605   case NODE_ANCHOR:
5606     {
5607       AnchorNode* an = ANCHOR_(node);
5608 
5609       switch (an->type) {
5610       case ANCR_PREC_READ_NOT:
5611       case ANCR_LOOK_BEHIND_NOT:
5612         state |= IN_NOT;
5613         /* fall */
5614       case ANCR_PREC_READ:
5615       case ANCR_LOOK_BEHIND:
5616         tune_called_state(NODE_ANCHOR_BODY(an), state);
5617         break;
5618       default:
5619         break;
5620       }
5621     }
5622     break;
5623 
5624   case NODE_BACKREF:
5625   case NODE_STRING:
5626   case NODE_CTYPE:
5627   case NODE_CCLASS:
5628   case NODE_GIMMICK:
5629   default:
5630     break;
5631   }
5632 }
5633 
5634 #endif  /* USE_CALL */
5635 
5636 
5637 #ifdef __GNUC__
5638 __inline
5639 #endif
5640 static int
tune_anchor(Node * node,regex_t * reg,int state,ScanEnv * env)5641 tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env)
5642 {
5643   int r;
5644   AnchorNode* an = ANCHOR_(node);
5645 
5646   switch (an->type) {
5647   case ANCR_PREC_READ:
5648     r = tune_tree(NODE_ANCHOR_BODY(an), reg, state, env);
5649     break;
5650   case ANCR_PREC_READ_NOT:
5651     r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env);
5652     break;
5653 
5654   case ANCR_LOOK_BEHIND:
5655   case ANCR_LOOK_BEHIND_NOT:
5656     r = tune_look_behind(node, reg, state, env);
5657     break;
5658 
5659   default:
5660     r = 0;
5661     break;
5662   }
5663 
5664   return r;
5665 }
5666 
5667 #ifdef __GNUC__
5668 __inline
5669 #endif
5670 static int
tune_quant(Node * node,regex_t * reg,int state,ScanEnv * env)5671 tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env)
5672 {
5673   int r;
5674   QuantNode* qn = QUANT_(node);
5675   Node* body = NODE_BODY(node);
5676 
5677   if ((state & IN_REAL_REPEAT) != 0) {
5678     NODE_STATUS_ADD(node, IN_REAL_REPEAT);
5679   }
5680   if ((state & IN_MULTI_ENTRY) != 0) {
5681     NODE_STATUS_ADD(node, IN_MULTI_ENTRY);
5682   }
5683 
5684   if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) {
5685     OnigLen d = node_min_byte_len(body, env);
5686     if (d == 0) {
5687 #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
5688       qn->emptiness = quantifiers_memory_node_info(body);
5689 #else
5690       qn->emptiness = BODY_MAY_BE_EMPTY;
5691 #endif
5692     }
5693   }
5694 
5695   if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5696     state |= IN_REAL_REPEAT;
5697   if (qn->lower != qn->upper)
5698     state |= IN_VAR_REPEAT;
5699 
5700   r = tune_tree(body, reg, state, env);
5701   if (r != 0) return r;
5702 
5703   /* expand string */
5704 #define EXPAND_STRING_MAX_LENGTH  100
5705   if (NODE_TYPE(body) == NODE_STRING) {
5706     if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper &&
5707         qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
5708       int len = NODE_STRING_LEN(body);
5709 
5710       if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
5711         int i, n = qn->lower;
5712         node_conv_to_str_node(node, body);
5713         for (i = 0; i < n; i++) {
5714           r = node_str_node_cat(node, body);
5715           if (r != 0) return r;
5716         }
5717         onig_node_free(body);
5718         return r;
5719       }
5720     }
5721   }
5722 
5723   if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) {
5724     if (NODE_TYPE(body) == NODE_QUANT) {
5725       QuantNode* tqn = QUANT_(body);
5726       if (IS_NOT_NULL(tqn->head_exact)) {
5727         qn->head_exact  = tqn->head_exact;
5728         tqn->head_exact = NULL;
5729       }
5730     }
5731     else {
5732       qn->head_exact = get_tree_head_literal(NODE_BODY(node), 1, reg);
5733     }
5734   }
5735 
5736   return r;
5737 }
5738 
5739 /* tune_tree does the following work.
5740  1. check empty loop. (set qn->emptiness)
5741  2. expand ignore-case in char class.
5742  3. set memory status bit flags. (reg->mem_stats)
5743  4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
5744  5. find invalid patterns in look-behind.
5745  6. expand repeated string.
5746  */
5747 static int
tune_tree(Node * node,regex_t * reg,int state,ScanEnv * env)5748 tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
5749 {
5750   int r = 0;
5751 
5752   switch (NODE_TYPE(node)) {
5753   case NODE_LIST:
5754     {
5755       Node* prev = NULL_NODE;
5756       do {
5757         r = tune_tree(NODE_CAR(node), reg, state, env);
5758         if (IS_NOT_NULL(prev) && r == 0) {
5759           r = tune_next(prev, NODE_CAR(node), reg);
5760         }
5761         prev = NODE_CAR(node);
5762       } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5763     }
5764     break;
5765 
5766   case NODE_ALT:
5767     do {
5768       r = tune_tree(NODE_CAR(node), reg, (state | IN_ALT), env);
5769     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5770     break;
5771 
5772   case NODE_STRING:
5773     if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) {
5774       r = unravel_case_fold_string(node, reg, state);
5775     }
5776     break;
5777 
5778   case NODE_BACKREF:
5779     {
5780       int i;
5781       int* p;
5782       BackRefNode* br = BACKREF_(node);
5783       p = BACKREFS_P(br);
5784       for (i = 0; i < br->back_num; i++) {
5785         if (p[i] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
5786         MEM_STATUS_ON(env->backrefed_mem, p[i]);
5787 #if 0
5788 #ifdef USE_BACKREF_WITH_LEVEL
5789         if (NODE_IS_NEST_LEVEL(node)) {
5790           MEM_STATUS_ON(env->backtrack_mem, p[i]);
5791         }
5792 #endif
5793 #else
5794         /* More precisely, it should be checked whether alt/repeat exists before
5795            the subject capture node, and then this backreference position
5796            exists before (or in) the capture node. */
5797         MEM_STATUS_ON(env->backtrack_mem, p[i]);
5798 #endif
5799       }
5800     }
5801     break;
5802 
5803   case NODE_BAG:
5804     {
5805       BagNode* en = BAG_(node);
5806 
5807       switch (en->type) {
5808       case BAG_OPTION:
5809         {
5810           OnigOptionType options = reg->options;
5811           reg->options = BAG_(node)->o.options;
5812           r = tune_tree(NODE_BODY(node), reg, state, env);
5813           reg->options = options;
5814         }
5815         break;
5816 
5817       case BAG_MEMORY:
5818 #ifdef USE_CALL
5819         state |= en->m.called_state;
5820 #endif
5821 
5822         if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_MULTI_ENTRY)) != 0
5823             || NODE_IS_RECURSION(node)) {
5824           MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
5825         }
5826         r = tune_tree(NODE_BODY(node), reg, state, env);
5827         break;
5828 
5829       case BAG_STOP_BACKTRACK:
5830         {
5831           Node* target = NODE_BODY(node);
5832           r = tune_tree(target, reg, state, env);
5833           if (NODE_TYPE(target) == NODE_QUANT) {
5834             QuantNode* tqn = QUANT_(target);
5835             if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 &&
5836                 tqn->greedy != 0) {  /* (?>a*), a*+ etc... */
5837               if (is_strict_real_node(NODE_BODY(target)))
5838                 NODE_STATUS_ADD(node, STRICT_REAL_REPEAT);
5839             }
5840           }
5841         }
5842         break;
5843 
5844       case BAG_IF_ELSE:
5845         r = tune_tree(NODE_BODY(node), reg, (state | IN_ALT), env);
5846         if (r != 0) return r;
5847         if (IS_NOT_NULL(en->te.Then)) {
5848           r = tune_tree(en->te.Then, reg, (state | IN_ALT), env);
5849           if (r != 0) return r;
5850         }
5851         if (IS_NOT_NULL(en->te.Else))
5852           r = tune_tree(en->te.Else, reg, (state | IN_ALT), env);
5853         break;
5854       }
5855     }
5856     break;
5857 
5858   case NODE_QUANT:
5859     r = tune_quant(node, reg, state, env);
5860     break;
5861 
5862   case NODE_ANCHOR:
5863     r = tune_anchor(node, reg, state, env);
5864     break;
5865 
5866 #ifdef USE_CALL
5867   case NODE_CALL:
5868 #endif
5869   case NODE_CTYPE:
5870   case NODE_CCLASS:
5871   case NODE_GIMMICK:
5872   default:
5873     break;
5874   }
5875 
5876   return r;
5877 }
5878 
5879 static int
set_sunday_quick_search_or_bmh_skip_table(regex_t * reg,int case_expand,UChar * s,UChar * end,UChar skip[],int * roffset)5880 set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand,
5881                                           UChar* s, UChar* end,
5882                                           UChar skip[], int* roffset)
5883 {
5884   int i, j, k, len, offset;
5885   int n, clen;
5886   UChar* p;
5887   OnigEncoding enc;
5888   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
5889   UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
5890 
5891   enc = reg->enc;
5892   offset = ENC_GET_SKIP_OFFSET(enc);
5893   if (offset == ENC_SKIP_OFFSET_1_OR_0) {
5894     UChar* p = s;
5895     while (1) {
5896       len = enclen(enc, p);
5897       if (p + len >= end) {
5898         if (len == 1) offset = 1;
5899         else          offset = 0;
5900         break;
5901       }
5902       p += len;
5903     }
5904   }
5905 
5906   len = (int )(end - s);
5907   if (len + offset >= UCHAR_MAX)
5908     return ONIGERR_PARSER_BUG;
5909 
5910   *roffset = offset;
5911 
5912   for (i = 0; i < CHAR_MAP_SIZE; i++) {
5913     skip[i] = (UChar )(len + offset);
5914   }
5915 
5916   for (p = s; p < end; ) {
5917     int z;
5918 
5919     clen = enclen(enc, p);
5920     if (p + clen > end) clen = (int )(end - p);
5921 
5922     len = (int )(end - p);
5923     for (j = 0; j < clen; j++) {
5924       z = len - j + (offset - 1);
5925       if (z <= 0) break;
5926       skip[p[j]] = z;
5927     }
5928 
5929     if (case_expand != 0) {
5930       n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
5931                                              p, end, items);
5932       for (k = 0; k < n; k++) {
5933         ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf);
5934         for (j = 0; j < clen; j++) {
5935           z = len - j + (offset - 1);
5936           if (z <= 0) break;
5937           if (skip[buf[j]] > z)
5938             skip[buf[j]] = z;
5939         }
5940       }
5941     }
5942 
5943     p += clen;
5944   }
5945 
5946   return 0;
5947 }
5948 
5949 
5950 #define OPT_EXACT_MAXLEN   24
5951 
5952 #if OPT_EXACT_MAXLEN >= UCHAR_MAX
5953 #error Too big OPT_EXACT_MAXLEN
5954 #endif
5955 
5956 typedef struct {
5957   MinMaxLen        mm;
5958   OnigEncoding     enc;
5959   OnigCaseFoldType case_fold_flag;
5960   ScanEnv*         scan_env;
5961 } OptEnv;
5962 
5963 typedef struct {
5964   int left;
5965   int right;
5966 } OptAnc;
5967 
5968 typedef struct {
5969   MinMaxLen  mm;   /* position */
5970   OptAnc     anc;
5971   int        reach_end;
5972   int        len;
5973   UChar      s[OPT_EXACT_MAXLEN];
5974 } OptStr;
5975 
5976 typedef struct {
5977   MinMaxLen mm;     /* position */
5978   OptAnc    anc;
5979   int       value;  /* weighted value */
5980   UChar     map[CHAR_MAP_SIZE];
5981 } OptMap;
5982 
5983 typedef struct {
5984   MinMaxLen len;
5985   OptAnc  anc;
5986   OptStr  sb;     /* boundary */
5987   OptStr  sm;     /* middle */
5988   OptStr  spr;    /* prec read (?=...) */
5989   OptMap  map;    /* boundary */
5990 } OptNode;
5991 
5992 
5993 static int
map_position_value(OnigEncoding enc,int i)5994 map_position_value(OnigEncoding enc, int i)
5995 {
5996   static const short int Vals[] = {
5997      5,  1,  1,  1,  1,  1,  1,  1,  1, 10, 10,  1,  1, 10,  1,  1,
5998      1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
5999     12,  4,  7,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,
6000      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,
6001      5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
6002      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  6,  5,  5,  5,
6003      5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
6004      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  1
6005   };
6006 
6007   if (i < (int )(sizeof(Vals)/sizeof(Vals[0]))) {
6008     if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
6009       return 20;
6010     else
6011       return (int )Vals[i];
6012   }
6013   else
6014     return 4;   /* Take it easy. */
6015 }
6016 
6017 static int
distance_value(MinMaxLen * mm)6018 distance_value(MinMaxLen* mm)
6019 {
6020   /* 1000 / (min-max-dist + 1) */
6021   static const short int dist_vals[] = {
6022     1000,  500,  333,  250,  200,  167,  143,  125,  111,  100,
6023       91,   83,   77,   71,   67,   63,   59,   56,   53,   50,
6024       48,   45,   43,   42,   40,   38,   37,   36,   34,   33,
6025       32,   31,   30,   29,   29,   28,   27,   26,   26,   25,
6026       24,   24,   23,   23,   22,   22,   21,   21,   20,   20,
6027       20,   19,   19,   19,   18,   18,   18,   17,   17,   17,
6028       16,   16,   16,   16,   15,   15,   15,   15,   14,   14,
6029       14,   14,   14,   14,   13,   13,   13,   13,   13,   13,
6030       12,   12,   12,   12,   12,   12,   11,   11,   11,   11,
6031       11,   11,   11,   11,   11,   10,   10,   10,   10,   10
6032   };
6033 
6034   OnigLen d;
6035 
6036   if (mm->max == INFINITE_LEN) return 0;
6037 
6038   d = mm->max - mm->min;
6039   if (d < (OnigLen )(sizeof(dist_vals)/sizeof(dist_vals[0])))
6040     /* return dist_vals[d] * 16 / (mm->min + 12); */
6041     return (int )dist_vals[d];
6042   else
6043     return 1;
6044 }
6045 
6046 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)6047 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
6048 {
6049   if (v2 <= 0) return -1;
6050   if (v1 <= 0) return  1;
6051 
6052   v1 *= distance_value(d1);
6053   v2 *= distance_value(d2);
6054 
6055   if (v2 > v1) return  1;
6056   if (v2 < v1) return -1;
6057 
6058   if (d2->min < d1->min) return  1;
6059   if (d2->min > d1->min) return -1;
6060   return 0;
6061 }
6062 
6063 static void
copy_opt_env(OptEnv * to,OptEnv * from)6064 copy_opt_env(OptEnv* to, OptEnv* from)
6065 {
6066   *to = *from;
6067 }
6068 
6069 static void
clear_opt_anc_info(OptAnc * a)6070 clear_opt_anc_info(OptAnc* a)
6071 {
6072   a->left  = 0;
6073   a->right = 0;
6074 }
6075 
6076 static void
copy_opt_anc_info(OptAnc * to,OptAnc * from)6077 copy_opt_anc_info(OptAnc* to, OptAnc* from)
6078 {
6079   *to = *from;
6080 }
6081 
6082 static void
concat_opt_anc_info(OptAnc * to,OptAnc * left,OptAnc * right,OnigLen left_len,OnigLen right_len)6083 concat_opt_anc_info(OptAnc* to, OptAnc* left, OptAnc* right,
6084                     OnigLen left_len, OnigLen right_len)
6085 {
6086   clear_opt_anc_info(to);
6087 
6088   to->left = left->left;
6089   if (left_len == 0) {
6090     to->left |= right->left;
6091   }
6092 
6093   to->right = right->right;
6094   if (right_len == 0) {
6095     to->right |= left->right;
6096   }
6097   else {
6098     to->right |= (left->right & ANCR_PREC_READ_NOT);
6099   }
6100 }
6101 
6102 static int
is_left(int a)6103 is_left(int a)
6104 {
6105   if (a == ANCR_END_BUF  || a == ANCR_SEMI_END_BUF ||
6106       a == ANCR_END_LINE || a == ANCR_PREC_READ || a == ANCR_PREC_READ_NOT)
6107     return 0;
6108 
6109   return 1;
6110 }
6111 
6112 static int
is_set_opt_anc_info(OptAnc * to,int anc)6113 is_set_opt_anc_info(OptAnc* to, int anc)
6114 {
6115   if ((to->left & anc) != 0) return 1;
6116 
6117   return ((to->right & anc) != 0 ? 1 : 0);
6118 }
6119 
6120 static void
add_opt_anc_info(OptAnc * to,int anc)6121 add_opt_anc_info(OptAnc* to, int anc)
6122 {
6123   if (is_left(anc))
6124     to->left |= anc;
6125   else
6126     to->right |= anc;
6127 }
6128 
6129 static void
remove_opt_anc_info(OptAnc * to,int anc)6130 remove_opt_anc_info(OptAnc* to, int anc)
6131 {
6132   if (is_left(anc))
6133     to->left &= ~anc;
6134   else
6135     to->right &= ~anc;
6136 }
6137 
6138 static void
alt_merge_opt_anc_info(OptAnc * to,OptAnc * add)6139 alt_merge_opt_anc_info(OptAnc* to, OptAnc* add)
6140 {
6141   to->left  &= add->left;
6142   to->right &= add->right;
6143 }
6144 
6145 static int
is_full_opt_exact(OptStr * e)6146 is_full_opt_exact(OptStr* e)
6147 {
6148   return e->len >= OPT_EXACT_MAXLEN;
6149 }
6150 
6151 static void
clear_opt_exact(OptStr * e)6152 clear_opt_exact(OptStr* e)
6153 {
6154   mml_clear(&e->mm);
6155   clear_opt_anc_info(&e->anc);
6156   e->reach_end = 0;
6157   e->len       = 0;
6158   e->s[0]      = '\0';
6159 }
6160 
6161 static void
copy_opt_exact(OptStr * to,OptStr * from)6162 copy_opt_exact(OptStr* to, OptStr* from)
6163 {
6164   *to = *from;
6165 }
6166 
6167 static int
concat_opt_exact(OptStr * to,OptStr * add,OnigEncoding enc)6168 concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc)
6169 {
6170   int i, j, len, r;
6171   UChar *p, *end;
6172   OptAnc tanc;
6173 
6174   r = 0;
6175   p = add->s;
6176   end = p + add->len;
6177   for (i = to->len; p < end; ) {
6178     len = enclen(enc, p);
6179     if (i + len > OPT_EXACT_MAXLEN) {
6180       r = 1; /* 1:full */
6181       break;
6182     }
6183     for (j = 0; j < len && p < end; j++)
6184       to->s[i++] = *p++;
6185   }
6186 
6187   to->len = i;
6188   to->reach_end = (p == end ? add->reach_end : 0);
6189 
6190   concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
6191   if (! to->reach_end) tanc.right = 0;
6192   copy_opt_anc_info(&to->anc, &tanc);
6193 
6194   return r;
6195 }
6196 
6197 static void
concat_opt_exact_str(OptStr * to,UChar * s,UChar * end,OnigEncoding enc)6198 concat_opt_exact_str(OptStr* to, UChar* s, UChar* end, OnigEncoding enc)
6199 {
6200   int i, j, len;
6201   UChar *p;
6202 
6203   for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
6204     len = enclen(enc, p);
6205     if (i + len > OPT_EXACT_MAXLEN) break;
6206     for (j = 0; j < len && p < end; j++)
6207       to->s[i++] = *p++;
6208   }
6209 
6210   to->len = i;
6211 
6212   if (p >= end)
6213     to->reach_end = 1;
6214 }
6215 
6216 static void
alt_merge_opt_exact(OptStr * to,OptStr * add,OptEnv * env)6217 alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env)
6218 {
6219   int i, j, len;
6220 
6221   if (add->len == 0 || to->len == 0) {
6222     clear_opt_exact(to);
6223     return ;
6224   }
6225 
6226   if (! mml_is_equal(&to->mm, &add->mm)) {
6227     clear_opt_exact(to);
6228     return ;
6229   }
6230 
6231   for (i = 0; i < to->len && i < add->len; ) {
6232     if (to->s[i] != add->s[i]) break;
6233     len = enclen(env->enc, to->s + i);
6234 
6235     for (j = 1; j < len; j++) {
6236       if (to->s[i+j] != add->s[i+j]) break;
6237     }
6238     if (j < len) break;
6239     i += len;
6240   }
6241 
6242   if (! add->reach_end || i < add->len || i < to->len) {
6243     to->reach_end = 0;
6244   }
6245   to->len = i;
6246 
6247   alt_merge_opt_anc_info(&to->anc, &add->anc);
6248   if (! to->reach_end) to->anc.right = 0;
6249 }
6250 
6251 static void
select_opt_exact(OnigEncoding enc,OptStr * now,OptStr * alt)6252 select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt)
6253 {
6254   int vn, va;
6255 
6256   vn = now->len;
6257   va = alt->len;
6258 
6259   if (va == 0) {
6260     return ;
6261   }
6262   else if (vn == 0) {
6263     copy_opt_exact(now, alt);
6264     return ;
6265   }
6266   else if (vn <= 2 && va <= 2) {
6267     /* ByteValTable[x] is big value --> low price */
6268     va = map_position_value(enc, now->s[0]);
6269     vn = map_position_value(enc, alt->s[0]);
6270 
6271     if (now->len > 1) vn += 5;
6272     if (alt->len > 1) va += 5;
6273   }
6274 
6275   vn *= 2;
6276   va *= 2;
6277 
6278   if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0)
6279     copy_opt_exact(now, alt);
6280 }
6281 
6282 static void
clear_opt_map(OptMap * map)6283 clear_opt_map(OptMap* map)
6284 {
6285   static const OptMap clean_info = {
6286     {0, 0}, {0, 0}, 0,
6287     {
6288       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6289       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6290       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6291       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6292       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6293       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6294       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6295       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6296       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6297       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6298       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6299       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6300       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6301       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6302       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6303       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6304     }
6305   };
6306 
6307   xmemcpy(map, &clean_info, sizeof(OptMap));
6308 }
6309 
6310 static void
copy_opt_map(OptMap * to,OptMap * from)6311 copy_opt_map(OptMap* to, OptMap* from)
6312 {
6313   *to = *from;
6314 }
6315 
6316 static void
add_char_opt_map(OptMap * m,UChar c,OnigEncoding enc)6317 add_char_opt_map(OptMap* m, UChar c, OnigEncoding enc)
6318 {
6319   if (m->map[c] == 0) {
6320     m->map[c] = 1;
6321     m->value += map_position_value(enc, c);
6322   }
6323 }
6324 
6325 static void
select_opt_map(OptMap * now,OptMap * alt)6326 select_opt_map(OptMap* now, OptMap* alt)
6327 {
6328   static int z = 1<<15; /* 32768: something big value */
6329 
6330   int vn, va;
6331 
6332   if (alt->value == 0) return ;
6333   if (now->value == 0) {
6334     copy_opt_map(now, alt);
6335     return ;
6336   }
6337 
6338   vn = z / now->value;
6339   va = z / alt->value;
6340   if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0)
6341     copy_opt_map(now, alt);
6342 }
6343 
6344 static int
comp_opt_exact_or_map(OptStr * e,OptMap * m)6345 comp_opt_exact_or_map(OptStr* e, OptMap* m)
6346 {
6347 #define COMP_EM_BASE  20
6348   int ae, am;
6349   int case_value;
6350 
6351   if (m->value <= 0) return -1;
6352 
6353   case_value = 3;
6354   ae = COMP_EM_BASE * e->len * case_value;
6355   am = COMP_EM_BASE * 5 * 2 / m->value;
6356   return comp_distance_value(&e->mm, &m->mm, ae, am);
6357 }
6358 
6359 static void
alt_merge_opt_map(OnigEncoding enc,OptMap * to,OptMap * add)6360 alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add)
6361 {
6362   int i, val;
6363 
6364   /* if (! mml_is_equal(&to->mm, &add->mm)) return ; */
6365   if (to->value == 0) return ;
6366   if (add->value == 0 || to->mm.max < add->mm.min) {
6367     clear_opt_map(to);
6368     return ;
6369   }
6370 
6371   mml_alt_merge(&to->mm, &add->mm);
6372 
6373   val = 0;
6374   for (i = 0; i < CHAR_MAP_SIZE; i++) {
6375     if (add->map[i])
6376       to->map[i] = 1;
6377 
6378     if (to->map[i])
6379       val += map_position_value(enc, i);
6380   }
6381   to->value = val;
6382 
6383   alt_merge_opt_anc_info(&to->anc, &add->anc);
6384 }
6385 
6386 static void
set_bound_node_opt_info(OptNode * opt,MinMaxLen * plen)6387 set_bound_node_opt_info(OptNode* opt, MinMaxLen* plen)
6388 {
6389   mml_copy(&(opt->sb.mm),  plen);
6390   mml_copy(&(opt->spr.mm), plen);
6391   mml_copy(&(opt->map.mm), plen);
6392 }
6393 
6394 static void
clear_node_opt_info(OptNode * opt)6395 clear_node_opt_info(OptNode* opt)
6396 {
6397   mml_clear(&opt->len);
6398   clear_opt_anc_info(&opt->anc);
6399   clear_opt_exact(&opt->sb);
6400   clear_opt_exact(&opt->sm);
6401   clear_opt_exact(&opt->spr);
6402   clear_opt_map(&opt->map);
6403 }
6404 
6405 static void
copy_node_opt_info(OptNode * to,OptNode * from)6406 copy_node_opt_info(OptNode* to, OptNode* from)
6407 {
6408   *to = *from;
6409 }
6410 
6411 static void
concat_left_node_opt_info(OnigEncoding enc,OptNode * to,OptNode * add)6412 concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add)
6413 {
6414   int sb_reach, sm_reach;
6415   OptAnc tanc;
6416 
6417   concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
6418   copy_opt_anc_info(&to->anc, &tanc);
6419 
6420   if (add->sb.len > 0 && to->len.max == 0) {
6421     concat_opt_anc_info(&tanc, &to->anc, &add->sb.anc, to->len.max, add->len.max);
6422     copy_opt_anc_info(&add->sb.anc, &tanc);
6423   }
6424 
6425   if (add->map.value > 0 && to->len.max == 0) {
6426     if (add->map.mm.max == 0)
6427       add->map.anc.left |= to->anc.left;
6428   }
6429 
6430   sb_reach = to->sb.reach_end;
6431   sm_reach = to->sm.reach_end;
6432 
6433   if (add->len.max != 0)
6434     to->sb.reach_end = to->sm.reach_end = 0;
6435 
6436   if (add->sb.len > 0) {
6437     if (sb_reach) {
6438       concat_opt_exact(&to->sb, &add->sb, enc);
6439       clear_opt_exact(&add->sb);
6440     }
6441     else if (sm_reach) {
6442       concat_opt_exact(&to->sm, &add->sb, enc);
6443       clear_opt_exact(&add->sb);
6444     }
6445   }
6446   select_opt_exact(enc, &to->sm, &add->sb);
6447   select_opt_exact(enc, &to->sm, &add->sm);
6448 
6449   if (to->spr.len > 0) {
6450     if (add->len.max > 0) {
6451       if (to->spr.mm.max == 0)
6452         select_opt_exact(enc, &to->sb, &to->spr);
6453       else
6454         select_opt_exact(enc, &to->sm, &to->spr);
6455     }
6456   }
6457   else if (add->spr.len > 0) {
6458     copy_opt_exact(&to->spr, &add->spr);
6459   }
6460 
6461   select_opt_map(&to->map, &add->map);
6462   mml_add(&to->len, &add->len);
6463 }
6464 
6465 static void
alt_merge_node_opt_info(OptNode * to,OptNode * add,OptEnv * env)6466 alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env)
6467 {
6468   alt_merge_opt_anc_info(&to->anc, &add->anc);
6469   alt_merge_opt_exact(&to->sb,  &add->sb, env);
6470   alt_merge_opt_exact(&to->sm,  &add->sm, env);
6471   alt_merge_opt_exact(&to->spr, &add->spr, env);
6472   alt_merge_opt_map(env->enc, &to->map, &add->map);
6473 
6474   mml_alt_merge(&to->len, &add->len);
6475 }
6476 
6477 
6478 #define MAX_NODE_OPT_INFO_REF_COUNT    5
6479 
6480 static int
optimize_nodes(Node * node,OptNode * opt,OptEnv * env)6481 optimize_nodes(Node* node, OptNode* opt, OptEnv* env)
6482 {
6483   int i;
6484   int r;
6485   OptNode xo;
6486   OnigEncoding enc;
6487 
6488   r = 0;
6489   enc = env->enc;
6490   clear_node_opt_info(opt);
6491   set_bound_node_opt_info(opt, &env->mm);
6492 
6493   switch (NODE_TYPE(node)) {
6494   case NODE_LIST:
6495     {
6496       OptEnv nenv;
6497       Node* nd = node;
6498 
6499       copy_opt_env(&nenv, env);
6500       do {
6501         r = optimize_nodes(NODE_CAR(nd), &xo, &nenv);
6502         if (r == 0) {
6503           mml_add(&nenv.mm, &xo.len);
6504           concat_left_node_opt_info(enc, opt, &xo);
6505         }
6506       } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd)));
6507     }
6508     break;
6509 
6510   case NODE_ALT:
6511     {
6512       Node* nd = node;
6513 
6514       do {
6515         r = optimize_nodes(NODE_CAR(nd), &xo, env);
6516         if (r == 0) {
6517           if (nd == node) copy_node_opt_info(opt, &xo);
6518           else            alt_merge_node_opt_info(opt, &xo, env);
6519         }
6520       } while ((r == 0) && IS_NOT_NULL(nd = NODE_CDR(nd)));
6521     }
6522     break;
6523 
6524   case NODE_STRING:
6525     {
6526       StrNode* sn = STR_(node);
6527       int slen = (int )(sn->end - sn->s);
6528 
6529       concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc);
6530       if (slen > 0) {
6531         add_char_opt_map(&opt->map, *(sn->s), enc);
6532       }
6533       mml_set_min_max(&opt->len, slen, slen);
6534     }
6535     break;
6536 
6537   case NODE_CCLASS:
6538     {
6539       int z;
6540       CClassNode* cc = CCLASS_(node);
6541 
6542       /* no need to check ignore case. (set in tune_tree()) */
6543 
6544       if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
6545         OnigLen min = ONIGENC_MBC_MINLEN(enc);
6546         OnigLen max = ONIGENC_MBC_MAXLEN_DIST(enc);
6547 
6548         mml_set_min_max(&opt->len, min, max);
6549       }
6550       else {
6551         for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
6552           z = BITSET_AT(cc->bs, i);
6553           if ((z && ! IS_NCCLASS_NOT(cc)) || (! z && IS_NCCLASS_NOT(cc))) {
6554             add_char_opt_map(&opt->map, (UChar )i, enc);
6555           }
6556         }
6557         mml_set_min_max(&opt->len, 1, 1);
6558       }
6559     }
6560     break;
6561 
6562   case NODE_CTYPE:
6563     {
6564       int min, max;
6565       int range;
6566 
6567       max = ONIGENC_MBC_MAXLEN_DIST(enc);
6568 
6569       if (max == 1) {
6570         min = 1;
6571 
6572         switch (CTYPE_(node)->ctype) {
6573         case CTYPE_ANYCHAR:
6574           break;
6575 
6576         case ONIGENC_CTYPE_WORD:
6577           range = CTYPE_(node)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
6578           if (CTYPE_(node)->not != 0) {
6579             for (i = 0; i < range; i++) {
6580               if (! ONIGENC_IS_CODE_WORD(enc, i)) {
6581                 add_char_opt_map(&opt->map, (UChar )i, enc);
6582               }
6583             }
6584             for (i = range; i < SINGLE_BYTE_SIZE; i++) {
6585               add_char_opt_map(&opt->map, (UChar )i, enc);
6586             }
6587           }
6588           else {
6589             for (i = 0; i < range; i++) {
6590               if (ONIGENC_IS_CODE_WORD(enc, i)) {
6591                 add_char_opt_map(&opt->map, (UChar )i, enc);
6592               }
6593             }
6594           }
6595           break;
6596         }
6597       }
6598       else {
6599         min = ONIGENC_MBC_MINLEN(enc);
6600       }
6601       mml_set_min_max(&opt->len, min, max);
6602     }
6603     break;
6604 
6605   case NODE_ANCHOR:
6606     switch (ANCHOR_(node)->type) {
6607     case ANCR_BEGIN_BUF:
6608     case ANCR_BEGIN_POSITION:
6609     case ANCR_BEGIN_LINE:
6610     case ANCR_END_BUF:
6611     case ANCR_SEMI_END_BUF:
6612     case ANCR_END_LINE:
6613     case ANCR_PREC_READ_NOT:
6614     case ANCR_LOOK_BEHIND:
6615       add_opt_anc_info(&opt->anc, ANCHOR_(node)->type);
6616       break;
6617 
6618     case ANCR_PREC_READ:
6619       {
6620         r = optimize_nodes(NODE_BODY(node), &xo, env);
6621         if (r == 0) {
6622           if (xo.sb.len > 0)
6623             copy_opt_exact(&opt->spr, &xo.sb);
6624           else if (xo.sm.len > 0)
6625             copy_opt_exact(&opt->spr, &xo.sm);
6626 
6627           opt->spr.reach_end = 0;
6628 
6629           if (xo.map.value > 0)
6630             copy_opt_map(&opt->map, &xo.map);
6631         }
6632       }
6633       break;
6634 
6635     case ANCR_LOOK_BEHIND_NOT:
6636       break;
6637     }
6638     break;
6639 
6640   case NODE_BACKREF:
6641     if (! NODE_IS_CHECKER(node)) {
6642       OnigLen min, max;
6643 
6644       min = node_min_byte_len(node, env->scan_env);
6645       max = node_max_byte_len(node, env->scan_env);
6646       mml_set_min_max(&opt->len, min, max);
6647     }
6648     break;
6649 
6650 #ifdef USE_CALL
6651   case NODE_CALL:
6652     if (NODE_IS_RECURSION(node))
6653       mml_set_min_max(&opt->len, 0, INFINITE_LEN);
6654     else {
6655       r = optimize_nodes(NODE_BODY(node), opt, env);
6656     }
6657     break;
6658 #endif
6659 
6660   case NODE_QUANT:
6661     {
6662       OnigLen min, max;
6663       QuantNode* qn = QUANT_(node);
6664 
6665       /* Issue #175
6666          ex. /\g<1>{0}(?<=|())/
6667 
6668          Empty and unused nodes in look-behind is removed in
6669          tune_look_behind().
6670          Called group nodes are assigned to be not called if the caller side is
6671          inside of zero-repetition.
6672          As a result, the nodes are considered unused.
6673        */
6674       if (qn->upper == 0) {
6675         mml_set_min_max(&opt->len, 0, 0);
6676         break;
6677       }
6678 
6679       r = optimize_nodes(NODE_BODY(node), &xo, env);
6680       if (r != 0) break;
6681 
6682       if (qn->lower > 0) {
6683         copy_node_opt_info(opt, &xo);
6684         if (xo.sb.len > 0) {
6685           if (xo.sb.reach_end) {
6686             for (i = 2; i <= qn->lower && ! is_full_opt_exact(&opt->sb); i++) {
6687               int rc = concat_opt_exact(&opt->sb, &xo.sb, enc);
6688               if (rc > 0) break;
6689             }
6690             if (i < qn->lower) opt->sb.reach_end = 0;
6691           }
6692         }
6693 
6694         if (qn->lower != qn->upper) {
6695           opt->sb.reach_end = 0;
6696           opt->sm.reach_end = 0;
6697         }
6698         if (qn->lower > 1)
6699           opt->sm.reach_end = 0;
6700       }
6701 
6702       if (IS_INFINITE_REPEAT(qn->upper)) {
6703         if (env->mm.max == 0 &&
6704             NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) {
6705           if (NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)))
6706             add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML);
6707           else
6708             add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF);
6709         }
6710 
6711         max = (xo.len.max > 0 ? INFINITE_LEN : 0);
6712       }
6713       else {
6714         max = distance_multiply(xo.len.max, qn->upper);
6715       }
6716 
6717       min = distance_multiply(xo.len.min, qn->lower);
6718       mml_set_min_max(&opt->len, min, max);
6719     }
6720     break;
6721 
6722   case NODE_BAG:
6723     {
6724       BagNode* en = BAG_(node);
6725 
6726       switch (en->type) {
6727       case BAG_STOP_BACKTRACK:
6728       case BAG_OPTION:
6729         r = optimize_nodes(NODE_BODY(node), opt, env);
6730         break;
6731 
6732       case BAG_MEMORY:
6733 #ifdef USE_CALL
6734         en->opt_count++;
6735         if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
6736           OnigLen min, max;
6737 
6738           min = 0;
6739           max = INFINITE_LEN;
6740           if (NODE_IS_FIXED_MIN(node)) min = en->min_len;
6741           if (NODE_IS_FIXED_MAX(node)) max = en->max_len;
6742           mml_set_min_max(&opt->len, min, max);
6743         }
6744         else
6745 #endif
6746           {
6747             r = optimize_nodes(NODE_BODY(node), opt, env);
6748             if (is_set_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK)) {
6749               if (MEM_STATUS_AT0(env->scan_env->backrefed_mem, en->m.regnum))
6750                 remove_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK);
6751             }
6752           }
6753         break;
6754 
6755       case BAG_IF_ELSE:
6756         {
6757           OptEnv nenv;
6758 
6759           copy_opt_env(&nenv, env);
6760           r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv);
6761           if (r == 0) {
6762             mml_add(&nenv.mm, &xo.len);
6763             concat_left_node_opt_info(enc, opt, &xo);
6764             if (IS_NOT_NULL(en->te.Then)) {
6765               r = optimize_nodes(en->te.Then, &xo, &nenv);
6766               if (r == 0) {
6767                 concat_left_node_opt_info(enc, opt, &xo);
6768               }
6769             }
6770 
6771             if (IS_NOT_NULL(en->te.Else)) {
6772               r = optimize_nodes(en->te.Else, &xo, env);
6773               if (r == 0)
6774                 alt_merge_node_opt_info(opt, &xo, env);
6775             }
6776           }
6777         }
6778         break;
6779       }
6780     }
6781     break;
6782 
6783   case NODE_GIMMICK:
6784     break;
6785 
6786   default:
6787 #ifdef ONIG_DEBUG
6788     fprintf(DBGFP, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node));
6789 #endif
6790     r = ONIGERR_TYPE_BUG;
6791     break;
6792   }
6793 
6794   return r;
6795 }
6796 
6797 static int
set_optimize_exact(regex_t * reg,OptStr * e)6798 set_optimize_exact(regex_t* reg, OptStr* e)
6799 {
6800   int r;
6801   int allow_reverse;
6802 
6803   if (e->len == 0) return 0;
6804 
6805   reg->exact = (UChar* )xmalloc(e->len);
6806   CHECK_NULL_RETURN_MEMERR(reg->exact);
6807   xmemcpy(reg->exact, e->s, e->len);
6808   reg->exact_end = reg->exact + e->len;
6809 
6810   allow_reverse =
6811     ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
6812 
6813   if (e->len >= 2 || (e->len >= 1 && allow_reverse)) {
6814     r = set_sunday_quick_search_or_bmh_skip_table(reg, 0,
6815                                                   reg->exact, reg->exact_end,
6816                                                   reg->map, &(reg->map_offset));
6817     if (r != 0) return r;
6818 
6819     reg->optimize = (allow_reverse != 0
6820                      ? OPTIMIZE_STR_FAST
6821                      : OPTIMIZE_STR_FAST_STEP_FORWARD);
6822   }
6823   else {
6824     reg->optimize = OPTIMIZE_STR;
6825   }
6826 
6827   reg->dist_min = e->mm.min;
6828   reg->dist_max = e->mm.max;
6829 
6830   if (reg->dist_min != INFINITE_LEN) {
6831     int n = (int )(reg->exact_end - reg->exact);
6832     reg->threshold_len = reg->dist_min + n;
6833   }
6834 
6835   return 0;
6836 }
6837 
6838 static void
set_optimize_map(regex_t * reg,OptMap * m)6839 set_optimize_map(regex_t* reg, OptMap* m)
6840 {
6841   int i;
6842 
6843   for (i = 0; i < CHAR_MAP_SIZE; i++)
6844     reg->map[i] = m->map[i];
6845 
6846   reg->optimize   = OPTIMIZE_MAP;
6847   reg->dist_min   = m->mm.min;
6848   reg->dist_max   = m->mm.max;
6849 
6850   if (reg->dist_min != INFINITE_LEN) {
6851     reg->threshold_len = reg->dist_min + ONIGENC_MBC_MINLEN(reg->enc);
6852   }
6853 }
6854 
6855 static void
set_sub_anchor(regex_t * reg,OptAnc * anc)6856 set_sub_anchor(regex_t* reg, OptAnc* anc)
6857 {
6858   reg->sub_anchor |= anc->left  & ANCR_BEGIN_LINE;
6859   reg->sub_anchor |= anc->right & ANCR_END_LINE;
6860 }
6861 
6862 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6863 static void print_optimize_info(FILE* f, regex_t* reg);
6864 #endif
6865 
6866 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)6867 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
6868 {
6869   int r;
6870   OptNode opt;
6871   OptEnv env;
6872 
6873   env.enc            = reg->enc;
6874   env.case_fold_flag = reg->case_fold_flag;
6875   env.scan_env       = scan_env;
6876   mml_clear(&env.mm);
6877 
6878   r = optimize_nodes(node, &opt, &env);
6879   if (r != 0) return r;
6880 
6881   reg->anchor = opt.anc.left & (ANCR_BEGIN_BUF |
6882         ANCR_BEGIN_POSITION | ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML |
6883         ANCR_LOOK_BEHIND);
6884 
6885   if ((opt.anc.left & (ANCR_LOOK_BEHIND | ANCR_PREC_READ_NOT)) != 0)
6886     reg->anchor &= ~ANCR_ANYCHAR_INF_ML;
6887 
6888   reg->anchor |= opt.anc.right & (ANCR_END_BUF | ANCR_SEMI_END_BUF |
6889                                   ANCR_PREC_READ_NOT);
6890 
6891   if (reg->anchor & (ANCR_END_BUF | ANCR_SEMI_END_BUF)) {
6892     reg->anc_dist_min = opt.len.min;
6893     reg->anc_dist_max = opt.len.max;
6894   }
6895 
6896   if (opt.sb.len > 0 || opt.sm.len > 0) {
6897     select_opt_exact(reg->enc, &opt.sb, &opt.sm);
6898     if (opt.map.value > 0 && comp_opt_exact_or_map(&opt.sb, &opt.map) > 0) {
6899       goto set_map;
6900     }
6901     else {
6902       r = set_optimize_exact(reg, &opt.sb);
6903       set_sub_anchor(reg, &opt.sb.anc);
6904     }
6905   }
6906   else if (opt.map.value > 0) {
6907   set_map:
6908     set_optimize_map(reg, &opt.map);
6909     set_sub_anchor(reg, &opt.map.anc);
6910   }
6911   else {
6912     reg->sub_anchor |= opt.anc.left & ANCR_BEGIN_LINE;
6913     if (opt.len.max == 0)
6914       reg->sub_anchor |= opt.anc.right & ANCR_END_LINE;
6915   }
6916 
6917 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6918   print_optimize_info(DBGFP, reg);
6919 #endif
6920   return r;
6921 }
6922 
6923 static void
clear_optimize_info(regex_t * reg)6924 clear_optimize_info(regex_t* reg)
6925 {
6926   reg->optimize      = OPTIMIZE_NONE;
6927   reg->anchor        = 0;
6928   reg->anc_dist_min  = 0;
6929   reg->anc_dist_max  = 0;
6930   reg->sub_anchor    = 0;
6931   reg->exact_end     = (UChar* )NULL;
6932   reg->map_offset    = 0;
6933   reg->threshold_len = 0;
6934   if (IS_NOT_NULL(reg->exact)) {
6935     xfree(reg->exact);
6936     reg->exact = (UChar* )NULL;
6937   }
6938 }
6939 
6940 #ifdef ONIG_DEBUG
6941 
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)6942 static void print_enc_string(FILE* fp, OnigEncoding enc,
6943                              const UChar *s, const UChar *end)
6944 {
6945   if (ONIGENC_MBC_MINLEN(enc) > 1) {
6946     const UChar *p;
6947     OnigCodePoint code;
6948 
6949     p = s;
6950     while (p < end) {
6951       code = ONIGENC_MBC_TO_CODE(enc, p, end);
6952       if (code >= 0x80) {
6953         fprintf(fp, " 0x%04x ", (int )code);
6954       }
6955       else {
6956         fputc((int )code, fp);
6957       }
6958 
6959       p += enclen(enc, p);
6960     }
6961   }
6962   else {
6963     while (s < end) {
6964       fputc((int )*s, fp);
6965       s++;
6966     }
6967   }
6968 
6969   fprintf(fp, "/\n");
6970 }
6971 
6972 #endif /* ONIG_DEBUG */
6973 
6974 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6975 
6976 static void
print_distance_range(FILE * f,OnigLen a,OnigLen b)6977 print_distance_range(FILE* f, OnigLen a, OnigLen b)
6978 {
6979   if (a == INFINITE_LEN)
6980     fputs("inf", f);
6981   else
6982     fprintf(f, "(%u)", a);
6983 
6984   fputs("-", f);
6985 
6986   if (b == INFINITE_LEN)
6987     fputs("inf", f);
6988   else
6989     fprintf(f, "(%u)", b);
6990 }
6991 
6992 static void
print_anchor(FILE * f,int anchor)6993 print_anchor(FILE* f, int anchor)
6994 {
6995   int q = 0;
6996 
6997   fprintf(f, "[");
6998 
6999   if (anchor & ANCR_BEGIN_BUF) {
7000     fprintf(f, "begin-buf");
7001     q = 1;
7002   }
7003   if (anchor & ANCR_BEGIN_LINE) {
7004     if (q) fprintf(f, ", ");
7005     q = 1;
7006     fprintf(f, "begin-line");
7007   }
7008   if (anchor & ANCR_BEGIN_POSITION) {
7009     if (q) fprintf(f, ", ");
7010     q = 1;
7011     fprintf(f, "begin-pos");
7012   }
7013   if (anchor & ANCR_END_BUF) {
7014     if (q) fprintf(f, ", ");
7015     q = 1;
7016     fprintf(f, "end-buf");
7017   }
7018   if (anchor & ANCR_SEMI_END_BUF) {
7019     if (q) fprintf(f, ", ");
7020     q = 1;
7021     fprintf(f, "semi-end-buf");
7022   }
7023   if (anchor & ANCR_END_LINE) {
7024     if (q) fprintf(f, ", ");
7025     q = 1;
7026     fprintf(f, "end-line");
7027   }
7028   if (anchor & ANCR_ANYCHAR_INF) {
7029     if (q) fprintf(f, ", ");
7030     q = 1;
7031     fprintf(f, "anychar-inf");
7032   }
7033   if (anchor & ANCR_ANYCHAR_INF_ML) {
7034     if (q) fprintf(f, ", ");
7035     fprintf(f, "anychar-inf-ml");
7036   }
7037 
7038   fprintf(f, "]");
7039 }
7040 
7041 static void
print_optimize_info(FILE * f,regex_t * reg)7042 print_optimize_info(FILE* f, regex_t* reg)
7043 {
7044   static const char* on[] =
7045     { "NONE", "STR", "STR_FAST", "STR_FAST_STEP_FORWARD", "MAP" };
7046 
7047   fprintf(f, "optimize: %s\n", on[reg->optimize]);
7048   fprintf(f, "  anchor: "); print_anchor(f, reg->anchor);
7049   if ((reg->anchor & ANCR_END_BUF_MASK) != 0)
7050     print_distance_range(f, reg->anc_dist_min, reg->anc_dist_max);
7051   fprintf(f, "\n");
7052 
7053   if (reg->optimize) {
7054     fprintf(f, "  sub anchor: "); print_anchor(f, reg->sub_anchor);
7055     fprintf(f, "\n");
7056   }
7057   fprintf(f, "\n");
7058 
7059   if (reg->exact) {
7060     UChar *p;
7061     fprintf(f, "exact: [");
7062     for (p = reg->exact; p < reg->exact_end; p++) {
7063       fputc(*p, f);
7064     }
7065     fprintf(f, "]: length: %ld, dmin: %u, dmax: %u\n",
7066             (reg->exact_end - reg->exact), reg->dist_min, reg->dist_max);
7067   }
7068   else if (reg->optimize & OPTIMIZE_MAP) {
7069     int c, i, n = 0;
7070 
7071     for (i = 0; i < CHAR_MAP_SIZE; i++)
7072       if (reg->map[i]) n++;
7073 
7074     fprintf(f, "map: n=%d, dmin: %u, dmax: %u\n",
7075             n, reg->dist_min, reg->dist_max);
7076     if (n > 0) {
7077       c = 0;
7078       fputc('[', f);
7079       for (i = 0; i < CHAR_MAP_SIZE; i++) {
7080         if (reg->map[i] != 0) {
7081           if (c > 0)  fputs(", ", f);
7082           c++;
7083           if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
7084               ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
7085             fputc(i, f);
7086           else
7087             fprintf(f, "%d", i);
7088         }
7089       }
7090       fprintf(f, "]\n");
7091     }
7092   }
7093 }
7094 #endif
7095 
7096 
7097 extern RegexExt*
onig_get_regex_ext(regex_t * reg)7098 onig_get_regex_ext(regex_t* reg)
7099 {
7100   if (IS_NULL(reg->extp)) {
7101     RegexExt* ext = (RegexExt* )xmalloc(sizeof(*ext));
7102     if (IS_NULL(ext)) return 0;
7103 
7104     ext->pattern      = 0;
7105     ext->pattern_end  = 0;
7106 #ifdef USE_CALLOUT
7107     ext->tag_table    = 0;
7108     ext->callout_num  = 0;
7109     ext->callout_list_alloc = 0;
7110     ext->callout_list = 0;
7111 #endif
7112 
7113     reg->extp = ext;
7114   }
7115 
7116   return reg->extp;
7117 }
7118 
7119 static void
free_regex_ext(RegexExt * ext)7120 free_regex_ext(RegexExt* ext)
7121 {
7122   if (IS_NOT_NULL(ext)) {
7123     if (IS_NOT_NULL(ext->pattern))
7124       xfree((void* )ext->pattern);
7125 
7126 #ifdef USE_CALLOUT
7127     if (IS_NOT_NULL(ext->tag_table))
7128       onig_callout_tag_table_free(ext->tag_table);
7129 
7130     if (IS_NOT_NULL(ext->callout_list))
7131       onig_free_reg_callout_list(ext->callout_num, ext->callout_list);
7132 #endif
7133 
7134     xfree(ext);
7135   }
7136 }
7137 
7138 extern int
onig_ext_set_pattern(regex_t * reg,const UChar * pattern,const UChar * pattern_end)7139 onig_ext_set_pattern(regex_t* reg, const UChar* pattern, const UChar* pattern_end)
7140 {
7141   RegexExt* ext;
7142   UChar* s;
7143 
7144   ext = onig_get_regex_ext(reg);
7145   CHECK_NULL_RETURN_MEMERR(ext);
7146 
7147   s = onigenc_strdup(reg->enc, pattern, pattern_end);
7148   CHECK_NULL_RETURN_MEMERR(s);
7149 
7150   ext->pattern     = s;
7151   ext->pattern_end = s + (pattern_end - pattern);
7152 
7153   return ONIG_NORMAL;
7154 }
7155 
7156 extern void
onig_free_body(regex_t * reg)7157 onig_free_body(regex_t* reg)
7158 {
7159   if (IS_NOT_NULL(reg)) {
7160     ops_free(reg);
7161     if (IS_NOT_NULL(reg->string_pool)) {
7162       xfree(reg->string_pool);
7163       reg->string_pool_end = reg->string_pool = 0;
7164     }
7165     if (IS_NOT_NULL(reg->exact))            xfree(reg->exact);
7166     if (IS_NOT_NULL(reg->repeat_range))     xfree(reg->repeat_range);
7167     if (IS_NOT_NULL(reg->extp)) {
7168       free_regex_ext(reg->extp);
7169       reg->extp = 0;
7170     }
7171 
7172     onig_names_free(reg);
7173   }
7174 }
7175 
7176 extern void
onig_free(regex_t * reg)7177 onig_free(regex_t* reg)
7178 {
7179   if (IS_NOT_NULL(reg)) {
7180     onig_free_body(reg);
7181     xfree(reg);
7182   }
7183 }
7184 
7185 
7186 #ifdef ONIG_DEBUG_PARSE
7187 static void print_tree P_((FILE* f, Node* node));
7188 #endif
7189 
7190 extern int onig_init_for_match_at(regex_t* reg);
7191 
7192 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)7193 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
7194              OnigErrorInfo* einfo)
7195 {
7196   int r;
7197   Node*  root;
7198   ScanEnv  scan_env;
7199 #ifdef USE_CALL
7200   UnsetAddrList  uslist = {0};
7201 #endif
7202 
7203   root = 0;
7204   if (IS_NOT_NULL(einfo)) {
7205     einfo->enc = reg->enc;
7206     einfo->par = (UChar* )NULL;
7207   }
7208 
7209 #ifdef ONIG_DEBUG
7210   fprintf(DBGFP, "\nPATTERN: /");
7211   print_enc_string(DBGFP, reg->enc, pattern, pattern_end);
7212 #endif
7213 
7214   if (reg->ops_alloc == 0) {
7215     r = ops_init(reg, OPS_INIT_SIZE);
7216     if (r != 0) goto end;
7217   }
7218   else
7219     reg->ops_used = 0;
7220 
7221   reg->string_pool        = 0;
7222   reg->string_pool_end    = 0;
7223   reg->num_mem            = 0;
7224   reg->num_repeat         = 0;
7225   reg->num_empty_check    = 0;
7226   reg->repeat_range_alloc = 0;
7227   reg->repeat_range       = (RepeatRange* )NULL;
7228   reg->empty_status_mem   = 0;
7229 
7230   r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env);
7231   if (r != 0) goto err;
7232 
7233   r = reduce_string_list(root);
7234   if (r != 0) goto err;
7235 
7236   /* mixed use named group and no-named group */
7237   if (scan_env.num_named > 0 &&
7238       IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
7239       ! OPTON_CAPTURE_GROUP(reg->options)) {
7240     if (scan_env.num_named != scan_env.num_mem)
7241       r = disable_noname_group_capture(&root, reg, &scan_env);
7242     else
7243       r = numbered_ref_check(root);
7244 
7245     if (r != 0) goto err;
7246   }
7247 
7248   r = check_backrefs(root, &scan_env);
7249   if (r != 0) goto err;
7250 
7251 #ifdef USE_CALL
7252   if (scan_env.num_call > 0) {
7253     r = unset_addr_list_init(&uslist, scan_env.num_call);
7254     if (r != 0) goto err;
7255     scan_env.unset_addr_list = &uslist;
7256     r = tune_call(root, &scan_env, 0);
7257     if (r != 0) goto err_unset;
7258     r = tune_call2(root);
7259     if (r != 0) goto err_unset;
7260     r = recursive_call_check_trav(root, &scan_env, 0);
7261     if (r  < 0) goto err_unset;
7262     r = infinite_recursive_call_check_trav(root, &scan_env);
7263     if (r != 0) goto err_unset;
7264 
7265     tune_called_state(root, 0);
7266   }
7267 
7268   reg->num_call = scan_env.num_call;
7269 #endif
7270 
7271 #ifdef ONIG_DEBUG_PARSE
7272   fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth);
7273   fprintf(DBGFP, "TREE (parsed)\n");
7274   print_tree(DBGFP, root);
7275   fprintf(DBGFP, "\n");
7276 #endif
7277 
7278   r = tune_tree(root, reg, 0, &scan_env);
7279   if (r != 0) goto err_unset;
7280 
7281   if (scan_env.backref_num != 0) {
7282     set_parent_node_trav(root, NULL_NODE);
7283     r = set_empty_repeat_node_trav(root, NULL_NODE, &scan_env);
7284     if (r != 0) goto err_unset;
7285     set_empty_status_check_trav(root, &scan_env);
7286   }
7287 
7288 #ifdef ONIG_DEBUG_PARSE
7289   fprintf(DBGFP, "TREE (after tune)\n");
7290   print_tree(DBGFP, root);
7291   fprintf(DBGFP, "\n");
7292 #endif
7293 
7294   reg->capture_history = scan_env.cap_history;
7295   reg->push_mem_start  = scan_env.backtrack_mem | scan_env.cap_history;
7296 
7297 #ifdef USE_CALLOUT
7298   if (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) {
7299     reg->push_mem_end = reg->push_mem_start;
7300   }
7301   else {
7302     if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start))
7303       reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history;
7304     else
7305       reg->push_mem_end = reg->push_mem_start &
7306                         (scan_env.backrefed_mem | scan_env.cap_history);
7307   }
7308 #else
7309   if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start))
7310     reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history;
7311   else
7312     reg->push_mem_end = reg->push_mem_start &
7313                       (scan_env.backrefed_mem | scan_env.cap_history);
7314 #endif
7315 
7316   clear_optimize_info(reg);
7317 #ifndef ONIG_DONT_OPTIMIZE
7318   r = set_optimize_info_from_tree(root, reg, &scan_env);
7319   if (r != 0) goto err_unset;
7320 #endif
7321 
7322   if (IS_NOT_NULL(scan_env.mem_env_dynamic)) {
7323     xfree(scan_env.mem_env_dynamic);
7324     scan_env.mem_env_dynamic = (MemEnv* )NULL;
7325   }
7326 
7327   r = compile_tree(root, reg, &scan_env);
7328   if (r == 0) {
7329     if (scan_env.keep_num > 0) {
7330       r = add_op(reg, OP_UPDATE_VAR);
7331       if (r != 0) goto err;
7332 
7333       COP(reg)->update_var.type = UPDATE_VAR_KEEP_FROM_STACK_LAST;
7334       COP(reg)->update_var.id   = 0; /* not used */
7335       COP(reg)->update_var.clear = FALSE;
7336     }
7337 
7338     r = add_op(reg, OP_END);
7339     if (r != 0) goto err;
7340 
7341 #ifdef USE_CALL
7342     if (scan_env.num_call > 0) {
7343       r = fix_unset_addr_list(&uslist, reg);
7344       unset_addr_list_end(&uslist);
7345       if (r != 0) goto err;
7346     }
7347 #endif
7348 
7349     set_addr_in_repeat_range(reg);
7350 
7351     if ((reg->push_mem_end != 0)
7352 #ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR
7353         || (reg->num_repeat      != 0)
7354         || (reg->num_empty_check != 0)
7355 #endif
7356 #ifdef USE_CALLOUT
7357         || (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0)
7358 #endif
7359 #ifdef USE_CALL
7360         || scan_env.num_call > 0
7361 #endif
7362         )
7363       reg->stack_pop_level = STACK_POP_LEVEL_ALL;
7364     else {
7365       if (reg->push_mem_start != 0)
7366         reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
7367       else
7368         reg->stack_pop_level = STACK_POP_LEVEL_FREE;
7369     }
7370 
7371     r = ops_make_string_pool(reg);
7372     if (r != 0) goto err;
7373   }
7374 #ifdef USE_CALL
7375   else if (scan_env.num_call > 0) {
7376     unset_addr_list_end(&uslist);
7377   }
7378 #endif
7379   onig_node_free(root);
7380 
7381 #ifdef ONIG_DEBUG_COMPILE
7382   onig_print_names(DBGFP, reg);
7383   onig_print_compiled_byte_code_list(DBGFP, reg);
7384 #endif
7385 
7386 #ifdef USE_DIRECT_THREADED_CODE
7387   /* opcode -> opaddr */
7388   onig_init_for_match_at(reg);
7389 #endif
7390 
7391  end:
7392   return r;
7393 
7394  err_unset:
7395 #ifdef USE_CALL
7396   if (scan_env.num_call > 0) {
7397     unset_addr_list_end(&uslist);
7398   }
7399 #endif
7400  err:
7401   if (IS_NOT_NULL(scan_env.error)) {
7402     if (IS_NOT_NULL(einfo)) {
7403       einfo->par     = scan_env.error;
7404       einfo->par_end = scan_env.error_end;
7405     }
7406   }
7407 
7408   onig_node_free(root);
7409   if (IS_NOT_NULL(scan_env.mem_env_dynamic))
7410       xfree(scan_env.mem_env_dynamic);
7411   return r;
7412 }
7413 
7414 
7415 static int onig_inited = 0;
7416 
7417 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)7418 onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_flag,
7419               OnigEncoding enc, OnigSyntaxType* syntax)
7420 {
7421   int r;
7422 
7423   xmemset(reg, 0, sizeof(*reg));
7424 
7425   if (onig_inited == 0) {
7426 #if 0
7427     return ONIGERR_LIBRARY_IS_NOT_INITIALIZED;
7428 #else
7429     r = onig_initialize(&enc, 1);
7430     if (r != 0)
7431       return ONIGERR_FAIL_TO_INITIALIZE;
7432 
7433     onig_warning("You didn't call onig_initialize() explicitly");
7434 #endif
7435   }
7436 
7437   if (IS_NULL(reg))
7438     return ONIGERR_INVALID_ARGUMENT;
7439 
7440   if (ONIGENC_IS_UNDEF(enc))
7441     return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
7442 
7443   if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
7444       == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
7445     return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
7446   }
7447 
7448   if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
7449     option |= syntax->options;
7450     option &= ~ONIG_OPTION_SINGLELINE;
7451   }
7452   else
7453     option |= syntax->options;
7454 
7455   (reg)->enc            = enc;
7456   (reg)->options        = option;
7457   (reg)->syntax         = syntax;
7458   (reg)->optimize       = 0;
7459   (reg)->exact          = (UChar* )NULL;
7460   (reg)->extp           = (RegexExt* )NULL;
7461   (reg)->ops            = (Operation* )NULL;
7462   (reg)->ops_curr       = (Operation* )NULL;
7463   (reg)->ops_used       = 0;
7464   (reg)->ops_alloc      = 0;
7465   (reg)->name_table     = (void* )NULL;
7466   (reg)->case_fold_flag = case_fold_flag;
7467   return 0;
7468 }
7469 
7470 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)7471 onig_new_without_alloc(regex_t* reg,
7472                        const UChar* pattern, const UChar* pattern_end,
7473                        OnigOptionType option, OnigEncoding enc,
7474                        OnigSyntaxType* syntax, OnigErrorInfo* einfo)
7475 {
7476   int r;
7477 
7478   r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
7479   if (r != 0) return r;
7480 
7481   r = onig_compile(reg, pattern, pattern_end, einfo);
7482   return r;
7483 }
7484 
7485 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)7486 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
7487          OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
7488          OnigErrorInfo* einfo)
7489 {
7490   int r;
7491 
7492   *reg = (regex_t* )xmalloc(sizeof(regex_t));
7493   if (IS_NULL(*reg)) return ONIGERR_MEMORY;
7494 
7495   r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
7496   if (r != 0) {
7497     xfree(*reg);
7498     *reg = NULL;
7499     return r;
7500   }
7501 
7502   r = onig_compile(*reg, pattern, pattern_end, einfo);
7503   if (r != 0) {
7504     onig_free(*reg);
7505     *reg = NULL;
7506   }
7507   return r;
7508 }
7509 
7510 extern int
onig_initialize(OnigEncoding encodings[],int n)7511 onig_initialize(OnigEncoding encodings[], int n)
7512 {
7513   int i;
7514   int r;
7515 
7516   if (onig_inited != 0)
7517     return 0;
7518 
7519   onigenc_init();
7520 
7521   onig_inited = 1;
7522 
7523   for (i = 0; i < n; i++) {
7524     OnigEncoding enc = encodings[i];
7525     r = onig_initialize_encoding(enc);
7526     if (r != 0)
7527       return r;
7528   }
7529 
7530   return ONIG_NORMAL;
7531 }
7532 
7533 typedef struct EndCallListItem {
7534   struct EndCallListItem* next;
7535   void (*func)(void);
7536 } EndCallListItemType;
7537 
7538 static EndCallListItemType* EndCallTop;
7539 
onig_add_end_call(void (* func)(void))7540 extern void onig_add_end_call(void (*func)(void))
7541 {
7542   EndCallListItemType* item;
7543 
7544   item = (EndCallListItemType* )xmalloc(sizeof(*item));
7545   if (item == 0) return ;
7546 
7547   item->next = EndCallTop;
7548   item->func = func;
7549 
7550   EndCallTop = item;
7551 }
7552 
7553 static void
exec_end_call_list(void)7554 exec_end_call_list(void)
7555 {
7556   EndCallListItemType* prev;
7557   void (*func)(void);
7558 
7559   while (EndCallTop != 0) {
7560     func = EndCallTop->func;
7561     (*func)();
7562 
7563     prev = EndCallTop;
7564     EndCallTop = EndCallTop->next;
7565     xfree(prev);
7566   }
7567 }
7568 
7569 extern int
onig_end(void)7570 onig_end(void)
7571 {
7572   exec_end_call_list();
7573 
7574 #ifdef USE_CALLOUT
7575   onig_global_callout_names_free();
7576 #endif
7577 
7578   onigenc_end();
7579 
7580   onig_inited = 0;
7581 
7582   return 0;
7583 }
7584 
7585 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)7586 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
7587 {
7588   OnigCodePoint n, *data;
7589   OnigCodePoint low, high, x;
7590 
7591   GET_CODE_POINT(n, p);
7592   data = (OnigCodePoint* )p;
7593   data++;
7594 
7595   for (low = 0, high = n; low < high; ) {
7596     x = (low + high) >> 1;
7597     if (code > data[x * 2 + 1])
7598       low = x + 1;
7599     else
7600       high = x;
7601   }
7602 
7603   return ((low < n && code >= data[low * 2]) ? 1 : 0);
7604 }
7605 
7606 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,void * cc_arg)7607 onig_is_code_in_cc_len(int elen, OnigCodePoint code, /* CClassNode* */ void* cc_arg)
7608 {
7609   int found;
7610   CClassNode* cc = (CClassNode* )cc_arg;
7611 
7612   if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
7613     if (IS_NULL(cc->mbuf)) {
7614       found = 0;
7615     }
7616     else {
7617       found = onig_is_in_code_range(cc->mbuf->p, code) != 0;
7618     }
7619   }
7620   else {
7621     found = BITSET_AT(cc->bs, code) != 0;
7622   }
7623 
7624   if (IS_NCCLASS_NOT(cc))
7625     return !found;
7626   else
7627     return found;
7628 }
7629 
7630 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)7631 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
7632 {
7633   int len;
7634 
7635   if (ONIGENC_MBC_MINLEN(enc) > 1) {
7636     len = 2;
7637   }
7638   else {
7639     len = ONIGENC_CODE_TO_MBCLEN(enc, code);
7640     if (len < 0) return 0;
7641   }
7642   return onig_is_code_in_cc_len(len, code, cc);
7643 }
7644 
7645 
7646 #ifdef ONIG_DEBUG_PARSE
7647 
7648 #ifdef USE_CALL
7649 static void
p_string(FILE * f,int len,UChar * s)7650 p_string(FILE* f, int len, UChar* s)
7651 {
7652   fputs(":", f);
7653   while (len-- > 0) { fputc(*s++, f); }
7654 }
7655 #endif
7656 
7657 static void
Indent(FILE * f,int indent)7658 Indent(FILE* f, int indent)
7659 {
7660   int i;
7661   for (i = 0; i < indent; i++) putc(' ', f);
7662 }
7663 
7664 static void
print_indent_tree(FILE * f,Node * node,int indent)7665 print_indent_tree(FILE* f, Node* node, int indent)
7666 {
7667   int i;
7668   NodeType type;
7669   UChar* p;
7670   int add = 3;
7671 
7672   Indent(f, indent);
7673   if (IS_NULL(node)) {
7674     fprintf(f, "ERROR: null node!!!\n");
7675     exit(0);
7676   }
7677 
7678   type = NODE_TYPE(node);
7679   switch (type) {
7680   case NODE_LIST:
7681   case NODE_ALT:
7682     if (type == NODE_LIST)
7683       fprintf(f, "<list:%p>\n", node);
7684     else
7685       fprintf(f, "<alt:%p>\n", node);
7686 
7687     print_indent_tree(f, NODE_CAR(node), indent + add);
7688     while (IS_NOT_NULL(node = NODE_CDR(node))) {
7689       if (NODE_TYPE(node) != type) {
7690         fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NODE_TYPE(node));
7691         exit(0);
7692       }
7693       print_indent_tree(f, NODE_CAR(node), indent + add);
7694     }
7695     break;
7696 
7697   case NODE_STRING:
7698     {
7699       char* str;
7700       char* mode;
7701 
7702       if (NODE_STRING_IS_CRUDE(node))
7703         mode = "-crude";
7704       else if (NODE_IS_IGNORECASE(node))
7705         mode = "-ignorecase";
7706       else
7707         mode = "";
7708 
7709       if (STR_(node)->s == STR_(node)->end)
7710         str = "empty-string";
7711       else
7712         str = "string";
7713 
7714       fprintf(f, "<%s%s:%p>", str, mode, node);
7715       for (p = STR_(node)->s; p < STR_(node)->end; p++) {
7716         if (*p >= 0x20 && *p < 0x7f)
7717           fputc(*p, f);
7718         else {
7719           fprintf(f, " 0x%02x", *p);
7720         }
7721       }
7722     }
7723     break;
7724 
7725   case NODE_CCLASS:
7726     fprintf(f, "<cclass:%p>", node);
7727     if (IS_NCCLASS_NOT(CCLASS_(node))) fputs(" not", f);
7728     if (CCLASS_(node)->mbuf) {
7729       BBuf* bbuf = CCLASS_(node)->mbuf;
7730       for (i = 0; i < bbuf->used; i++) {
7731         if (i > 0) fprintf(f, ",");
7732         fprintf(f, "%0x", bbuf->p[i]);
7733       }
7734     }
7735     break;
7736 
7737   case NODE_CTYPE:
7738     fprintf(f, "<ctype:%p> ", node);
7739     switch (CTYPE_(node)->ctype) {
7740     case CTYPE_ANYCHAR:
7741       fprintf(f, "anychar");
7742       break;
7743 
7744     case ONIGENC_CTYPE_WORD:
7745       if (CTYPE_(node)->not != 0)
7746         fputs("not word", f);
7747       else
7748         fputs("word",     f);
7749 
7750       if (CTYPE_(node)->ascii_mode != 0)
7751         fputs(" (ascii)", f);
7752 
7753       break;
7754 
7755     default:
7756       fprintf(f, "ERROR: undefined ctype.\n");
7757       exit(0);
7758     }
7759     break;
7760 
7761   case NODE_ANCHOR:
7762     fprintf(f, "<anchor:%p> ", node);
7763     switch (ANCHOR_(node)->type) {
7764     case ANCR_BEGIN_BUF:        fputs("begin buf",      f); break;
7765     case ANCR_END_BUF:          fputs("end buf",        f); break;
7766     case ANCR_BEGIN_LINE:       fputs("begin line",     f); break;
7767     case ANCR_END_LINE:         fputs("end line",       f); break;
7768     case ANCR_SEMI_END_BUF:     fputs("semi end buf",   f); break;
7769     case ANCR_BEGIN_POSITION:   fputs("begin position", f); break;
7770 
7771     case ANCR_WORD_BOUNDARY:    fputs("word boundary",     f); break;
7772     case ANCR_NO_WORD_BOUNDARY: fputs("not word boundary", f); break;
7773 #ifdef USE_WORD_BEGIN_END
7774     case ANCR_WORD_BEGIN:       fputs("word begin", f);     break;
7775     case ANCR_WORD_END:         fputs("word end", f);       break;
7776 #endif
7777     case ANCR_TEXT_SEGMENT_BOUNDARY:
7778       fputs("text-segment boundary", f); break;
7779     case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
7780       fputs("no text-segment boundary", f); break;
7781     case ANCR_PREC_READ:
7782       fprintf(f, "prec read\n");
7783       print_indent_tree(f, NODE_BODY(node), indent + add);
7784       break;
7785     case ANCR_PREC_READ_NOT:
7786       fprintf(f, "prec read not\n");
7787       print_indent_tree(f, NODE_BODY(node), indent + add);
7788       break;
7789     case ANCR_LOOK_BEHIND:
7790       fprintf(f, "look behind\n");
7791       print_indent_tree(f, NODE_BODY(node), indent + add);
7792       break;
7793     case ANCR_LOOK_BEHIND_NOT:
7794       fprintf(f, "look behind not\n");
7795       print_indent_tree(f, NODE_BODY(node), indent + add);
7796       break;
7797 
7798     default:
7799       fprintf(f, "ERROR: undefined anchor type.\n");
7800       break;
7801     }
7802     break;
7803 
7804   case NODE_BACKREF:
7805     {
7806       int* p;
7807       BackRefNode* br = BACKREF_(node);
7808       p = BACKREFS_P(br);
7809       fprintf(f, "<backref%s:%p>", NODE_IS_CHECKER(node) ? "-checker" : "", node);
7810       for (i = 0; i < br->back_num; i++) {
7811         if (i > 0) fputs(", ", f);
7812         fprintf(f, "%d", p[i]);
7813       }
7814     }
7815     break;
7816 
7817 #ifdef USE_CALL
7818   case NODE_CALL:
7819     {
7820       CallNode* cn = CALL_(node);
7821       fprintf(f, "<call:%p>", node);
7822       p_string(f, cn->name_end - cn->name, cn->name);
7823     }
7824     break;
7825 #endif
7826 
7827   case NODE_QUANT:
7828     fprintf(f, "<quantifier:%p>{%d,%d}%s%s\n", node,
7829             QUANT_(node)->lower, QUANT_(node)->upper,
7830             (QUANT_(node)->greedy ? "" : "?"),
7831             QUANT_(node)->include_referred == 0 ? "" : " referred");
7832     print_indent_tree(f, NODE_BODY(node), indent + add);
7833     break;
7834 
7835   case NODE_BAG:
7836     fprintf(f, "<bag:%p> ", node);
7837     if (BAG_(node)->type == BAG_IF_ELSE) {
7838       Node* Then;
7839       Node* Else;
7840       BagNode* bn;
7841 
7842       bn = BAG_(node);
7843       fprintf(f, "if-else\n");
7844       print_indent_tree(f, NODE_BODY(node), indent + add);
7845 
7846       Then = bn->te.Then;
7847       Else = bn->te.Else;
7848       if (IS_NULL(Then)) {
7849         Indent(f, indent + add);
7850         fprintf(f, "THEN empty\n");
7851       }
7852       else
7853         print_indent_tree(f, Then, indent + add);
7854 
7855       if (IS_NULL(Else)) {
7856         Indent(f, indent + add);
7857         fprintf(f, "ELSE empty\n");
7858       }
7859       else
7860         print_indent_tree(f, Else, indent + add);
7861 
7862       break;
7863     }
7864 
7865     switch (BAG_(node)->type) {
7866     case BAG_OPTION:
7867       fprintf(f, "option:%d", BAG_(node)->o.options);
7868       break;
7869     case BAG_MEMORY:
7870       fprintf(f, "memory:%d", BAG_(node)->m.regnum);
7871       if (NODE_IS_CALLED(node))
7872         fprintf(f, ", called");
7873       if (NODE_IS_FIXED_ADDR(node))
7874         fprintf(f, ", fixed-addr");
7875       break;
7876     case BAG_STOP_BACKTRACK:
7877       fprintf(f, "stop-bt");
7878       break;
7879     default:
7880       break;
7881     }
7882     fprintf(f, "\n");
7883     print_indent_tree(f, NODE_BODY(node), indent + add);
7884     break;
7885 
7886   case NODE_GIMMICK:
7887     fprintf(f, "<gimmick:%p> ", node);
7888     switch (GIMMICK_(node)->type) {
7889     case GIMMICK_FAIL:
7890       fprintf(f, "fail");
7891       break;
7892     case GIMMICK_SAVE:
7893       fprintf(f, "save:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
7894       break;
7895     case GIMMICK_UPDATE_VAR:
7896       fprintf(f, "update_var:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
7897       break;
7898 #ifdef USE_CALLOUT
7899     case GIMMICK_CALLOUT:
7900       switch (GIMMICK_(node)->detail_type) {
7901       case ONIG_CALLOUT_OF_CONTENTS:
7902         fprintf(f, "callout:contents:%d", GIMMICK_(node)->num);
7903         break;
7904       case ONIG_CALLOUT_OF_NAME:
7905         fprintf(f, "callout:name:%d:%d", GIMMICK_(node)->id, GIMMICK_(node)->num);
7906         break;
7907       }
7908 #endif
7909     }
7910     break;
7911 
7912   default:
7913     fprintf(f, "print_indent_tree: undefined node type %d\n", NODE_TYPE(node));
7914     break;
7915   }
7916 
7917   if (type != NODE_LIST && type != NODE_ALT && type != NODE_QUANT &&
7918       type != NODE_BAG)
7919     fprintf(f, "\n");
7920   fflush(f);
7921 }
7922 
7923 static void
print_tree(FILE * f,Node * node)7924 print_tree(FILE* f, Node* node)
7925 {
7926   print_indent_tree(f, node, 0);
7927 }
7928 #endif
7929