1 /**********************************************************************
2   regcomp.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2019  K.Kosako
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regparse.h"
31 
32 #define OPS_INIT_SIZE  8
33 
34 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
35 
36 #if 0
37 typedef struct {
38   int  n;
39   int  alloc;
40   int* v;
41 } int_stack;
42 
43 static int
44 make_int_stack(int_stack** rs, int init_size)
45 {
46   int_stack* s;
47   int* v;
48 
49   *rs = 0;
50 
51   s = xmalloc(sizeof(*s));
52   if (IS_NULL(s)) return ONIGERR_MEMORY;
53 
54   v = (int* )xmalloc(sizeof(int) * init_size);
55   if (IS_NULL(v)) {
56     xfree(s);
57     return ONIGERR_MEMORY;
58   }
59 
60   s->n = 0;
61   s->alloc = init_size;
62   s->v = v;
63 
64   *rs = s;
65   return ONIG_NORMAL;
66 }
67 
68 static void
69 free_int_stack(int_stack* s)
70 {
71   if (IS_NOT_NULL(s)) {
72     if (IS_NOT_NULL(s->v))
73       xfree(s->v);
74     xfree(s);
75   }
76 }
77 
78 static int
79 int_stack_push(int_stack* s, int v)
80 {
81   if (s->n >= s->alloc) {
82     int new_size = s->alloc * 2;
83     int* nv = (int* )xrealloc(s->v, sizeof(int) * new_size);
84     if (IS_NULL(nv)) return ONIGERR_MEMORY;
85 
86     s->alloc = new_size;
87     s->v = nv;
88   }
89 
90   s->v[s->n] = v;
91   s->n++;
92   return ONIG_NORMAL;
93 }
94 
95 static int
96 int_stack_pop(int_stack* s)
97 {
98   int v;
99 
100 #ifdef ONIG_DEBUG
101   if (s->n <= 0) {
102     fprintf(stderr, "int_stack_pop: fail empty. %p\n", s);
103     return 0;
104   }
105 #endif
106 
107   v = s->v[s->n];
108   s->n--;
109   return v;
110 }
111 #endif
112 
113 static int
ops_init(regex_t * reg,int init_alloc_size)114 ops_init(regex_t* reg, int init_alloc_size)
115 {
116   Operation* p;
117   size_t size;
118 
119   if (init_alloc_size > 0) {
120     size = sizeof(Operation) * init_alloc_size;
121     p = (Operation* )xrealloc(reg->ops, size);
122     CHECK_NULL_RETURN_MEMERR(p);
123 #ifdef USE_DIRECT_THREADED_CODE
124     {
125       enum OpCode* cp;
126       size = sizeof(enum OpCode) * init_alloc_size;
127       cp = (enum OpCode* )xrealloc(reg->ocs, size);
128       CHECK_NULL_RETURN_MEMERR(cp);
129       reg->ocs = cp;
130     }
131 #endif
132   }
133   else {
134     p  = (Operation* )0;
135 #ifdef USE_DIRECT_THREADED_CODE
136     reg->ocs = (enum OpCode* )0;
137 #endif
138   }
139 
140   reg->ops = p;
141   reg->ops_curr  = 0; /* !!! not yet done ops_new() */
142   reg->ops_alloc = init_alloc_size;
143   reg->ops_used  = 0;
144 
145   return ONIG_NORMAL;
146 }
147 
148 static int
ops_expand(regex_t * reg,int n)149 ops_expand(regex_t* reg, int n)
150 {
151 #define MIN_OPS_EXPAND_SIZE   4
152 
153 #ifdef USE_DIRECT_THREADED_CODE
154   enum OpCode* cp;
155 #endif
156   Operation* p;
157   size_t size;
158 
159   if (n <= 0) n = MIN_OPS_EXPAND_SIZE;
160 
161   n += reg->ops_alloc;
162 
163   size = sizeof(Operation) * n;
164   p = (Operation* )xrealloc(reg->ops, size);
165   CHECK_NULL_RETURN_MEMERR(p);
166 
167 #ifdef USE_DIRECT_THREADED_CODE
168   size = sizeof(enum OpCode) * n;
169   cp = (enum OpCode* )xrealloc(reg->ocs, size);
170   CHECK_NULL_RETURN_MEMERR(cp);
171   reg->ocs = cp;
172 #endif
173 
174   reg->ops = p;
175   reg->ops_alloc = n;
176   if (reg->ops_used == 0)
177     reg->ops_curr = 0;
178   else
179     reg->ops_curr = reg->ops + (reg->ops_used - 1);
180 
181   return ONIG_NORMAL;
182 }
183 
184 static int
ops_new(regex_t * reg)185 ops_new(regex_t* reg)
186 {
187   int r;
188 
189   if (reg->ops_used >= reg->ops_alloc) {
190     r = ops_expand(reg, reg->ops_alloc);
191     if (r != ONIG_NORMAL) return r;
192   }
193 
194   reg->ops_curr = reg->ops + reg->ops_used;
195   reg->ops_used++;
196 
197   xmemset(reg->ops_curr, 0, sizeof(Operation));
198   return ONIG_NORMAL;
199 }
200 
201 static int
is_in_string_pool(regex_t * reg,UChar * s)202 is_in_string_pool(regex_t* reg, UChar* s)
203 {
204   return (s >= reg->string_pool && s < reg->string_pool_end);
205 }
206 
207 static void
ops_free(regex_t * reg)208 ops_free(regex_t* reg)
209 {
210   int i;
211 
212   if (IS_NULL(reg->ops)) return ;
213 
214   for (i = 0; i < (int )reg->ops_used; i++) {
215     enum OpCode opcode;
216     Operation* op;
217 
218     op = reg->ops + i;
219 
220 #ifdef USE_DIRECT_THREADED_CODE
221     opcode = *(reg->ocs + i);
222 #else
223     opcode = op->opcode;
224 #endif
225 
226     switch (opcode) {
227     case OP_STR_MBN:
228       if (! is_in_string_pool(reg, op->exact_len_n.s))
229         xfree(op->exact_len_n.s);
230       break;
231     case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N: case OP_STR_N_IC:
232       if (! is_in_string_pool(reg, op->exact_n.s))
233         xfree(op->exact_n.s);
234       break;
235     case OP_STR_1: case OP_STR_2: case OP_STR_3: case OP_STR_4:
236     case OP_STR_5: case OP_STR_MB2N1: case OP_STR_MB2N2:
237     case OP_STR_MB2N3: case OP_STR_1_IC:
238       break;
239 
240     case OP_CCLASS_NOT: case OP_CCLASS:
241       xfree(op->cclass.bsp);
242       break;
243 
244     case OP_CCLASS_MB_NOT: case OP_CCLASS_MB:
245       xfree(op->cclass_mb.mb);
246       break;
247     case OP_CCLASS_MIX_NOT: case OP_CCLASS_MIX:
248       xfree(op->cclass_mix.mb);
249       xfree(op->cclass_mix.bsp);
250       break;
251 
252     case OP_BACKREF1: case OP_BACKREF2: case OP_BACKREF_N: case OP_BACKREF_N_IC:
253       break;
254     case OP_BACKREF_MULTI:      case OP_BACKREF_MULTI_IC:
255     case OP_BACKREF_WITH_LEVEL:
256     case OP_BACKREF_WITH_LEVEL_IC:
257     case OP_BACKREF_CHECK:
258     case OP_BACKREF_CHECK_WITH_LEVEL:
259       if (op->backref_general.num != 1)
260         xfree(op->backref_general.ns);
261       break;
262 
263     default:
264       break;
265     }
266   }
267 
268   xfree(reg->ops);
269 #ifdef USE_DIRECT_THREADED_CODE
270   xfree(reg->ocs);
271   reg->ocs = 0;
272 #endif
273 
274   reg->ops = 0;
275   reg->ops_curr  = 0;
276   reg->ops_alloc = 0;
277   reg->ops_used  = 0;
278 }
279 
280 static int
ops_calc_size_of_string_pool(regex_t * reg)281 ops_calc_size_of_string_pool(regex_t* reg)
282 {
283   int i;
284   int total;
285 
286   if (IS_NULL(reg->ops)) return 0;
287 
288   total = 0;
289   for (i = 0; i < (int )reg->ops_used; i++) {
290     enum OpCode opcode;
291     Operation* op;
292 
293     op = reg->ops + i;
294 #ifdef USE_DIRECT_THREADED_CODE
295     opcode = *(reg->ocs + i);
296 #else
297     opcode = op->opcode;
298 #endif
299 
300     switch (opcode) {
301     case OP_STR_MBN:
302       total += op->exact_len_n.len * op->exact_len_n.n;
303       break;
304     case OP_STR_N:
305     case OP_STR_N_IC:
306       total += op->exact_n.n;
307       break;
308     case OP_STR_MB2N:
309       total += op->exact_n.n * 2;
310       break;
311     case OP_STR_MB3N:
312       total += op->exact_n.n * 3;
313       break;
314 
315     default:
316       break;
317     }
318   }
319 
320   return total;
321 }
322 
323 static int
ops_make_string_pool(regex_t * reg)324 ops_make_string_pool(regex_t* reg)
325 {
326   int i;
327   int len;
328   int size;
329   UChar* pool;
330   UChar* curr;
331 
332   size = ops_calc_size_of_string_pool(reg);
333   if (size <= 0) {
334     return 0;
335   }
336 
337   curr = pool = (UChar* )xmalloc((size_t )size);
338   CHECK_NULL_RETURN_MEMERR(pool);
339 
340   for (i = 0; i < (int )reg->ops_used; i++) {
341     enum OpCode opcode;
342     Operation* op;
343 
344     op = reg->ops + i;
345 #ifdef USE_DIRECT_THREADED_CODE
346     opcode = *(reg->ocs + i);
347 #else
348     opcode = op->opcode;
349 #endif
350 
351     switch (opcode) {
352     case OP_STR_MBN:
353       len = op->exact_len_n.len * op->exact_len_n.n;
354       xmemcpy(curr, op->exact_len_n.s, len);
355       xfree(op->exact_len_n.s);
356       op->exact_len_n.s = curr;
357       curr += len;
358       break;
359     case OP_STR_N:
360     case OP_STR_N_IC:
361       len = op->exact_n.n;
362     copy:
363       xmemcpy(curr, op->exact_n.s, len);
364       xfree(op->exact_n.s);
365       op->exact_n.s = curr;
366       curr += len;
367       break;
368     case OP_STR_MB2N:
369       len = op->exact_n.n * 2;
370       goto copy;
371       break;
372     case OP_STR_MB3N:
373       len = op->exact_n.n * 3;
374       goto copy;
375       break;
376 
377     default:
378       break;
379     }
380   }
381 
382   reg->string_pool     = pool;
383   reg->string_pool_end = pool + size;
384   return 0;
385 }
386 
387 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)388 onig_get_default_case_fold_flag(void)
389 {
390   return OnigDefaultCaseFoldFlag;
391 }
392 
393 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)394 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
395 {
396   OnigDefaultCaseFoldFlag = case_fold_flag;
397   return 0;
398 }
399 
400 static int
int_multiply_cmp(int x,int y,int v)401 int_multiply_cmp(int x, int y, int v)
402 {
403   if (x == 0 || y == 0) return -1;
404 
405   if (x < INT_MAX / y) {
406     int xy = x * y;
407     if (xy > v) return 1;
408     else {
409       if (xy == v) return 0;
410       else return -1;
411     }
412   }
413   else
414     return 1;
415 }
416 
417 extern int
onig_positive_int_multiply(int x,int y)418 onig_positive_int_multiply(int x, int y)
419 {
420   if (x == 0 || y == 0) return 0;
421 
422   if (x < INT_MAX / y)
423     return x * y;
424   else
425     return -1;
426 }
427 
428 
429 static void
node_swap(Node * a,Node * b)430 node_swap(Node* a, Node* b)
431 {
432   Node c;
433 
434   c = *a; *a = *b; *b = c;
435 
436   if (NODE_TYPE(a) == NODE_STRING) {
437     StrNode* sn = STR_(a);
438     if (sn->capacity == 0) {
439       int len = (int )(sn->end - sn->s);
440       sn->s   = sn->buf;
441       sn->end = sn->s + len;
442     }
443   }
444 
445   if (NODE_TYPE(b) == NODE_STRING) {
446     StrNode* sn = STR_(b);
447     if (sn->capacity == 0) {
448       int len = (int )(sn->end - sn->s);
449       sn->s   = sn->buf;
450       sn->end = sn->s + len;
451     }
452   }
453 }
454 
455 static int
node_list_len(Node * list)456 node_list_len(Node* list)
457 {
458   int len;
459 
460   len = 1;
461   while (IS_NOT_NULL(NODE_CDR(list))) {
462     list = NODE_CDR(list);
463     len++;
464   }
465 
466   return len;
467 }
468 
469 static Node*
node_list_add(Node * list,Node * x)470 node_list_add(Node* list, Node* x)
471 {
472   Node *n;
473 
474   n = onig_node_new_list(x, NULL);
475   if (IS_NULL(n)) return NULL_NODE;
476 
477   if (IS_NOT_NULL(list)) {
478     while (IS_NOT_NULL(NODE_CDR(list)))
479       list = NODE_CDR(list);
480 
481     NODE_CDR(list) = n;
482   }
483 
484   return n;
485 }
486 
487 static int
node_str_node_cat(Node * node,Node * add)488 node_str_node_cat(Node* node, Node* add)
489 {
490   int r;
491 
492   if (STR_(node)->flag != STR_(add)->flag)
493     return ONIGERR_TYPE_BUG;
494 
495   r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end);
496   if (r != 0) return r;
497 
498   if (NODE_STRING_IS_CASE_FOLD_MATCH(node))
499     STR_(node)->case_min_len += STR_(add)->case_min_len;
500 
501   return 0;
502 }
503 
504 static int
node_str_cat_case_fold(Node * node,const UChar * s,const UChar * end,int case_min_len)505 node_str_cat_case_fold(Node* node, const UChar* s, const UChar* end, int case_min_len)
506 {
507   int r;
508 
509   if (! NODE_STRING_IS_CASE_FOLD_MATCH(node))
510     return ONIGERR_TYPE_BUG;
511 
512   r = onig_node_str_cat(node, s, end);
513   if (r != 0) return r;
514 
515   STR_(node)->case_min_len += case_min_len;
516   return 0;
517 }
518 
519 static void
node_conv_to_str_node(Node * node,int flag)520 node_conv_to_str_node(Node* node, int flag)
521 {
522   NODE_SET_TYPE(node, NODE_STRING);
523   STR_(node)->flag     = flag;
524   STR_(node)->s        = STR_(node)->buf;
525   STR_(node)->end      = STR_(node)->buf;
526   STR_(node)->capacity = 0;
527   STR_(node)->case_min_len = 0;
528 }
529 
530 static OnigLen
distance_add(OnigLen d1,OnigLen d2)531 distance_add(OnigLen d1, OnigLen d2)
532 {
533   if (d1 == INFINITE_LEN || d2 == INFINITE_LEN)
534     return INFINITE_LEN;
535   else {
536     if (d1 <= INFINITE_LEN - d2) return d1 + d2;
537     else return INFINITE_LEN;
538   }
539 }
540 
541 static OnigLen
distance_multiply(OnigLen d,int m)542 distance_multiply(OnigLen d, int m)
543 {
544   if (m == 0) return 0;
545 
546   if (d < INFINITE_LEN / m)
547     return d * m;
548   else
549     return INFINITE_LEN;
550 }
551 
552 static int
bitset_is_empty(BitSetRef bs)553 bitset_is_empty(BitSetRef bs)
554 {
555   int i;
556 
557   for (i = 0; i < (int )BITSET_SIZE; i++) {
558     if (bs[i] != 0) return 0;
559   }
560   return 1;
561 }
562 
563 #ifdef USE_CALL
564 
565 static int
unset_addr_list_init(UnsetAddrList * list,int size)566 unset_addr_list_init(UnsetAddrList* list, int size)
567 {
568   UnsetAddr* p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
569   CHECK_NULL_RETURN_MEMERR(p);
570 
571   list->num   = 0;
572   list->alloc = size;
573   list->us    = p;
574   return 0;
575 }
576 
577 static void
unset_addr_list_end(UnsetAddrList * list)578 unset_addr_list_end(UnsetAddrList* list)
579 {
580   if (IS_NOT_NULL(list->us))
581     xfree(list->us);
582 }
583 
584 static int
unset_addr_list_add(UnsetAddrList * list,int offset,struct _Node * node)585 unset_addr_list_add(UnsetAddrList* list, int offset, struct _Node* node)
586 {
587   UnsetAddr* p;
588   int size;
589 
590   if (list->num >= list->alloc) {
591     size = list->alloc * 2;
592     p = (UnsetAddr* )xrealloc(list->us, sizeof(UnsetAddr) * size);
593     CHECK_NULL_RETURN_MEMERR(p);
594     list->alloc = size;
595     list->us    = p;
596   }
597 
598   list->us[list->num].offset = offset;
599   list->us[list->num].target = node;
600   list->num++;
601   return 0;
602 }
603 #endif /* USE_CALL */
604 
605 
606 static int
add_op(regex_t * reg,int opcode)607 add_op(regex_t* reg, int opcode)
608 {
609   int r;
610 
611   r = ops_new(reg);
612   if (r != ONIG_NORMAL) return r;
613 
614 #ifdef USE_DIRECT_THREADED_CODE
615   *(reg->ocs + (reg->ops_curr - reg->ops)) = opcode;
616 #else
617   reg->ops_curr->opcode = opcode;
618 #endif
619 
620   return 0;
621 }
622 
623 static int compile_length_tree(Node* node, regex_t* reg);
624 static int compile_tree(Node* node, regex_t* reg, ScanEnv* env);
625 
626 
627 #define IS_NEED_STR_LEN_OP(op) \
628    ((op) == OP_STR_N    || (op) == OP_STR_MB2N ||\
629     (op) == OP_STR_MB3N || (op) == OP_STR_MBN  || (op) == OP_STR_N_IC)
630 
631 static int
select_str_opcode(int mb_len,int str_len)632 select_str_opcode(int mb_len, int str_len)
633 {
634   int op;
635 
636   switch (mb_len) {
637   case 1:
638     switch (str_len) {
639     case 1:  op = OP_STR_1; break;
640     case 2:  op = OP_STR_2; break;
641     case 3:  op = OP_STR_3; break;
642     case 4:  op = OP_STR_4; break;
643     case 5:  op = OP_STR_5; break;
644     default: op = OP_STR_N; break;
645     }
646     break;
647 
648   case 2:
649     switch (str_len) {
650     case 1:  op = OP_STR_MB2N1; break;
651     case 2:  op = OP_STR_MB2N2; break;
652     case 3:  op = OP_STR_MB2N3; break;
653     default: op = OP_STR_MB2N;  break;
654     }
655     break;
656 
657   case 3:
658     op = OP_STR_MB3N;
659     break;
660 
661   default:
662     op = OP_STR_MBN;
663     break;
664   }
665 
666   return op;
667 }
668 
669 static int
is_strict_real_node(Node * node)670 is_strict_real_node(Node* node)
671 {
672   switch (NODE_TYPE(node)) {
673   case NODE_STRING:
674     {
675       StrNode* sn = STR_(node);
676       return (sn->end != sn->s);
677     }
678     break;
679 
680   case NODE_CCLASS:
681   case NODE_CTYPE:
682     return 1;
683     break;
684 
685   default:
686     return 0;
687     break;
688   }
689 }
690 
691 static int
compile_quant_body_with_empty_check(QuantNode * qn,regex_t * reg,ScanEnv * env)692 compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env)
693 {
694   int r;
695   int saved_num_empty_check;
696   int emptiness;
697   Node* body;
698 
699   body = NODE_BODY((Node* )qn);
700   emptiness = qn->emptiness;
701   saved_num_empty_check = reg->num_empty_check;
702 
703   if (emptiness != BODY_IS_NOT_EMPTY) {
704     r = add_op(reg, OP_EMPTY_CHECK_START);
705     if (r != 0) return r;
706     COP(reg)->empty_check_start.mem = reg->num_empty_check; /* NULL CHECK ID */
707     reg->num_empty_check++;
708   }
709 
710   r = compile_tree(body, reg, env);
711   if (r != 0) return r;
712 
713   if (emptiness != BODY_IS_NOT_EMPTY) {
714     if (emptiness == BODY_IS_EMPTY_POSSIBILITY)
715       r = add_op(reg, OP_EMPTY_CHECK_END);
716     else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) {
717       if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0)
718         r = add_op(reg, OP_EMPTY_CHECK_END_MEMST);
719       else
720         r = add_op(reg, OP_EMPTY_CHECK_END);
721     }
722 #ifdef USE_CALL
723     else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC)
724       r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH);
725 #endif
726 
727     if (r != 0) return r;
728     COP(reg)->empty_check_end.mem = saved_num_empty_check; /* NULL CHECK ID */
729   }
730   return r;
731 }
732 
733 #ifdef USE_CALL
734 static int
compile_call(CallNode * node,regex_t * reg,ScanEnv * env)735 compile_call(CallNode* node, regex_t* reg, ScanEnv* env)
736 {
737   int r;
738   int offset;
739 
740   r = add_op(reg, OP_CALL);
741   if (r != 0) return r;
742 
743   COP(reg)->call.addr = 0; /* dummy addr. */
744 
745   offset = COP_CURR_OFFSET_BYTES(reg, call.addr);
746   r = unset_addr_list_add(env->unset_addr_list, offset, NODE_CALL_BODY(node));
747   return r;
748 }
749 #endif
750 
751 static int
compile_tree_n_times(Node * node,int n,regex_t * reg,ScanEnv * env)752 compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env)
753 {
754   int i, r;
755 
756   for (i = 0; i < n; i++) {
757     r = compile_tree(node, reg, env);
758     if (r != 0) return r;
759   }
760   return 0;
761 }
762 
763 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED)764 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
765                           regex_t* reg ARG_UNUSED)
766 {
767   return 1;
768 }
769 
770 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg)771 add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg)
772 {
773   int op;
774   int r;
775   int byte_len;
776   UChar* p;
777   UChar* end;
778 
779   op = select_str_opcode(mb_len, str_len);
780   r = add_op(reg, op);
781   if (r != 0) return r;
782 
783   byte_len = mb_len * str_len;
784   end = s + byte_len;
785 
786   if (op == OP_STR_MBN) {
787     p = onigenc_strdup(reg->enc, s, end);
788     CHECK_NULL_RETURN_MEMERR(p);
789 
790     COP(reg)->exact_len_n.len = mb_len;
791     COP(reg)->exact_len_n.n   = str_len;
792     COP(reg)->exact_len_n.s   = p;
793   }
794   else if (IS_NEED_STR_LEN_OP(op)) {
795     p = onigenc_strdup(reg->enc, s, end);
796     CHECK_NULL_RETURN_MEMERR(p);
797 
798     if (op == OP_STR_N_IC)
799       COP(reg)->exact_n.n = byte_len;
800     else
801       COP(reg)->exact_n.n = str_len;
802 
803     COP(reg)->exact_n.s = p;
804   }
805   else {
806     xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s));
807     xmemcpy(COP(reg)->exact.s, s, (size_t )byte_len);
808   }
809 
810   return 0;
811 }
812 
813 static int
compile_length_string_node(Node * node,regex_t * reg)814 compile_length_string_node(Node* node, regex_t* reg)
815 {
816   int rlen, r, len, prev_len, slen;
817   UChar *p, *prev;
818   StrNode* sn;
819   OnigEncoding enc = reg->enc;
820 
821   sn = STR_(node);
822   if (sn->end <= sn->s)
823     return 0;
824 
825   if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) return 1;
826 
827   p = prev = sn->s;
828   prev_len = enclen(enc, p);
829   p += prev_len;
830   slen = 1;
831   rlen = 0;
832 
833   for (; p < sn->end; ) {
834     len = enclen(enc, p);
835     if (len == prev_len) {
836       slen++;
837     }
838     else {
839       r = add_compile_string_length(prev, prev_len, slen, reg);
840       rlen += r;
841       prev = p;
842       slen = 1;
843       prev_len = len;
844     }
845     p += len;
846   }
847 
848   r = add_compile_string_length(prev, prev_len, slen, reg);
849   rlen += r;
850   return rlen;
851 }
852 
853 static int
compile_length_string_crude_node(StrNode * sn,regex_t * reg)854 compile_length_string_crude_node(StrNode* sn, regex_t* reg)
855 {
856   if (sn->end <= sn->s)
857     return 0;
858 
859   return add_compile_string_length(sn->s, 1 /* sb */, (int )(sn->end - sn->s),
860                                    reg);
861 }
862 
863 static int
compile_ambig_string_node(Node * node,regex_t * reg)864 compile_ambig_string_node(Node* node, regex_t* reg)
865 {
866   int r;
867   int len;
868   int byte_len;
869   UChar* p;
870   StrNode* sn;
871   OnigEncoding enc = reg->enc;
872 
873   sn = STR_(node);
874   len = enclen(enc, sn->s);
875   byte_len = (int )(sn->end - sn->s);
876   if (len == byte_len) {
877     r = add_op(reg, OP_STR_1_IC);
878     if (r != 0) return r;
879 
880     xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s));
881     xmemcpy(COP(reg)->exact.s, sn->s, (size_t )byte_len);
882   }
883   else {
884     r = add_op(reg, OP_STR_N_IC);
885     if (r != 0) return r;
886 
887     p = onigenc_strdup(enc, sn->s, sn->end);
888     CHECK_NULL_RETURN_MEMERR(p);
889 
890     COP(reg)->exact_n.s = p;
891     COP(reg)->exact_n.n = byte_len;
892   }
893 
894   return 0;
895 }
896 
897 static int
compile_string_node(Node * node,regex_t * reg)898 compile_string_node(Node* node, regex_t* reg)
899 {
900   int r, len, prev_len, slen;
901   UChar *p, *prev, *end;
902   StrNode* sn;
903   OnigEncoding enc = reg->enc;
904 
905   sn = STR_(node);
906   if (sn->end <= sn->s)
907     return 0;
908 
909   end = sn->end;
910   if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) {
911     return compile_ambig_string_node(node, reg);
912   }
913 
914   p = prev = sn->s;
915   prev_len = enclen(enc, p);
916   p += prev_len;
917   slen = 1;
918 
919   for (; p < end; ) {
920     len = enclen(enc, p);
921     if (len == prev_len) {
922       slen++;
923     }
924     else {
925       r = add_compile_string(prev, prev_len, slen, reg);
926       if (r != 0) return r;
927 
928       prev  = p;
929       slen  = 1;
930       prev_len = len;
931     }
932 
933     p += len;
934   }
935 
936   return add_compile_string(prev, prev_len, slen, reg);
937 }
938 
939 static int
compile_string_crude_node(StrNode * sn,regex_t * reg)940 compile_string_crude_node(StrNode* sn, regex_t* reg)
941 {
942   if (sn->end <= sn->s)
943     return 0;
944 
945   return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg);
946 }
947 
948 static void*
set_multi_byte_cclass(BBuf * mbuf,regex_t * reg)949 set_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
950 {
951   size_t len;
952   void* p;
953 
954   len = (size_t )mbuf->used;
955   p = xmalloc(len);
956   if (IS_NULL(p)) return NULL;
957 
958   xmemcpy(p, mbuf->p, len);
959   return p;
960 }
961 
962 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)963 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
964 {
965   return 1;
966 }
967 
968 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)969 compile_cclass_node(CClassNode* cc, regex_t* reg)
970 {
971   int r;
972 
973   if (IS_NULL(cc->mbuf)) {
974     r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_NOT : OP_CCLASS);
975     if (r != 0) return r;
976 
977     COP(reg)->cclass.bsp = xmalloc(SIZE_BITSET);
978     CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass.bsp);
979     xmemcpy(COP(reg)->cclass.bsp, cc->bs, SIZE_BITSET);
980   }
981   else {
982     void* p;
983 
984     if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
985       r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MB_NOT : OP_CCLASS_MB);
986       if (r != 0) return r;
987 
988       p = set_multi_byte_cclass(cc->mbuf, reg);
989       CHECK_NULL_RETURN_MEMERR(p);
990       COP(reg)->cclass_mb.mb = p;
991     }
992     else {
993       r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MIX_NOT : OP_CCLASS_MIX);
994       if (r != 0) return r;
995 
996       COP(reg)->cclass_mix.bsp = xmalloc(SIZE_BITSET);
997       CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass_mix.bsp);
998       xmemcpy(COP(reg)->cclass_mix.bsp, cc->bs, SIZE_BITSET);
999 
1000       p = set_multi_byte_cclass(cc->mbuf, reg);
1001       CHECK_NULL_RETURN_MEMERR(p);
1002       COP(reg)->cclass_mix.mb = p;
1003     }
1004   }
1005 
1006   return 0;
1007 }
1008 
1009 static void
set_addr_in_repeat_range(regex_t * reg)1010 set_addr_in_repeat_range(regex_t* reg)
1011 {
1012   int i;
1013 
1014   for (i = 0; i < reg->num_repeat; i++) {
1015     RepeatRange* p = reg->repeat_range + i;
1016     int offset = p->u.offset;
1017     p->u.pcode = reg->ops + offset;
1018   }
1019 }
1020 
1021 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper,int ops_index)1022 entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index)
1023 {
1024 #define REPEAT_RANGE_ALLOC  4
1025 
1026   RepeatRange* p;
1027 
1028   if (reg->repeat_range_alloc == 0) {
1029     p = (RepeatRange* )xmalloc(sizeof(RepeatRange) * REPEAT_RANGE_ALLOC);
1030     CHECK_NULL_RETURN_MEMERR(p);
1031     reg->repeat_range = p;
1032     reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
1033   }
1034   else if (reg->repeat_range_alloc <= id) {
1035     int n;
1036     n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
1037     p = (RepeatRange* )xrealloc(reg->repeat_range, sizeof(RepeatRange) * n);
1038     CHECK_NULL_RETURN_MEMERR(p);
1039     reg->repeat_range = p;
1040     reg->repeat_range_alloc = n;
1041   }
1042   else {
1043     p = reg->repeat_range;
1044   }
1045 
1046   p[id].lower    = lower;
1047   p[id].upper    = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper);
1048   p[id].u.offset = ops_index;
1049   return 0;
1050 }
1051 
1052 static int
compile_range_repeat_node(QuantNode * qn,int target_len,int emptiness,regex_t * reg,ScanEnv * env)1053 compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness,
1054                           regex_t* reg, ScanEnv* env)
1055 {
1056   int r;
1057   int num_repeat = reg->num_repeat++;
1058 
1059   r = add_op(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
1060   if (r != 0) return r;
1061 
1062   COP(reg)->repeat.id   = num_repeat;
1063   COP(reg)->repeat.addr = SIZE_INC + target_len + OPSIZE_REPEAT_INC;
1064 
1065   r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper,
1066                          COP_CURR_OFFSET(reg) + OPSIZE_REPEAT);
1067   if (r != 0) return r;
1068 
1069   r = compile_quant_body_with_empty_check(qn, reg, env);
1070   if (r != 0) return r;
1071 
1072   r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
1073   if (r != 0) return r;
1074 
1075   COP(reg)->repeat_inc.id = num_repeat;
1076   return r;
1077 }
1078 
1079 static int
is_anychar_infinite_greedy(QuantNode * qn)1080 is_anychar_infinite_greedy(QuantNode* qn)
1081 {
1082   if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) &&
1083       NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn)))
1084     return 1;
1085   else
1086     return 0;
1087 }
1088 
1089 #define QUANTIFIER_EXPAND_LIMIT_SIZE   10
1090 #define CKN_ON   (ckn > 0)
1091 
1092 static int
compile_length_quantifier_node(QuantNode * qn,regex_t * reg)1093 compile_length_quantifier_node(QuantNode* qn, regex_t* reg)
1094 {
1095   int len, mod_tlen;
1096   int infinite = IS_INFINITE_REPEAT(qn->upper);
1097   enum BodyEmptyType emptiness = qn->emptiness;
1098   int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1099 
1100   if (tlen < 0) return tlen;
1101   if (tlen == 0) return 0;
1102 
1103   /* anychar repeat */
1104   if (is_anychar_infinite_greedy(qn)) {
1105     if (qn->lower <= 1 ||
1106         int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) {
1107       if (IS_NOT_NULL(qn->next_head_exact))
1108         return OPSIZE_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
1109       else
1110         return OPSIZE_ANYCHAR_STAR + tlen * qn->lower;
1111     }
1112   }
1113 
1114   mod_tlen = tlen;
1115   if (emptiness != BODY_IS_NOT_EMPTY)
1116     mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END;
1117 
1118   if (infinite &&
1119       (qn->lower <= 1 ||
1120        int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1121     if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1122       len = OPSIZE_JUMP;
1123     }
1124     else {
1125       len = tlen * qn->lower;
1126     }
1127 
1128     if (qn->greedy) {
1129 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1130       if (IS_NOT_NULL(qn->head_exact))
1131         len += OPSIZE_PUSH_OR_JUMP_EXACT1 + mod_tlen + OPSIZE_JUMP;
1132       else
1133 #endif
1134       if (IS_NOT_NULL(qn->next_head_exact))
1135         len += OPSIZE_PUSH_IF_PEEK_NEXT + mod_tlen + OPSIZE_JUMP;
1136       else
1137         len += OPSIZE_PUSH + mod_tlen + OPSIZE_JUMP;
1138     }
1139     else
1140       len += OPSIZE_JUMP + mod_tlen + OPSIZE_PUSH;
1141   }
1142   else if (qn->upper == 0) {
1143     if (qn->include_referred != 0) { /* /(?<n>..){0}/ */
1144       len = OPSIZE_JUMP + tlen;
1145     }
1146     else
1147       len = 0;
1148   }
1149   else if (!infinite && qn->greedy &&
1150            (qn->upper == 1 ||
1151             int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper,
1152                              QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1153     len = tlen * qn->lower;
1154     len += (OPSIZE_PUSH + tlen) * (qn->upper - qn->lower);
1155   }
1156   else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1157     len = OPSIZE_PUSH + OPSIZE_JUMP + tlen;
1158   }
1159   else {
1160     len = OPSIZE_REPEAT_INC + mod_tlen + OPSIZE_REPEAT;
1161   }
1162 
1163   return len;
1164 }
1165 
1166 static int
compile_quantifier_node(QuantNode * qn,regex_t * reg,ScanEnv * env)1167 compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)
1168 {
1169   int i, r, mod_tlen;
1170   int infinite = IS_INFINITE_REPEAT(qn->upper);
1171   enum BodyEmptyType emptiness = qn->emptiness;
1172   int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1173 
1174   if (tlen < 0) return tlen;
1175   if (tlen == 0) return 0;
1176 
1177   if (is_anychar_infinite_greedy(qn) &&
1178       (qn->lower <= 1 ||
1179        int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1180     r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1181     if (r != 0) return r;
1182     if (IS_NOT_NULL(qn->next_head_exact)) {
1183       r = add_op(reg,
1184                  IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), reg)) ?
1185                  OP_ANYCHAR_ML_STAR_PEEK_NEXT : OP_ANYCHAR_STAR_PEEK_NEXT);
1186       if (r != 0) return r;
1187 
1188       COP(reg)->anychar_star_peek_next.c = STR_(qn->next_head_exact)->s[0];
1189       return 0;
1190     }
1191     else {
1192       r = add_op(reg,
1193                  IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), reg)) ?
1194                  OP_ANYCHAR_ML_STAR : OP_ANYCHAR_STAR);
1195       return r;
1196     }
1197   }
1198 
1199   mod_tlen = tlen;
1200   if (emptiness != BODY_IS_NOT_EMPTY)
1201     mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END;
1202 
1203   if (infinite &&
1204       (qn->lower <= 1 ||
1205        int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1206     int addr;
1207 
1208     if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1209       r = add_op(reg, OP_JUMP);
1210       if (r != 0) return r;
1211       if (qn->greedy) {
1212 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1213         if (IS_NOT_NULL(qn->head_exact))
1214           COP(reg)->jump.addr = OPSIZE_PUSH_OR_JUMP_EXACT1 + SIZE_INC;
1215         else
1216 #endif
1217         if (IS_NOT_NULL(qn->next_head_exact))
1218           COP(reg)->jump.addr = OPSIZE_PUSH_IF_PEEK_NEXT + SIZE_INC;
1219         else
1220           COP(reg)->jump.addr = OPSIZE_PUSH + SIZE_INC;
1221       }
1222       else {
1223         COP(reg)->jump.addr = OPSIZE_JUMP + SIZE_INC;
1224       }
1225     }
1226     else {
1227       r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1228       if (r != 0) return r;
1229     }
1230 
1231     if (qn->greedy) {
1232 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1233       if (IS_NOT_NULL(qn->head_exact)) {
1234         r = add_op(reg, OP_PUSH_OR_JUMP_EXACT1);
1235         if (r != 0) return r;
1236         COP(reg)->push_or_jump_exact1.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1237         COP(reg)->push_or_jump_exact1.c    = STR_(qn->head_exact)->s[0];
1238 
1239         r = compile_quant_body_with_empty_check(qn, reg, env);
1240         if (r != 0) return r;
1241 
1242         addr = -(mod_tlen + (int )OPSIZE_PUSH_OR_JUMP_EXACT1);
1243       }
1244       else
1245 #endif
1246       if (IS_NOT_NULL(qn->next_head_exact)) {
1247         r = add_op(reg, OP_PUSH_IF_PEEK_NEXT);
1248         if (r != 0) return r;
1249         COP(reg)->push_if_peek_next.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1250         COP(reg)->push_if_peek_next.c    = STR_(qn->next_head_exact)->s[0];
1251 
1252         r = compile_quant_body_with_empty_check(qn, reg, env);
1253         if (r != 0) return r;
1254 
1255         addr = -(mod_tlen + (int )OPSIZE_PUSH_IF_PEEK_NEXT);
1256       }
1257       else {
1258         r = add_op(reg, OP_PUSH);
1259         if (r != 0) return r;
1260         COP(reg)->push.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1261 
1262         r = compile_quant_body_with_empty_check(qn, reg, env);
1263         if (r != 0) return r;
1264 
1265         addr = -(mod_tlen + (int )OPSIZE_PUSH);
1266       }
1267 
1268       r = add_op(reg, OP_JUMP);
1269       if (r != 0) return r;
1270       COP(reg)->jump.addr = addr;
1271     }
1272     else {
1273       r = add_op(reg, OP_JUMP);
1274       if (r != 0) return r;
1275       COP(reg)->jump.addr = mod_tlen + SIZE_INC;
1276 
1277       r = compile_quant_body_with_empty_check(qn, reg, env);
1278       if (r != 0) return r;
1279 
1280       r = add_op(reg, OP_PUSH);
1281       if (r != 0) return r;
1282       COP(reg)->push.addr = -mod_tlen;
1283     }
1284   }
1285   else if (qn->upper == 0) {
1286     if (qn->include_referred != 0) { /* /(?<n>..){0}/ */
1287       r = add_op(reg, OP_JUMP);
1288       if (r != 0) return r;
1289       COP(reg)->jump.addr = tlen + SIZE_INC;
1290 
1291       r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1292     }
1293     else {
1294       /* Nothing output */
1295       r = 0;
1296     }
1297   }
1298   else if (! infinite && qn->greedy &&
1299            (qn->upper == 1 ||
1300             int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper,
1301                              QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1302     int n = qn->upper - qn->lower;
1303 
1304     r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1305     if (r != 0) return r;
1306 
1307     for (i = 0; i < n; i++) {
1308       int v = onig_positive_int_multiply(n - i, tlen + OPSIZE_PUSH);
1309       if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1310 
1311       r = add_op(reg, OP_PUSH);
1312       if (r != 0) return r;
1313       COP(reg)->push.addr = v;
1314 
1315       r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1316       if (r != 0) return r;
1317     }
1318   }
1319   else if (! qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1320     r = add_op(reg, OP_PUSH);
1321     if (r != 0) return r;
1322     COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP;
1323 
1324     r = add_op(reg, OP_JUMP);
1325     if (r != 0) return r;
1326     COP(reg)->jump.addr = tlen + SIZE_INC;
1327 
1328     r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1329   }
1330   else {
1331     r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env);
1332   }
1333   return r;
1334 }
1335 
1336 static int
compile_length_option_node(BagNode * node,regex_t * reg)1337 compile_length_option_node(BagNode* node, regex_t* reg)
1338 {
1339   int tlen;
1340   OnigOptionType prev = reg->options;
1341 
1342   reg->options = node->o.options;
1343   tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1344   reg->options = prev;
1345 
1346   return tlen;
1347 }
1348 
1349 static int
compile_option_node(BagNode * node,regex_t * reg,ScanEnv * env)1350 compile_option_node(BagNode* node, regex_t* reg, ScanEnv* env)
1351 {
1352   int r;
1353   OnigOptionType prev = reg->options;
1354 
1355   reg->options = node->o.options;
1356   r = compile_tree(NODE_BAG_BODY(node), reg, env);
1357   reg->options = prev;
1358 
1359   return r;
1360 }
1361 
1362 static int
compile_length_bag_node(BagNode * node,regex_t * reg)1363 compile_length_bag_node(BagNode* node, regex_t* reg)
1364 {
1365   int len;
1366   int tlen;
1367 
1368   if (node->type == BAG_OPTION)
1369     return compile_length_option_node(node, reg);
1370 
1371   if (NODE_BAG_BODY(node)) {
1372     tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1373     if (tlen < 0) return tlen;
1374   }
1375   else
1376     tlen = 0;
1377 
1378   switch (node->type) {
1379   case BAG_MEMORY:
1380 #ifdef USE_CALL
1381 
1382     if (node->m.regnum == 0 && NODE_IS_CALLED(node)) {
1383       len = tlen + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN;
1384       return len;
1385     }
1386 
1387     if (NODE_IS_CALLED(node)) {
1388       len = OPSIZE_MEM_START_PUSH + tlen
1389         + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN;
1390       if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1391         len += (NODE_IS_RECURSION(node)
1392                 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH);
1393       else
1394         len += (NODE_IS_RECURSION(node)
1395                 ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END);
1396     }
1397     else if (NODE_IS_RECURSION(node)) {
1398       len = OPSIZE_MEM_START_PUSH;
1399       len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)
1400                      ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_REC);
1401     }
1402     else
1403 #endif
1404     {
1405       if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum))
1406         len = OPSIZE_MEM_START_PUSH;
1407       else
1408         len = OPSIZE_MEM_START;
1409 
1410       len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)
1411                      ? OPSIZE_MEM_END_PUSH : OPSIZE_MEM_END);
1412     }
1413     break;
1414 
1415   case BAG_STOP_BACKTRACK:
1416     if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1417       int v;
1418       QuantNode* qn;
1419 
1420       qn = QUANT_(NODE_BAG_BODY(node));
1421       tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1422       if (tlen < 0) return tlen;
1423 
1424       v = onig_positive_int_multiply(qn->lower, tlen);
1425       if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1426       len = v + OPSIZE_PUSH + tlen + OPSIZE_POP_OUT + OPSIZE_JUMP;
1427     }
1428     else {
1429       len = OPSIZE_ATOMIC_START + tlen + OPSIZE_ATOMIC_END;
1430     }
1431     break;
1432 
1433   case BAG_IF_ELSE:
1434     {
1435       Node* cond = NODE_BAG_BODY(node);
1436       Node* Then = node->te.Then;
1437       Node* Else = node->te.Else;
1438 
1439       len = compile_length_tree(cond, reg);
1440       if (len < 0) return len;
1441       len += OPSIZE_PUSH;
1442       len += OPSIZE_ATOMIC_START + OPSIZE_ATOMIC_END;
1443 
1444       if (IS_NOT_NULL(Then)) {
1445         tlen = compile_length_tree(Then, reg);
1446         if (tlen < 0) return tlen;
1447         len += tlen;
1448       }
1449 
1450       len += OPSIZE_JUMP + OPSIZE_ATOMIC_END;
1451 
1452       if (IS_NOT_NULL(Else)) {
1453         tlen = compile_length_tree(Else, reg);
1454         if (tlen < 0) return tlen;
1455         len += tlen;
1456       }
1457     }
1458     break;
1459 
1460   case BAG_OPTION:
1461     /* never come here, but set for escape warning */
1462     len = 0;
1463     break;
1464   }
1465 
1466   return len;
1467 }
1468 
1469 static int get_char_len_node(Node* node, regex_t* reg, int* len);
1470 
1471 static int
compile_bag_memory_node(BagNode * node,regex_t * reg,ScanEnv * env)1472 compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env)
1473 {
1474   int r;
1475 
1476 #ifdef USE_CALL
1477   if (NODE_IS_CALLED(node)) {
1478     int len;
1479 
1480     r = add_op(reg, OP_CALL);
1481     if (r != 0) return r;
1482 
1483     node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + OPSIZE_JUMP;
1484     NODE_STATUS_ADD(node, ADDR_FIXED);
1485     COP(reg)->call.addr = (int )node->m.called_addr;
1486 
1487     if (node->m.regnum == 0) {
1488       len = compile_length_tree(NODE_BAG_BODY(node), reg);
1489       len += OPSIZE_RETURN;
1490 
1491       r = add_op(reg, OP_JUMP);
1492       if (r != 0) return r;
1493       COP(reg)->jump.addr = len + SIZE_INC;
1494 
1495       r = compile_tree(NODE_BAG_BODY(node), reg, env);
1496       if (r != 0) return r;
1497 
1498       r = add_op(reg, OP_RETURN);
1499       return r;
1500     }
1501     else {
1502       len = compile_length_tree(NODE_BAG_BODY(node), reg);
1503       len += (OPSIZE_MEM_START_PUSH + OPSIZE_RETURN);
1504       if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1505         len += (NODE_IS_RECURSION(node)
1506                 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH);
1507       else
1508         len += (NODE_IS_RECURSION(node) ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END);
1509 
1510       r = add_op(reg, OP_JUMP);
1511       if (r != 0) return r;
1512       COP(reg)->jump.addr = len + SIZE_INC;
1513     }
1514   }
1515 #endif
1516 
1517   if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum))
1518     r = add_op(reg, OP_MEM_START_PUSH);
1519   else
1520     r = add_op(reg, OP_MEM_START);
1521   if (r != 0) return r;
1522   COP(reg)->memory_start.num = node->m.regnum;
1523 
1524   r = compile_tree(NODE_BAG_BODY(node), reg, env);
1525   if (r != 0) return r;
1526 
1527 #ifdef USE_CALL
1528   if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1529     r = add_op(reg, (NODE_IS_RECURSION(node)
1530                      ? OP_MEM_END_PUSH_REC : OP_MEM_END_PUSH));
1531   else
1532     r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEM_END_REC : OP_MEM_END));
1533   if (r != 0) return r;
1534   COP(reg)->memory_end.num = node->m.regnum;
1535 
1536   if (NODE_IS_CALLED(node)) {
1537     if (r != 0) return r;
1538     r = add_op(reg, OP_RETURN);
1539   }
1540 #else
1541   if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1542     r = add_op(reg, OP_MEM_END_PUSH);
1543   else
1544     r = add_op(reg, OP_MEM_END);
1545   if (r != 0) return r;
1546   COP(reg)->memory_end.num = node->m.regnum;
1547 #endif
1548 
1549   return r;
1550 }
1551 
1552 static int
compile_bag_node(BagNode * node,regex_t * reg,ScanEnv * env)1553 compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)
1554 {
1555   int r, len;
1556 
1557   switch (node->type) {
1558   case BAG_MEMORY:
1559     r = compile_bag_memory_node(node, reg, env);
1560     break;
1561 
1562   case BAG_OPTION:
1563     r = compile_option_node(node, reg, env);
1564     break;
1565 
1566   case BAG_STOP_BACKTRACK:
1567     if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1568       QuantNode* qn = QUANT_(NODE_BAG_BODY(node));
1569       r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1570       if (r != 0) return r;
1571 
1572       len = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1573       if (len < 0) return len;
1574 
1575       r = add_op(reg, OP_PUSH);
1576       if (r != 0) return r;
1577       COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP_OUT + OPSIZE_JUMP;
1578 
1579       r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1580       if (r != 0) return r;
1581       r = add_op(reg, OP_POP_OUT);
1582       if (r != 0) return r;
1583 
1584       r = add_op(reg, OP_JUMP);
1585       if (r != 0) return r;
1586       COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP_OUT);
1587     }
1588     else {
1589       r = add_op(reg, OP_ATOMIC_START);
1590       if (r != 0) return r;
1591       r = compile_tree(NODE_BAG_BODY(node), reg, env);
1592       if (r != 0) return r;
1593       r = add_op(reg, OP_ATOMIC_END);
1594     }
1595     break;
1596 
1597   case BAG_IF_ELSE:
1598     {
1599       int cond_len, then_len, else_len, jump_len;
1600       Node* cond = NODE_BAG_BODY(node);
1601       Node* Then = node->te.Then;
1602       Node* Else = node->te.Else;
1603 
1604       r = add_op(reg, OP_ATOMIC_START);
1605       if (r != 0) return r;
1606 
1607       cond_len = compile_length_tree(cond, reg);
1608       if (cond_len < 0) return cond_len;
1609       if (IS_NOT_NULL(Then)) {
1610         then_len = compile_length_tree(Then, reg);
1611         if (then_len < 0) return then_len;
1612       }
1613       else
1614         then_len = 0;
1615 
1616       jump_len = cond_len + then_len + OPSIZE_ATOMIC_END + OPSIZE_JUMP;
1617 
1618       r = add_op(reg, OP_PUSH);
1619       if (r != 0) return r;
1620       COP(reg)->push.addr = SIZE_INC + jump_len;
1621 
1622       r = compile_tree(cond, reg, env);
1623       if (r != 0) return r;
1624       r = add_op(reg, OP_ATOMIC_END);
1625       if (r != 0) return r;
1626 
1627       if (IS_NOT_NULL(Then)) {
1628         r = compile_tree(Then, reg, env);
1629         if (r != 0) return r;
1630       }
1631 
1632       if (IS_NOT_NULL(Else)) {
1633         else_len = compile_length_tree(Else, reg);
1634         if (else_len < 0) return else_len;
1635       }
1636       else
1637         else_len = 0;
1638 
1639       r = add_op(reg, OP_JUMP);
1640       if (r != 0) return r;
1641       COP(reg)->jump.addr = OPSIZE_ATOMIC_END + else_len + SIZE_INC;
1642 
1643       r = add_op(reg, OP_ATOMIC_END);
1644       if (r != 0) return r;
1645 
1646       if (IS_NOT_NULL(Else)) {
1647         r = compile_tree(Else, reg, env);
1648       }
1649     }
1650     break;
1651   }
1652 
1653   return r;
1654 }
1655 
1656 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1657 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1658 {
1659   int len;
1660   int tlen = 0;
1661 
1662   if (IS_NOT_NULL(NODE_ANCHOR_BODY(node))) {
1663     tlen = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1664     if (tlen < 0) return tlen;
1665   }
1666 
1667   switch (node->type) {
1668   case ANCR_PREC_READ:
1669     len = OPSIZE_PREC_READ_START + tlen + OPSIZE_PREC_READ_END;
1670     break;
1671   case ANCR_PREC_READ_NOT:
1672     len = OPSIZE_PREC_READ_NOT_START + tlen + OPSIZE_PREC_READ_NOT_END;
1673     break;
1674   case ANCR_LOOK_BEHIND:
1675     len = OPSIZE_LOOK_BEHIND + tlen;
1676     break;
1677   case ANCR_LOOK_BEHIND_NOT:
1678     len = OPSIZE_LOOK_BEHIND_NOT_START + tlen + OPSIZE_LOOK_BEHIND_NOT_END;
1679     break;
1680 
1681   case ANCR_WORD_BOUNDARY:
1682   case ANCR_NO_WORD_BOUNDARY:
1683 #ifdef USE_WORD_BEGIN_END
1684   case ANCR_WORD_BEGIN:
1685   case ANCR_WORD_END:
1686 #endif
1687     len = OPSIZE_WORD_BOUNDARY;
1688     break;
1689 
1690   case ANCR_TEXT_SEGMENT_BOUNDARY:
1691   case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
1692     len = SIZE_OPCODE;
1693     break;
1694 
1695   default:
1696     len = SIZE_OPCODE;
1697     break;
1698   }
1699 
1700   return len;
1701 }
1702 
1703 static int
compile_anchor_node(AnchorNode * node,regex_t * reg,ScanEnv * env)1704 compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env)
1705 {
1706   int r, len;
1707   enum OpCode op;
1708 
1709   switch (node->type) {
1710   case ANCR_BEGIN_BUF:      r = add_op(reg, OP_BEGIN_BUF);      break;
1711   case ANCR_END_BUF:        r = add_op(reg, OP_END_BUF);        break;
1712   case ANCR_BEGIN_LINE:     r = add_op(reg, OP_BEGIN_LINE);     break;
1713   case ANCR_END_LINE:       r = add_op(reg, OP_END_LINE);       break;
1714   case ANCR_SEMI_END_BUF:   r = add_op(reg, OP_SEMI_END_BUF);   break;
1715   case ANCR_BEGIN_POSITION: r = add_op(reg, OP_BEGIN_POSITION); break;
1716 
1717   case ANCR_WORD_BOUNDARY:
1718     op = OP_WORD_BOUNDARY;
1719   word:
1720     r = add_op(reg, op);
1721     if (r != 0) return r;
1722     COP(reg)->word_boundary.mode = (ModeType )node->ascii_mode;
1723     break;
1724 
1725   case ANCR_NO_WORD_BOUNDARY:
1726     op = OP_NO_WORD_BOUNDARY; goto word;
1727     break;
1728 #ifdef USE_WORD_BEGIN_END
1729   case ANCR_WORD_BEGIN:
1730     op = OP_WORD_BEGIN; goto word;
1731     break;
1732   case ANCR_WORD_END:
1733     op = OP_WORD_END; goto word;
1734     break;
1735 #endif
1736 
1737   case ANCR_TEXT_SEGMENT_BOUNDARY:
1738   case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
1739     {
1740       enum TextSegmentBoundaryType type;
1741 
1742       r = add_op(reg, OP_TEXT_SEGMENT_BOUNDARY);
1743       if (r != 0) return r;
1744 
1745       type = EXTENDED_GRAPHEME_CLUSTER_BOUNDARY;
1746 #ifdef USE_UNICODE_WORD_BREAK
1747       if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_TEXT_SEGMENT_WORD))
1748         type = WORD_BOUNDARY;
1749 #endif
1750 
1751       COP(reg)->text_segment_boundary.type = type;
1752       COP(reg)->text_segment_boundary.not =
1753         (node->type == ANCR_NO_TEXT_SEGMENT_BOUNDARY ? 1 : 0);
1754     }
1755     break;
1756 
1757   case ANCR_PREC_READ:
1758     r = add_op(reg, OP_PREC_READ_START);
1759     if (r != 0) return r;
1760     r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
1761     if (r != 0) return r;
1762     r = add_op(reg, OP_PREC_READ_END);
1763     break;
1764 
1765   case ANCR_PREC_READ_NOT:
1766     len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1767     if (len < 0) return len;
1768 
1769     r = add_op(reg, OP_PREC_READ_NOT_START);
1770     if (r != 0) return r;
1771     COP(reg)->prec_read_not_start.addr = SIZE_INC + len + OPSIZE_PREC_READ_NOT_END;
1772     r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
1773     if (r != 0) return r;
1774     r = add_op(reg, OP_PREC_READ_NOT_END);
1775     break;
1776 
1777   case ANCR_LOOK_BEHIND:
1778     {
1779       int n;
1780       r = add_op(reg, OP_LOOK_BEHIND);
1781       if (r != 0) return r;
1782       if (node->char_len < 0) {
1783         r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n);
1784         if (r != 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1785       }
1786       else
1787         n = node->char_len;
1788 
1789       COP(reg)->look_behind.len = n;
1790       r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
1791     }
1792     break;
1793 
1794   case ANCR_LOOK_BEHIND_NOT:
1795     {
1796       int n;
1797 
1798       len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1799       r = add_op(reg, OP_LOOK_BEHIND_NOT_START);
1800       if (r != 0) return r;
1801       COP(reg)->look_behind_not_start.addr = SIZE_INC + len + OPSIZE_LOOK_BEHIND_NOT_END;
1802 
1803       if (node->char_len < 0) {
1804         r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n);
1805         if (r != 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1806       }
1807       else
1808         n = node->char_len;
1809 
1810       COP(reg)->look_behind_not_start.len = n;
1811 
1812       r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
1813       if (r != 0) return r;
1814       r = add_op(reg, OP_LOOK_BEHIND_NOT_END);
1815     }
1816     break;
1817 
1818   default:
1819     return ONIGERR_TYPE_BUG;
1820     break;
1821   }
1822 
1823   return r;
1824 }
1825 
1826 static int
compile_gimmick_node(GimmickNode * node,regex_t * reg)1827 compile_gimmick_node(GimmickNode* node, regex_t* reg)
1828 {
1829   int r;
1830 
1831   switch (node->type) {
1832   case GIMMICK_FAIL:
1833     r = add_op(reg, OP_FAIL);
1834     break;
1835 
1836   case GIMMICK_SAVE:
1837     r = add_op(reg, OP_PUSH_SAVE_VAL);
1838     if (r != 0) return r;
1839     COP(reg)->push_save_val.type = node->detail_type;
1840     COP(reg)->push_save_val.id   = node->id;
1841     break;
1842 
1843   case GIMMICK_UPDATE_VAR:
1844     r = add_op(reg, OP_UPDATE_VAR);
1845     if (r != 0) return r;
1846     COP(reg)->update_var.type = node->detail_type;
1847     COP(reg)->update_var.id   = node->id;
1848     break;
1849 
1850 #ifdef USE_CALLOUT
1851   case GIMMICK_CALLOUT:
1852     switch (node->detail_type) {
1853     case ONIG_CALLOUT_OF_CONTENTS:
1854     case ONIG_CALLOUT_OF_NAME:
1855       {
1856         if (node->detail_type == ONIG_CALLOUT_OF_NAME) {
1857           r = add_op(reg, OP_CALLOUT_NAME);
1858           if (r != 0) return r;
1859           COP(reg)->callout_name.id  = node->id;
1860           COP(reg)->callout_name.num = node->num;
1861         }
1862         else {
1863           r = add_op(reg, OP_CALLOUT_CONTENTS);
1864           if (r != 0) return r;
1865           COP(reg)->callout_contents.num = node->num;
1866         }
1867       }
1868       break;
1869 
1870     default:
1871       r = ONIGERR_TYPE_BUG;
1872       break;
1873     }
1874 #endif
1875   }
1876 
1877   return r;
1878 }
1879 
1880 static int
compile_length_gimmick_node(GimmickNode * node,regex_t * reg)1881 compile_length_gimmick_node(GimmickNode* node, regex_t* reg)
1882 {
1883   int len;
1884 
1885   switch (node->type) {
1886   case GIMMICK_FAIL:
1887     len = OPSIZE_FAIL;
1888     break;
1889 
1890   case GIMMICK_SAVE:
1891     len = OPSIZE_PUSH_SAVE_VAL;
1892     break;
1893 
1894   case GIMMICK_UPDATE_VAR:
1895     len = OPSIZE_UPDATE_VAR;
1896     break;
1897 
1898 #ifdef USE_CALLOUT
1899   case GIMMICK_CALLOUT:
1900     switch (node->detail_type) {
1901     case ONIG_CALLOUT_OF_CONTENTS:
1902       len = OPSIZE_CALLOUT_CONTENTS;
1903       break;
1904     case ONIG_CALLOUT_OF_NAME:
1905       len = OPSIZE_CALLOUT_NAME;
1906       break;
1907 
1908     default:
1909       len = ONIGERR_TYPE_BUG;
1910       break;
1911     }
1912     break;
1913 #endif
1914   }
1915 
1916   return len;
1917 }
1918 
1919 static int
compile_length_tree(Node * node,regex_t * reg)1920 compile_length_tree(Node* node, regex_t* reg)
1921 {
1922   int len, r;
1923 
1924   switch (NODE_TYPE(node)) {
1925   case NODE_LIST:
1926     len = 0;
1927     do {
1928       r = compile_length_tree(NODE_CAR(node), reg);
1929       if (r < 0) return r;
1930       len += r;
1931     } while (IS_NOT_NULL(node = NODE_CDR(node)));
1932     r = len;
1933     break;
1934 
1935   case NODE_ALT:
1936     {
1937       int n;
1938 
1939       n = r = 0;
1940       do {
1941         r += compile_length_tree(NODE_CAR(node), reg);
1942         n++;
1943       } while (IS_NOT_NULL(node = NODE_CDR(node)));
1944       r += (OPSIZE_PUSH + OPSIZE_JUMP) * (n - 1);
1945     }
1946     break;
1947 
1948   case NODE_STRING:
1949     if (NODE_STRING_IS_CRUDE(node))
1950       r = compile_length_string_crude_node(STR_(node), reg);
1951     else
1952       r = compile_length_string_node(node, reg);
1953     break;
1954 
1955   case NODE_CCLASS:
1956     r = compile_length_cclass_node(CCLASS_(node), reg);
1957     break;
1958 
1959   case NODE_CTYPE:
1960     r = SIZE_OPCODE;
1961     break;
1962 
1963   case NODE_BACKREF:
1964     r = OPSIZE_BACKREF;
1965     break;
1966 
1967 #ifdef USE_CALL
1968   case NODE_CALL:
1969     r = OPSIZE_CALL;
1970     break;
1971 #endif
1972 
1973   case NODE_QUANT:
1974     r = compile_length_quantifier_node(QUANT_(node), reg);
1975     break;
1976 
1977   case NODE_BAG:
1978     r = compile_length_bag_node(BAG_(node), reg);
1979     break;
1980 
1981   case NODE_ANCHOR:
1982     r = compile_length_anchor_node(ANCHOR_(node), reg);
1983     break;
1984 
1985   case NODE_GIMMICK:
1986     r = compile_length_gimmick_node(GIMMICK_(node), reg);
1987     break;
1988 
1989   default:
1990     return ONIGERR_TYPE_BUG;
1991     break;
1992   }
1993 
1994   return r;
1995 }
1996 
1997 static int
compile_tree(Node * node,regex_t * reg,ScanEnv * env)1998 compile_tree(Node* node, regex_t* reg, ScanEnv* env)
1999 {
2000   int n, len, pos, r = 0;
2001 
2002   switch (NODE_TYPE(node)) {
2003   case NODE_LIST:
2004     do {
2005       r = compile_tree(NODE_CAR(node), reg, env);
2006     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2007     break;
2008 
2009   case NODE_ALT:
2010     {
2011       Node* x = node;
2012       len = 0;
2013       do {
2014         len += compile_length_tree(NODE_CAR(x), reg);
2015         if (IS_NOT_NULL(NODE_CDR(x))) {
2016           len += OPSIZE_PUSH + OPSIZE_JUMP;
2017         }
2018       } while (IS_NOT_NULL(x = NODE_CDR(x)));
2019       pos = COP_CURR_OFFSET(reg) + 1 + len;  /* goal position */
2020 
2021       do {
2022         len = compile_length_tree(NODE_CAR(node), reg);
2023         if (IS_NOT_NULL(NODE_CDR(node))) {
2024           enum OpCode push = NODE_IS_SUPER(node) ? OP_PUSH_SUPER : OP_PUSH;
2025           r = add_op(reg, push);
2026           if (r != 0) break;
2027           COP(reg)->push.addr = SIZE_INC + len + OPSIZE_JUMP;
2028         }
2029         r = compile_tree(NODE_CAR(node), reg, env);
2030         if (r != 0) break;
2031         if (IS_NOT_NULL(NODE_CDR(node))) {
2032           len = pos - (COP_CURR_OFFSET(reg) + 1);
2033           r = add_op(reg, OP_JUMP);
2034           if (r != 0) break;
2035           COP(reg)->jump.addr = len;
2036         }
2037       } while (IS_NOT_NULL(node = NODE_CDR(node)));
2038     }
2039     break;
2040 
2041   case NODE_STRING:
2042     if (NODE_STRING_IS_CRUDE(node))
2043       r = compile_string_crude_node(STR_(node), reg);
2044     else
2045       r = compile_string_node(node, reg);
2046     break;
2047 
2048   case NODE_CCLASS:
2049     r = compile_cclass_node(CCLASS_(node), reg);
2050     break;
2051 
2052   case NODE_CTYPE:
2053     {
2054       int op;
2055 
2056       switch (CTYPE_(node)->ctype) {
2057       case CTYPE_ANYCHAR:
2058         r = add_op(reg, IS_MULTILINE(CTYPE_OPTION(node, reg)) ?
2059                    OP_ANYCHAR_ML : OP_ANYCHAR);
2060         break;
2061 
2062       case ONIGENC_CTYPE_WORD:
2063         if (CTYPE_(node)->ascii_mode == 0) {
2064           op = CTYPE_(node)->not != 0 ? OP_NO_WORD : OP_WORD;
2065         }
2066         else {
2067           op = CTYPE_(node)->not != 0 ? OP_NO_WORD_ASCII : OP_WORD_ASCII;
2068         }
2069         r = add_op(reg, op);
2070         break;
2071 
2072       default:
2073         return ONIGERR_TYPE_BUG;
2074         break;
2075       }
2076     }
2077     break;
2078 
2079   case NODE_BACKREF:
2080     {
2081       BackRefNode* br = BACKREF_(node);
2082 
2083       if (NODE_IS_CHECKER(node)) {
2084 #ifdef USE_BACKREF_WITH_LEVEL
2085         if (NODE_IS_NEST_LEVEL(node)) {
2086           r = add_op(reg, OP_BACKREF_CHECK_WITH_LEVEL);
2087           if (r != 0) return r;
2088           COP(reg)->backref_general.nest_level = br->nest_level;
2089         }
2090         else
2091 #endif
2092           {
2093             r = add_op(reg, OP_BACKREF_CHECK);
2094             if (r != 0) return r;
2095           }
2096         goto add_bacref_mems;
2097       }
2098       else {
2099 #ifdef USE_BACKREF_WITH_LEVEL
2100         if (NODE_IS_NEST_LEVEL(node)) {
2101           if ((reg->options & ONIG_OPTION_IGNORECASE) != 0)
2102             r = add_op(reg, OP_BACKREF_WITH_LEVEL_IC);
2103           else
2104             r = add_op(reg, OP_BACKREF_WITH_LEVEL);
2105 
2106           if (r != 0) return r;
2107           COP(reg)->backref_general.nest_level = br->nest_level;
2108           goto add_bacref_mems;
2109         }
2110         else
2111 #endif
2112         if (br->back_num == 1) {
2113           n = br->back_static[0];
2114           if (IS_IGNORECASE(reg->options)) {
2115             r = add_op(reg, OP_BACKREF_N_IC);
2116             if (r != 0) return r;
2117             COP(reg)->backref_n.n1 = n;
2118           }
2119           else {
2120             switch (n) {
2121             case 1:  r = add_op(reg, OP_BACKREF1); break;
2122             case 2:  r = add_op(reg, OP_BACKREF2); break;
2123             default:
2124               r = add_op(reg, OP_BACKREF_N);
2125               if (r != 0) return r;
2126               COP(reg)->backref_n.n1 = n;
2127               break;
2128             }
2129           }
2130         }
2131         else {
2132           int num;
2133           int* p;
2134 
2135           r = add_op(reg, IS_IGNORECASE(reg->options) ?
2136                      OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI);
2137           if (r != 0) return r;
2138 
2139         add_bacref_mems:
2140           num = br->back_num;
2141           COP(reg)->backref_general.num = num;
2142           if (num == 1) {
2143             COP(reg)->backref_general.n1 = br->back_static[0];
2144           }
2145           else {
2146             int i, j;
2147             MemNumType* ns;
2148 
2149             ns = xmalloc(sizeof(MemNumType) * num);
2150             CHECK_NULL_RETURN_MEMERR(ns);
2151             COP(reg)->backref_general.ns = ns;
2152             p = BACKREFS_P(br);
2153             for (i = num - 1, j = 0; i >= 0; i--, j++) {
2154               ns[j] = p[i];
2155             }
2156           }
2157         }
2158       }
2159     }
2160     break;
2161 
2162 #ifdef USE_CALL
2163   case NODE_CALL:
2164     r = compile_call(CALL_(node), reg, env);
2165     break;
2166 #endif
2167 
2168   case NODE_QUANT:
2169     r = compile_quantifier_node(QUANT_(node), reg, env);
2170     break;
2171 
2172   case NODE_BAG:
2173     r = compile_bag_node(BAG_(node), reg, env);
2174     break;
2175 
2176   case NODE_ANCHOR:
2177     r = compile_anchor_node(ANCHOR_(node), reg, env);
2178     break;
2179 
2180   case NODE_GIMMICK:
2181     r = compile_gimmick_node(GIMMICK_(node), reg);
2182     break;
2183 
2184   default:
2185 #ifdef ONIG_DEBUG
2186     fprintf(stderr, "compile_tree: undefined node type %d\n", NODE_TYPE(node));
2187 #endif
2188     break;
2189   }
2190 
2191   return r;
2192 }
2193 
2194 static int
noname_disable_map(Node ** plink,GroupNumRemap * map,int * counter)2195 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
2196 {
2197   int r = 0;
2198   Node* node = *plink;
2199 
2200   switch (NODE_TYPE(node)) {
2201   case NODE_LIST:
2202   case NODE_ALT:
2203     do {
2204       r = noname_disable_map(&(NODE_CAR(node)), map, counter);
2205     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2206     break;
2207 
2208   case NODE_QUANT:
2209     {
2210       Node** ptarget = &(NODE_BODY(node));
2211       Node*  old = *ptarget;
2212       r = noname_disable_map(ptarget, map, counter);
2213       if (r != 0) return r;
2214       if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) {
2215         r = onig_reduce_nested_quantifier(node);
2216       }
2217     }
2218     break;
2219 
2220   case NODE_BAG:
2221     {
2222       BagNode* en = BAG_(node);
2223       if (en->type == BAG_MEMORY) {
2224         if (NODE_IS_NAMED_GROUP(node)) {
2225           (*counter)++;
2226           map[en->m.regnum].new_val = *counter;
2227           en->m.regnum = *counter;
2228           r = noname_disable_map(&(NODE_BODY(node)), map, counter);
2229         }
2230         else {
2231           *plink = NODE_BODY(node);
2232           NODE_BODY(node) = NULL_NODE;
2233           onig_node_free(node);
2234           r = noname_disable_map(plink, map, counter);
2235         }
2236       }
2237       else if (en->type == BAG_IF_ELSE) {
2238         r = noname_disable_map(&(NODE_BAG_BODY(en)), map, counter);
2239         if (r != 0) return r;
2240         if (IS_NOT_NULL(en->te.Then)) {
2241           r = noname_disable_map(&(en->te.Then), map, counter);
2242           if (r != 0) return r;
2243         }
2244         if (IS_NOT_NULL(en->te.Else)) {
2245           r = noname_disable_map(&(en->te.Else), map, counter);
2246           if (r != 0) return r;
2247         }
2248       }
2249       else
2250         r = noname_disable_map(&(NODE_BODY(node)), map, counter);
2251     }
2252     break;
2253 
2254   case NODE_ANCHOR:
2255     if (IS_NOT_NULL(NODE_BODY(node)))
2256       r = noname_disable_map(&(NODE_BODY(node)), map, counter);
2257     break;
2258 
2259   default:
2260     break;
2261   }
2262 
2263   return r;
2264 }
2265 
2266 static int
renumber_node_backref(Node * node,GroupNumRemap * map)2267 renumber_node_backref(Node* node, GroupNumRemap* map)
2268 {
2269   int i, pos, n, old_num;
2270   int *backs;
2271   BackRefNode* bn = BACKREF_(node);
2272 
2273   if (! NODE_IS_BY_NAME(node))
2274     return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2275 
2276   old_num = bn->back_num;
2277   if (IS_NULL(bn->back_dynamic))
2278     backs = bn->back_static;
2279   else
2280     backs = bn->back_dynamic;
2281 
2282   for (i = 0, pos = 0; i < old_num; i++) {
2283     n = map[backs[i]].new_val;
2284     if (n > 0) {
2285       backs[pos] = n;
2286       pos++;
2287     }
2288   }
2289 
2290   bn->back_num = pos;
2291   return 0;
2292 }
2293 
2294 static int
renumber_by_map(Node * node,GroupNumRemap * map)2295 renumber_by_map(Node* node, GroupNumRemap* map)
2296 {
2297   int r = 0;
2298 
2299   switch (NODE_TYPE(node)) {
2300   case NODE_LIST:
2301   case NODE_ALT:
2302     do {
2303       r = renumber_by_map(NODE_CAR(node), map);
2304     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2305     break;
2306 
2307   case NODE_QUANT:
2308     r = renumber_by_map(NODE_BODY(node), map);
2309     break;
2310 
2311   case NODE_BAG:
2312     {
2313       BagNode* en = BAG_(node);
2314 
2315       r = renumber_by_map(NODE_BODY(node), map);
2316       if (r != 0) return r;
2317 
2318       if (en->type == BAG_IF_ELSE) {
2319         if (IS_NOT_NULL(en->te.Then)) {
2320           r = renumber_by_map(en->te.Then, map);
2321           if (r != 0) return r;
2322         }
2323         if (IS_NOT_NULL(en->te.Else)) {
2324           r = renumber_by_map(en->te.Else, map);
2325           if (r != 0) return r;
2326         }
2327       }
2328     }
2329     break;
2330 
2331   case NODE_BACKREF:
2332     r = renumber_node_backref(node, map);
2333     break;
2334 
2335   case NODE_ANCHOR:
2336     if (IS_NOT_NULL(NODE_BODY(node)))
2337       r = renumber_by_map(NODE_BODY(node), map);
2338     break;
2339 
2340   default:
2341     break;
2342   }
2343 
2344   return r;
2345 }
2346 
2347 static int
numbered_ref_check(Node * node)2348 numbered_ref_check(Node* node)
2349 {
2350   int r = 0;
2351 
2352   switch (NODE_TYPE(node)) {
2353   case NODE_LIST:
2354   case NODE_ALT:
2355     do {
2356       r = numbered_ref_check(NODE_CAR(node));
2357     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2358     break;
2359 
2360   case NODE_ANCHOR:
2361     if (IS_NULL(NODE_BODY(node)))
2362       break;
2363     /* fall */
2364   case NODE_QUANT:
2365     r = numbered_ref_check(NODE_BODY(node));
2366     break;
2367 
2368   case NODE_BAG:
2369     {
2370       BagNode* en = BAG_(node);
2371 
2372       r = numbered_ref_check(NODE_BODY(node));
2373       if (r != 0) return r;
2374 
2375       if (en->type == BAG_IF_ELSE) {
2376         if (IS_NOT_NULL(en->te.Then)) {
2377           r = numbered_ref_check(en->te.Then);
2378           if (r != 0) return r;
2379         }
2380         if (IS_NOT_NULL(en->te.Else)) {
2381           r = numbered_ref_check(en->te.Else);
2382           if (r != 0) return r;
2383         }
2384       }
2385     }
2386 
2387     break;
2388 
2389   case NODE_BACKREF:
2390     if (! NODE_IS_BY_NAME(node))
2391       return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2392     break;
2393 
2394   default:
2395     break;
2396   }
2397 
2398   return r;
2399 }
2400 
2401 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)2402 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
2403 {
2404   int r, i, pos, counter;
2405   MemStatusType loc;
2406   GroupNumRemap* map;
2407 
2408   map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
2409   CHECK_NULL_RETURN_MEMERR(map);
2410   for (i = 1; i <= env->num_mem; i++) {
2411     map[i].new_val = 0;
2412   }
2413   counter = 0;
2414   r = noname_disable_map(root, map, &counter);
2415   if (r != 0) return r;
2416 
2417   r = renumber_by_map(*root, map);
2418   if (r != 0) return r;
2419 
2420   for (i = 1, pos = 1; i <= env->num_mem; i++) {
2421     if (map[i].new_val > 0) {
2422       SCANENV_MEMENV(env)[pos] = SCANENV_MEMENV(env)[i];
2423       pos++;
2424     }
2425   }
2426 
2427   loc = env->cap_history;
2428   MEM_STATUS_CLEAR(env->cap_history);
2429   for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
2430     if (MEM_STATUS_AT(loc, i)) {
2431       MEM_STATUS_ON_SIMPLE(env->cap_history, map[i].new_val);
2432     }
2433   }
2434 
2435   env->num_mem = env->num_named;
2436   reg->num_mem = env->num_named;
2437 
2438   return onig_renumber_name_table(reg, map);
2439 }
2440 
2441 #ifdef USE_CALL
2442 static int
fix_unset_addr_list(UnsetAddrList * uslist,regex_t * reg)2443 fix_unset_addr_list(UnsetAddrList* uslist, regex_t* reg)
2444 {
2445   int i, offset;
2446   BagNode* en;
2447   AbsAddrType addr;
2448   AbsAddrType* paddr;
2449 
2450   for (i = 0; i < uslist->num; i++) {
2451     if (! NODE_IS_ADDR_FIXED(uslist->us[i].target))
2452       return ONIGERR_PARSER_BUG;
2453 
2454     en = BAG_(uslist->us[i].target);
2455     addr   = en->m.called_addr;
2456     offset = uslist->us[i].offset;
2457 
2458     paddr = (AbsAddrType* )((char* )reg->ops + offset);
2459     *paddr = addr;
2460   }
2461   return 0;
2462 }
2463 #endif
2464 
2465 
2466 #define GET_CHAR_LEN_VARLEN           -1
2467 #define GET_CHAR_LEN_TOP_ALT_VARLEN   -2
2468 
2469 /* fixed size pattern node only */
2470 static int
get_char_len_node1(Node * node,regex_t * reg,int * len,int level)2471 get_char_len_node1(Node* node, regex_t* reg, int* len, int level)
2472 {
2473   int tlen;
2474   int r = 0;
2475 
2476   level++;
2477   *len = 0;
2478   switch (NODE_TYPE(node)) {
2479   case NODE_LIST:
2480     do {
2481       r = get_char_len_node1(NODE_CAR(node), reg, &tlen, level);
2482       if (r == 0)
2483         *len = distance_add(*len, tlen);
2484     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2485     break;
2486 
2487   case NODE_ALT:
2488     {
2489       int tlen2;
2490       int varlen = 0;
2491 
2492       r = get_char_len_node1(NODE_CAR(node), reg, &tlen, level);
2493       while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))) {
2494         r = get_char_len_node1(NODE_CAR(node), reg, &tlen2, level);
2495         if (r == 0) {
2496           if (tlen != tlen2)
2497             varlen = 1;
2498         }
2499       }
2500       if (r == 0) {
2501         if (varlen != 0) {
2502           if (level == 1)
2503             r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2504           else
2505             r = GET_CHAR_LEN_VARLEN;
2506         }
2507         else
2508           *len = tlen;
2509       }
2510     }
2511     break;
2512 
2513   case NODE_STRING:
2514     {
2515       StrNode* sn = STR_(node);
2516       UChar *s = sn->s;
2517 
2518       while (s < sn->end) {
2519         s += enclen(reg->enc, s);
2520         (*len)++;
2521       }
2522     }
2523     break;
2524 
2525   case NODE_QUANT:
2526     {
2527       QuantNode* qn = QUANT_(node);
2528 
2529       if (qn->lower == qn->upper) {
2530         if (qn->upper == 0) {
2531           *len = 0;
2532         }
2533         else {
2534           r = get_char_len_node1(NODE_BODY(node), reg, &tlen, level);
2535           if (r == 0)
2536             *len = distance_multiply(tlen, qn->lower);
2537         }
2538       }
2539       else
2540         r = GET_CHAR_LEN_VARLEN;
2541     }
2542     break;
2543 
2544 #ifdef USE_CALL
2545   case NODE_CALL:
2546     if (! NODE_IS_RECURSION(node))
2547       r = get_char_len_node1(NODE_BODY(node), reg, len, level);
2548     else
2549       r = GET_CHAR_LEN_VARLEN;
2550     break;
2551 #endif
2552 
2553   case NODE_CTYPE:
2554   case NODE_CCLASS:
2555     *len = 1;
2556     break;
2557 
2558   case NODE_BAG:
2559     {
2560       BagNode* en = BAG_(node);
2561 
2562       switch (en->type) {
2563       case BAG_MEMORY:
2564 #ifdef USE_CALL
2565         if (NODE_IS_CLEN_FIXED(node))
2566           *len = en->char_len;
2567         else {
2568           r = get_char_len_node1(NODE_BODY(node), reg, len, level);
2569           if (r == 0) {
2570             en->char_len = *len;
2571             NODE_STATUS_ADD(node, CLEN_FIXED);
2572           }
2573         }
2574         break;
2575 #endif
2576       case BAG_OPTION:
2577       case BAG_STOP_BACKTRACK:
2578         r = get_char_len_node1(NODE_BODY(node), reg, len, level);
2579         break;
2580       case BAG_IF_ELSE:
2581         {
2582           int clen, elen;
2583 
2584           r = get_char_len_node1(NODE_BODY(node), reg, &clen, level);
2585           if (r == 0) {
2586             if (IS_NOT_NULL(en->te.Then)) {
2587               r = get_char_len_node1(en->te.Then, reg, &tlen, level);
2588               if (r != 0) break;
2589             }
2590             else tlen = 0;
2591             if (IS_NOT_NULL(en->te.Else)) {
2592               r = get_char_len_node1(en->te.Else, reg, &elen, level);
2593               if (r != 0) break;
2594             }
2595             else elen = 0;
2596 
2597             if (clen + tlen != elen) {
2598               r = GET_CHAR_LEN_VARLEN;
2599             }
2600             else {
2601               *len = elen;
2602             }
2603           }
2604         }
2605         break;
2606       }
2607     }
2608     break;
2609 
2610   case NODE_ANCHOR:
2611   case NODE_GIMMICK:
2612     break;
2613 
2614   case NODE_BACKREF:
2615     if (NODE_IS_CHECKER(node))
2616       break;
2617     /* fall */
2618   default:
2619     r = GET_CHAR_LEN_VARLEN;
2620     break;
2621   }
2622 
2623   return r;
2624 }
2625 
2626 static int
get_char_len_node(Node * node,regex_t * reg,int * len)2627 get_char_len_node(Node* node, regex_t* reg, int* len)
2628 {
2629   return get_char_len_node1(node, reg, len, 0);
2630 }
2631 
2632 /* x is not included y ==>  1 : 0 */
2633 static int
is_exclusive(Node * x,Node * y,regex_t * reg)2634 is_exclusive(Node* x, Node* y, regex_t* reg)
2635 {
2636   int i, len;
2637   OnigCodePoint code;
2638   UChar *p;
2639   NodeType ytype;
2640 
2641  retry:
2642   ytype = NODE_TYPE(y);
2643   switch (NODE_TYPE(x)) {
2644   case NODE_CTYPE:
2645     {
2646       if (CTYPE_(x)->ctype == CTYPE_ANYCHAR ||
2647           CTYPE_(y)->ctype == CTYPE_ANYCHAR)
2648         break;
2649 
2650       switch (ytype) {
2651       case NODE_CTYPE:
2652         if (CTYPE_(y)->ctype == CTYPE_(x)->ctype &&
2653             CTYPE_(y)->not   != CTYPE_(x)->not &&
2654             CTYPE_(y)->ascii_mode == CTYPE_(x)->ascii_mode)
2655           return 1;
2656         else
2657           return 0;
2658         break;
2659 
2660       case NODE_CCLASS:
2661       swap:
2662         {
2663           Node* tmp;
2664           tmp = x; x = y; y = tmp;
2665           goto retry;
2666         }
2667         break;
2668 
2669       case NODE_STRING:
2670         goto swap;
2671         break;
2672 
2673       default:
2674         break;
2675       }
2676     }
2677     break;
2678 
2679   case NODE_CCLASS:
2680     {
2681       int range;
2682       CClassNode* xc = CCLASS_(x);
2683 
2684       switch (ytype) {
2685       case NODE_CTYPE:
2686         switch (CTYPE_(y)->ctype) {
2687         case CTYPE_ANYCHAR:
2688           return 0;
2689           break;
2690 
2691         case ONIGENC_CTYPE_WORD:
2692           if (CTYPE_(y)->not == 0) {
2693             if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2694               range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
2695               for (i = 0; i < range; i++) {
2696                 if (BITSET_AT(xc->bs, i)) {
2697                   if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
2698                 }
2699               }
2700               return 1;
2701             }
2702             return 0;
2703           }
2704           else {
2705             if (IS_NOT_NULL(xc->mbuf)) return 0;
2706             if (IS_NCCLASS_NOT(xc)) return 0;
2707 
2708             range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
2709             for (i = 0; i < range; i++) {
2710               if (! ONIGENC_IS_CODE_WORD(reg->enc, i)) {
2711                 if (BITSET_AT(xc->bs, i))
2712                   return 0;
2713               }
2714             }
2715             for (i = range; i < SINGLE_BYTE_SIZE; i++) {
2716               if (BITSET_AT(xc->bs, i)) return 0;
2717             }
2718             return 1;
2719           }
2720           break;
2721 
2722         default:
2723           break;
2724         }
2725         break;
2726 
2727       case NODE_CCLASS:
2728         {
2729           int v;
2730           CClassNode* yc = CCLASS_(y);
2731 
2732           for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2733             v = BITSET_AT(xc->bs, i);
2734             if ((v != 0 && !IS_NCCLASS_NOT(xc)) || (v == 0 && IS_NCCLASS_NOT(xc))) {
2735               v = BITSET_AT(yc->bs, i);
2736               if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2737                   (v == 0 && IS_NCCLASS_NOT(yc)))
2738                 return 0;
2739             }
2740           }
2741           if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2742               (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2743             return 1;
2744           return 0;
2745         }
2746         break;
2747 
2748       case NODE_STRING:
2749         goto swap;
2750         break;
2751 
2752       default:
2753         break;
2754       }
2755     }
2756     break;
2757 
2758   case NODE_STRING:
2759     {
2760       StrNode* xs = STR_(x);
2761 
2762       if (NODE_STRING_LEN(x) == 0)
2763         break;
2764 
2765       switch (ytype) {
2766       case NODE_CTYPE:
2767         switch (CTYPE_(y)->ctype) {
2768         case CTYPE_ANYCHAR:
2769           break;
2770 
2771         case ONIGENC_CTYPE_WORD:
2772           if (CTYPE_(y)->ascii_mode == 0) {
2773             if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2774               return CTYPE_(y)->not;
2775             else
2776               return !(CTYPE_(y)->not);
2777           }
2778           else {
2779             if (ONIGENC_IS_MBC_WORD_ASCII(reg->enc, xs->s, xs->end))
2780               return CTYPE_(y)->not;
2781             else
2782               return !(CTYPE_(y)->not);
2783           }
2784           break;
2785         default:
2786           break;
2787         }
2788         break;
2789 
2790       case NODE_CCLASS:
2791         {
2792           CClassNode* cc = CCLASS_(y);
2793 
2794           code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2795                                      xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2796           return onig_is_code_in_cc(reg->enc, code, cc) == 0;
2797         }
2798         break;
2799 
2800       case NODE_STRING:
2801         {
2802           UChar *q;
2803           StrNode* ys = STR_(y);
2804 
2805           len = NODE_STRING_LEN(x);
2806           if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y);
2807           if (NODE_STRING_IS_CASE_FOLD_MATCH(x) || NODE_STRING_IS_CASE_FOLD_MATCH(y)) {
2808             /* tiny version */
2809             return 0;
2810           }
2811           else {
2812             for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2813               if (*p != *q) return 1;
2814             }
2815           }
2816         }
2817         break;
2818 
2819       default:
2820         break;
2821       }
2822     }
2823     break;
2824 
2825   default:
2826     break;
2827   }
2828 
2829   return 0;
2830 }
2831 
2832 static Node*
get_head_value_node(Node * node,int exact,regex_t * reg)2833 get_head_value_node(Node* node, int exact, regex_t* reg)
2834 {
2835   Node* n = NULL_NODE;
2836 
2837   switch (NODE_TYPE(node)) {
2838   case NODE_BACKREF:
2839   case NODE_ALT:
2840 #ifdef USE_CALL
2841   case NODE_CALL:
2842 #endif
2843     break;
2844 
2845   case NODE_CTYPE:
2846     if (CTYPE_(node)->ctype == CTYPE_ANYCHAR)
2847       break;
2848     /* fall */
2849   case NODE_CCLASS:
2850     if (exact == 0) {
2851       n = node;
2852     }
2853     break;
2854 
2855   case NODE_LIST:
2856     n = get_head_value_node(NODE_CAR(node), exact, reg);
2857     break;
2858 
2859   case NODE_STRING:
2860     {
2861       StrNode* sn = STR_(node);
2862 
2863       if (sn->end <= sn->s)
2864         break;
2865 
2866       if (exact == 0 ||
2867           ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_CRUDE(node)) {
2868         n = node;
2869       }
2870     }
2871     break;
2872 
2873   case NODE_QUANT:
2874     {
2875       QuantNode* qn = QUANT_(node);
2876       if (qn->lower > 0) {
2877         if (IS_NOT_NULL(qn->head_exact))
2878           n = qn->head_exact;
2879         else
2880           n = get_head_value_node(NODE_BODY(node), exact, reg);
2881       }
2882     }
2883     break;
2884 
2885   case NODE_BAG:
2886     {
2887       BagNode* en = BAG_(node);
2888       switch (en->type) {
2889       case BAG_OPTION:
2890         {
2891           OnigOptionType options = reg->options;
2892 
2893           reg->options = BAG_(node)->o.options;
2894           n = get_head_value_node(NODE_BODY(node), exact, reg);
2895           reg->options = options;
2896         }
2897         break;
2898 
2899       case BAG_MEMORY:
2900       case BAG_STOP_BACKTRACK:
2901       case BAG_IF_ELSE:
2902         n = get_head_value_node(NODE_BODY(node), exact, reg);
2903         break;
2904       }
2905     }
2906     break;
2907 
2908   case NODE_ANCHOR:
2909     if (ANCHOR_(node)->type == ANCR_PREC_READ)
2910       n = get_head_value_node(NODE_BODY(node), exact, reg);
2911     break;
2912 
2913   case NODE_GIMMICK:
2914   default:
2915     break;
2916   }
2917 
2918   return n;
2919 }
2920 
2921 static int
check_type_tree(Node * node,int type_mask,int bag_mask,int anchor_mask)2922 check_type_tree(Node* node, int type_mask, int bag_mask, int anchor_mask)
2923 {
2924   NodeType type;
2925   int r = 0;
2926 
2927   type = NODE_TYPE(node);
2928   if ((NODE_TYPE2BIT(type) & type_mask) == 0)
2929     return 1;
2930 
2931   switch (type) {
2932   case NODE_LIST:
2933   case NODE_ALT:
2934     do {
2935       r = check_type_tree(NODE_CAR(node), type_mask, bag_mask, anchor_mask);
2936     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2937     break;
2938 
2939   case NODE_QUANT:
2940     r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask);
2941     break;
2942 
2943   case NODE_BAG:
2944     {
2945       BagNode* en = BAG_(node);
2946       if (((1<<en->type) & bag_mask) == 0)
2947         return 1;
2948 
2949       r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask);
2950       if (r == 0 && en->type == BAG_IF_ELSE) {
2951         if (IS_NOT_NULL(en->te.Then)) {
2952           r = check_type_tree(en->te.Then, type_mask, bag_mask, anchor_mask);
2953           if (r != 0) break;
2954         }
2955         if (IS_NOT_NULL(en->te.Else)) {
2956           r = check_type_tree(en->te.Else, type_mask, bag_mask, anchor_mask);
2957         }
2958       }
2959     }
2960     break;
2961 
2962   case NODE_ANCHOR:
2963     type = ANCHOR_(node)->type;
2964     if ((type & anchor_mask) == 0)
2965       return 1;
2966 
2967     if (IS_NOT_NULL(NODE_BODY(node)))
2968       r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask);
2969     break;
2970 
2971   case NODE_GIMMICK:
2972   default:
2973     break;
2974   }
2975   return r;
2976 }
2977 
2978 static OnigLen
tree_min_len(Node * node,ScanEnv * env)2979 tree_min_len(Node* node, ScanEnv* env)
2980 {
2981   OnigLen len;
2982   OnigLen tmin;
2983 
2984   len = 0;
2985   switch (NODE_TYPE(node)) {
2986   case NODE_BACKREF:
2987     if (! NODE_IS_CHECKER(node)) {
2988       int i;
2989       int* backs;
2990       MemEnv* mem_env = SCANENV_MEMENV(env);
2991       BackRefNode* br = BACKREF_(node);
2992       if (NODE_IS_RECURSION(node)) break;
2993 
2994       backs = BACKREFS_P(br);
2995       len = tree_min_len(mem_env[backs[0]].mem_node, env);
2996       for (i = 1; i < br->back_num; i++) {
2997         tmin = tree_min_len(mem_env[backs[i]].mem_node, env);
2998         if (len > tmin) len = tmin;
2999       }
3000     }
3001     break;
3002 
3003 #ifdef USE_CALL
3004   case NODE_CALL:
3005     {
3006       Node* t = NODE_BODY(node);
3007       if (NODE_IS_RECURSION(node)) {
3008         if (NODE_IS_MIN_FIXED(t))
3009           len = BAG_(t)->min_len;
3010       }
3011       else
3012         len = tree_min_len(t, env);
3013     }
3014     break;
3015 #endif
3016 
3017   case NODE_LIST:
3018     do {
3019       tmin = tree_min_len(NODE_CAR(node), env);
3020       len = distance_add(len, tmin);
3021     } while (IS_NOT_NULL(node = NODE_CDR(node)));
3022     break;
3023 
3024   case NODE_ALT:
3025     {
3026       Node *x, *y;
3027       y = node;
3028       do {
3029         x = NODE_CAR(y);
3030         tmin = tree_min_len(x, env);
3031         if (y == node) len = tmin;
3032         else if (len > tmin) len = tmin;
3033       } while (IS_NOT_NULL(y = NODE_CDR(y)));
3034     }
3035     break;
3036 
3037   case NODE_STRING:
3038     {
3039       StrNode* sn = STR_(node);
3040       len = (int )(sn->end - sn->s);
3041     }
3042     break;
3043 
3044   case NODE_CTYPE:
3045   case NODE_CCLASS:
3046     len = ONIGENC_MBC_MINLEN(env->enc);
3047     break;
3048 
3049   case NODE_QUANT:
3050     {
3051       QuantNode* qn = QUANT_(node);
3052 
3053       if (qn->lower > 0) {
3054         len = tree_min_len(NODE_BODY(node), env);
3055         len = distance_multiply(len, qn->lower);
3056       }
3057     }
3058     break;
3059 
3060   case NODE_BAG:
3061     {
3062       BagNode* en = BAG_(node);
3063       switch (en->type) {
3064       case BAG_MEMORY:
3065         if (NODE_IS_MIN_FIXED(node))
3066           len = en->min_len;
3067         else {
3068           if (NODE_IS_MARK1(node))
3069             len = 0;  /* recursive */
3070           else {
3071             NODE_STATUS_ADD(node, MARK1);
3072             len = tree_min_len(NODE_BODY(node), env);
3073             NODE_STATUS_REMOVE(node, MARK1);
3074 
3075             en->min_len = len;
3076             NODE_STATUS_ADD(node, MIN_FIXED);
3077           }
3078         }
3079         break;
3080 
3081       case BAG_OPTION:
3082       case BAG_STOP_BACKTRACK:
3083         len = tree_min_len(NODE_BODY(node), env);
3084         break;
3085       case BAG_IF_ELSE:
3086         {
3087           OnigLen elen;
3088 
3089           len = tree_min_len(NODE_BODY(node), env);
3090           if (IS_NOT_NULL(en->te.Then))
3091             len += tree_min_len(en->te.Then, env);
3092           if (IS_NOT_NULL(en->te.Else))
3093             elen = tree_min_len(en->te.Else, env);
3094           else elen = 0;
3095 
3096           if (elen < len) len = elen;
3097         }
3098         break;
3099       }
3100     }
3101     break;
3102 
3103   case NODE_GIMMICK:
3104     {
3105       GimmickNode* g = GIMMICK_(node);
3106       if (g->type == GIMMICK_FAIL) {
3107         len = INFINITE_LEN;
3108         break;
3109       }
3110     }
3111     /* fall */
3112   case NODE_ANCHOR:
3113   default:
3114     break;
3115   }
3116 
3117   return len;
3118 }
3119 
3120 static OnigLen
tree_max_len(Node * node,ScanEnv * env)3121 tree_max_len(Node* node, ScanEnv* env)
3122 {
3123   OnigLen len;
3124   OnigLen tmax;
3125 
3126   len = 0;
3127   switch (NODE_TYPE(node)) {
3128   case NODE_LIST:
3129     do {
3130       tmax = tree_max_len(NODE_CAR(node), env);
3131       len = distance_add(len, tmax);
3132     } while (IS_NOT_NULL(node = NODE_CDR(node)));
3133     break;
3134 
3135   case NODE_ALT:
3136     do {
3137       tmax = tree_max_len(NODE_CAR(node), env);
3138       if (len < tmax) len = tmax;
3139     } while (IS_NOT_NULL(node = NODE_CDR(node)));
3140     break;
3141 
3142   case NODE_STRING:
3143     {
3144       StrNode* sn = STR_(node);
3145       len = (OnigLen )(sn->end - sn->s);
3146     }
3147     break;
3148 
3149   case NODE_CTYPE:
3150   case NODE_CCLASS:
3151     len = ONIGENC_MBC_MAXLEN_DIST(env->enc);
3152     break;
3153 
3154   case NODE_BACKREF:
3155     if (! NODE_IS_CHECKER(node)) {
3156       int i;
3157       int* backs;
3158       MemEnv* mem_env = SCANENV_MEMENV(env);
3159       BackRefNode* br = BACKREF_(node);
3160       if (NODE_IS_RECURSION(node)) {
3161         len = INFINITE_LEN;
3162         break;
3163       }
3164       backs = BACKREFS_P(br);
3165       for (i = 0; i < br->back_num; i++) {
3166         tmax = tree_max_len(mem_env[backs[i]].mem_node, env);
3167         if (len < tmax) len = tmax;
3168       }
3169     }
3170     break;
3171 
3172 #ifdef USE_CALL
3173   case NODE_CALL:
3174     if (! NODE_IS_RECURSION(node))
3175       len = tree_max_len(NODE_BODY(node), env);
3176     else
3177       len = INFINITE_LEN;
3178     break;
3179 #endif
3180 
3181   case NODE_QUANT:
3182     {
3183       QuantNode* qn = QUANT_(node);
3184 
3185       if (qn->upper != 0) {
3186         len = tree_max_len(NODE_BODY(node), env);
3187         if (len != 0) {
3188           if (! IS_INFINITE_REPEAT(qn->upper))
3189             len = distance_multiply(len, qn->upper);
3190           else
3191             len = INFINITE_LEN;
3192         }
3193       }
3194     }
3195     break;
3196 
3197   case NODE_BAG:
3198     {
3199       BagNode* en = BAG_(node);
3200       switch (en->type) {
3201       case BAG_MEMORY:
3202         if (NODE_IS_MAX_FIXED(node))
3203           len = en->max_len;
3204         else {
3205           if (NODE_IS_MARK1(node))
3206             len = INFINITE_LEN;
3207           else {
3208             NODE_STATUS_ADD(node, MARK1);
3209             len = tree_max_len(NODE_BODY(node), env);
3210             NODE_STATUS_REMOVE(node, MARK1);
3211 
3212             en->max_len = len;
3213             NODE_STATUS_ADD(node, MAX_FIXED);
3214           }
3215         }
3216         break;
3217 
3218       case BAG_OPTION:
3219       case BAG_STOP_BACKTRACK:
3220         len = tree_max_len(NODE_BODY(node), env);
3221         break;
3222       case BAG_IF_ELSE:
3223         {
3224           OnigLen tlen, elen;
3225 
3226           len = tree_max_len(NODE_BODY(node), env);
3227           if (IS_NOT_NULL(en->te.Then)) {
3228             tlen = tree_max_len(en->te.Then, env);
3229             len = distance_add(len, tlen);
3230           }
3231           if (IS_NOT_NULL(en->te.Else))
3232             elen = tree_max_len(en->te.Else, env);
3233           else elen = 0;
3234 
3235           if (elen > len) len = elen;
3236         }
3237         break;
3238       }
3239     }
3240     break;
3241 
3242   case NODE_ANCHOR:
3243   case NODE_GIMMICK:
3244   default:
3245     break;
3246   }
3247 
3248   return len;
3249 }
3250 
3251 static int
check_backrefs(Node * node,ScanEnv * env)3252 check_backrefs(Node* node, ScanEnv* env)
3253 {
3254   int r;
3255 
3256   switch (NODE_TYPE(node)) {
3257   case NODE_LIST:
3258   case NODE_ALT:
3259     do {
3260       r = check_backrefs(NODE_CAR(node), env);
3261     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3262     break;
3263 
3264   case NODE_ANCHOR:
3265     if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3266       r = 0;
3267       break;
3268     }
3269     /* fall */
3270   case NODE_QUANT:
3271     r = check_backrefs(NODE_BODY(node), env);
3272     break;
3273 
3274   case NODE_BAG:
3275     r = check_backrefs(NODE_BODY(node), env);
3276     {
3277       BagNode* en = BAG_(node);
3278 
3279       if (en->type == BAG_IF_ELSE) {
3280         if (r != 0) return r;
3281         if (IS_NOT_NULL(en->te.Then)) {
3282           r = check_backrefs(en->te.Then, env);
3283           if (r != 0) return r;
3284         }
3285         if (IS_NOT_NULL(en->te.Else)) {
3286           r = check_backrefs(en->te.Else, env);
3287         }
3288       }
3289     }
3290     break;
3291 
3292   case NODE_BACKREF:
3293     {
3294       int i;
3295       BackRefNode* br = BACKREF_(node);
3296       int* backs = BACKREFS_P(br);
3297       MemEnv* mem_env = SCANENV_MEMENV(env);
3298 
3299       for (i = 0; i < br->back_num; i++) {
3300         if (backs[i] > env->num_mem)
3301           return ONIGERR_INVALID_BACKREF;
3302 
3303         NODE_STATUS_ADD(mem_env[backs[i]].mem_node, BACKREF);
3304       }
3305       r = 0;
3306     }
3307     break;
3308 
3309   default:
3310     r = 0;
3311     break;
3312   }
3313 
3314   return r;
3315 }
3316 
3317 static int
set_empty_repeat_node_trav(Node * node,Node * empty,ScanEnv * env)3318 set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env)
3319 {
3320   int r;
3321 
3322   switch (NODE_TYPE(node)) {
3323   case NODE_LIST:
3324   case NODE_ALT:
3325     do {
3326       r = set_empty_repeat_node_trav(NODE_CAR(node), empty, env);
3327     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3328     break;
3329 
3330   case NODE_ANCHOR:
3331     {
3332       AnchorNode* an = ANCHOR_(node);
3333 
3334       if (! ANCHOR_HAS_BODY(an)) {
3335         r = 0;
3336         break;
3337       }
3338 
3339       switch (an->type) {
3340       case ANCR_PREC_READ:
3341       case ANCR_LOOK_BEHIND:
3342         empty = NULL_NODE;
3343         break;
3344       default:
3345         break;
3346       }
3347       r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3348     }
3349     break;
3350 
3351   case NODE_QUANT:
3352     {
3353       QuantNode* qn = QUANT_(node);
3354 
3355       if (qn->emptiness != BODY_IS_NOT_EMPTY) empty = node;
3356       r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3357     }
3358     break;
3359 
3360   case NODE_BAG:
3361     if (IS_NOT_NULL(NODE_BODY(node))) {
3362       r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3363       if (r != 0) return r;
3364     }
3365     {
3366       BagNode* en = BAG_(node);
3367 
3368       if (en->type == BAG_MEMORY) {
3369         if (NODE_IS_BACKREF(node)) {
3370           if (IS_NOT_NULL(empty))
3371             SCANENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty;
3372         }
3373       }
3374       else if (en->type == BAG_IF_ELSE) {
3375         if (IS_NOT_NULL(en->te.Then)) {
3376           r = set_empty_repeat_node_trav(en->te.Then, empty, env);
3377           if (r != 0) return r;
3378         }
3379         if (IS_NOT_NULL(en->te.Else)) {
3380           r = set_empty_repeat_node_trav(en->te.Else, empty, env);
3381         }
3382       }
3383     }
3384     break;
3385 
3386   default:
3387     r = 0;
3388     break;
3389   }
3390 
3391   return r;
3392 }
3393 
3394 static int
is_ancestor_node(Node * node,Node * me)3395 is_ancestor_node(Node* node, Node* me)
3396 {
3397   Node* parent;
3398 
3399   while ((parent = NODE_PARENT(me)) != NULL_NODE) {
3400     if (parent == node) return 1;
3401     me = parent;
3402   }
3403   return 0;
3404 }
3405 
3406 static void
set_empty_status_check_trav(Node * node,ScanEnv * env)3407 set_empty_status_check_trav(Node* node, ScanEnv* env)
3408 {
3409   switch (NODE_TYPE(node)) {
3410   case NODE_LIST:
3411   case NODE_ALT:
3412     do {
3413       set_empty_status_check_trav(NODE_CAR(node), env);
3414     } while (IS_NOT_NULL(node = NODE_CDR(node)));
3415     break;
3416 
3417   case NODE_ANCHOR:
3418     {
3419       AnchorNode* an = ANCHOR_(node);
3420 
3421       if (! ANCHOR_HAS_BODY(an)) break;
3422       set_empty_status_check_trav(NODE_BODY(node), env);
3423     }
3424     break;
3425 
3426   case NODE_QUANT:
3427     set_empty_status_check_trav(NODE_BODY(node), env);
3428     break;
3429 
3430   case NODE_BAG:
3431     if (IS_NOT_NULL(NODE_BODY(node)))
3432       set_empty_status_check_trav(NODE_BODY(node), env);
3433     {
3434       BagNode* en = BAG_(node);
3435 
3436       if (en->type == BAG_IF_ELSE) {
3437         if (IS_NOT_NULL(en->te.Then)) {
3438           set_empty_status_check_trav(en->te.Then, env);
3439         }
3440         if (IS_NOT_NULL(en->te.Else)) {
3441           set_empty_status_check_trav(en->te.Else, env);
3442         }
3443       }
3444     }
3445     break;
3446 
3447   case NODE_BACKREF:
3448     {
3449       int i;
3450       int* backs;
3451       MemEnv* mem_env = SCANENV_MEMENV(env);
3452       BackRefNode* br = BACKREF_(node);
3453       backs = BACKREFS_P(br);
3454       for (i = 0; i < br->back_num; i++) {
3455         Node* ernode = mem_env[backs[i]].empty_repeat_node;
3456         if (IS_NOT_NULL(ernode)) {
3457           if (! is_ancestor_node(ernode, node)) {
3458             MEM_STATUS_LIMIT_ON(env->reg->empty_status_mem, backs[i]);
3459             NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK);
3460             NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK);
3461           }
3462         }
3463       }
3464     }
3465     break;
3466 
3467   default:
3468     break;
3469   }
3470 }
3471 
3472 static void
set_parent_node_trav(Node * node,Node * parent)3473 set_parent_node_trav(Node* node, Node* parent)
3474 {
3475   NODE_PARENT(node) = parent;
3476 
3477   switch (NODE_TYPE(node)) {
3478   case NODE_LIST:
3479   case NODE_ALT:
3480     do {
3481       set_parent_node_trav(NODE_CAR(node), node);
3482     } while (IS_NOT_NULL(node = NODE_CDR(node)));
3483     break;
3484 
3485   case NODE_ANCHOR:
3486     if (! ANCHOR_HAS_BODY(ANCHOR_(node))) break;
3487     set_parent_node_trav(NODE_BODY(node), node);
3488     break;
3489 
3490   case NODE_QUANT:
3491     set_parent_node_trav(NODE_BODY(node), node);
3492     break;
3493 
3494   case NODE_BAG:
3495     if (IS_NOT_NULL(NODE_BODY(node)))
3496       set_parent_node_trav(NODE_BODY(node), node);
3497     {
3498       BagNode* en = BAG_(node);
3499 
3500       if (en->type == BAG_IF_ELSE) {
3501         if (IS_NOT_NULL(en->te.Then))
3502           set_parent_node_trav(en->te.Then, node);
3503         if (IS_NOT_NULL(en->te.Else)) {
3504           set_parent_node_trav(en->te.Else, node);
3505         }
3506       }
3507     }
3508     break;
3509 
3510   default:
3511     break;
3512   }
3513 }
3514 
3515 
3516 #ifdef USE_CALL
3517 
3518 #define RECURSION_EXIST        (1<<0)
3519 #define RECURSION_MUST         (1<<1)
3520 #define RECURSION_INFINITE     (1<<2)
3521 
3522 static int
infinite_recursive_call_check(Node * node,ScanEnv * env,int head)3523 infinite_recursive_call_check(Node* node, ScanEnv* env, int head)
3524 {
3525   int ret;
3526   int r = 0;
3527 
3528   switch (NODE_TYPE(node)) {
3529   case NODE_LIST:
3530     {
3531       Node *x;
3532       OnigLen min;
3533 
3534       x = node;
3535       do {
3536         ret = infinite_recursive_call_check(NODE_CAR(x), env, head);
3537         if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
3538         r |= ret;
3539         if (head != 0) {
3540           min = tree_min_len(NODE_CAR(x), env);
3541           if (min != 0) head = 0;
3542         }
3543       } while (IS_NOT_NULL(x = NODE_CDR(x)));
3544     }
3545     break;
3546 
3547   case NODE_ALT:
3548     {
3549       int must;
3550 
3551       must = RECURSION_MUST;
3552       do {
3553         ret = infinite_recursive_call_check(NODE_CAR(node), env, head);
3554         if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
3555 
3556         r    |= (ret & RECURSION_EXIST);
3557         must &= ret;
3558       } while (IS_NOT_NULL(node = NODE_CDR(node)));
3559       r |= must;
3560     }
3561     break;
3562 
3563   case NODE_QUANT:
3564     r = infinite_recursive_call_check(NODE_BODY(node), env, head);
3565     if (r < 0) return r;
3566     if ((r & RECURSION_MUST) != 0) {
3567       if (QUANT_(node)->lower == 0)
3568         r &= ~RECURSION_MUST;
3569     }
3570     break;
3571 
3572   case NODE_ANCHOR:
3573     if (! ANCHOR_HAS_BODY(ANCHOR_(node)))
3574       break;
3575     /* fall */
3576   case NODE_CALL:
3577     r = infinite_recursive_call_check(NODE_BODY(node), env, head);
3578     break;
3579 
3580   case NODE_BAG:
3581     {
3582       BagNode* en = BAG_(node);
3583 
3584       if (en->type == BAG_MEMORY) {
3585         if (NODE_IS_MARK2(node))
3586           return 0;
3587         else if (NODE_IS_MARK1(node))
3588           return (head == 0 ? RECURSION_EXIST | RECURSION_MUST
3589                   : RECURSION_EXIST | RECURSION_MUST | RECURSION_INFINITE);
3590         else {
3591           NODE_STATUS_ADD(node, MARK2);
3592           r = infinite_recursive_call_check(NODE_BODY(node), env, head);
3593           NODE_STATUS_REMOVE(node, MARK2);
3594         }
3595       }
3596       else if (en->type == BAG_IF_ELSE) {
3597         int eret;
3598 
3599         ret = infinite_recursive_call_check(NODE_BODY(node), env, head);
3600         if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
3601         r |= ret;
3602         if (IS_NOT_NULL(en->te.Then)) {
3603           OnigLen min;
3604           if (head != 0) {
3605             min = tree_min_len(NODE_BODY(node), env);
3606           }
3607           else min = 0;
3608 
3609           ret = infinite_recursive_call_check(en->te.Then, env, min != 0 ? 0:head);
3610           if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
3611           r |= ret;
3612         }
3613         if (IS_NOT_NULL(en->te.Else)) {
3614           eret = infinite_recursive_call_check(en->te.Else, env, head);
3615           if (eret < 0 || (eret & RECURSION_INFINITE) != 0) return eret;
3616           r |= (eret & RECURSION_EXIST);
3617           if ((eret & RECURSION_MUST) == 0)
3618             r &= ~RECURSION_MUST;
3619         }
3620         else {
3621           r &= ~RECURSION_MUST;
3622         }
3623       }
3624       else {
3625         r = infinite_recursive_call_check(NODE_BODY(node), env, head);
3626       }
3627     }
3628     break;
3629 
3630   default:
3631     break;
3632   }
3633 
3634   return r;
3635 }
3636 
3637 static int
infinite_recursive_call_check_trav(Node * node,ScanEnv * env)3638 infinite_recursive_call_check_trav(Node* node, ScanEnv* env)
3639 {
3640   int r;
3641 
3642   switch (NODE_TYPE(node)) {
3643   case NODE_LIST:
3644   case NODE_ALT:
3645     do {
3646       r = infinite_recursive_call_check_trav(NODE_CAR(node), env);
3647     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3648     break;
3649 
3650   case NODE_ANCHOR:
3651     if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3652       r = 0;
3653       break;
3654     }
3655     /* fall */
3656   case NODE_QUANT:
3657     r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
3658     break;
3659 
3660   case NODE_BAG:
3661     {
3662       BagNode* en = BAG_(node);
3663 
3664       if (en->type == BAG_MEMORY) {
3665         if (NODE_IS_RECURSION(node) && NODE_IS_CALLED(node)) {
3666           int ret;
3667 
3668           NODE_STATUS_ADD(node, MARK1);
3669 
3670           ret = infinite_recursive_call_check(NODE_BODY(node), env, 1);
3671           if (ret < 0) return ret;
3672           else if ((ret & (RECURSION_MUST | RECURSION_INFINITE)) != 0)
3673             return ONIGERR_NEVER_ENDING_RECURSION;
3674 
3675           NODE_STATUS_REMOVE(node, MARK1);
3676         }
3677       }
3678       else if (en->type == BAG_IF_ELSE) {
3679         if (IS_NOT_NULL(en->te.Then)) {
3680           r = infinite_recursive_call_check_trav(en->te.Then, env);
3681           if (r != 0) return r;
3682         }
3683         if (IS_NOT_NULL(en->te.Else)) {
3684           r = infinite_recursive_call_check_trav(en->te.Else, env);
3685           if (r != 0) return r;
3686         }
3687       }
3688     }
3689 
3690     r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
3691     break;
3692 
3693   default:
3694     r = 0;
3695     break;
3696   }
3697 
3698   return r;
3699 }
3700 
3701 static int
recursive_call_check(Node * node)3702 recursive_call_check(Node* node)
3703 {
3704   int r;
3705 
3706   switch (NODE_TYPE(node)) {
3707   case NODE_LIST:
3708   case NODE_ALT:
3709     r = 0;
3710     do {
3711       r |= recursive_call_check(NODE_CAR(node));
3712     } while (IS_NOT_NULL(node = NODE_CDR(node)));
3713     break;
3714 
3715   case NODE_ANCHOR:
3716     if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3717       r = 0;
3718       break;
3719     }
3720     /* fall */
3721   case NODE_QUANT:
3722     r = recursive_call_check(NODE_BODY(node));
3723     break;
3724 
3725   case NODE_CALL:
3726     r = recursive_call_check(NODE_BODY(node));
3727     if (r != 0) {
3728       if (NODE_IS_MARK1(NODE_BODY(node)))
3729         NODE_STATUS_ADD(node, RECURSION);
3730     }
3731     break;
3732 
3733   case NODE_BAG:
3734     {
3735       BagNode* en = BAG_(node);
3736 
3737       if (en->type == BAG_MEMORY) {
3738         if (NODE_IS_MARK2(node))
3739           return 0;
3740         else if (NODE_IS_MARK1(node))
3741           return 1; /* recursion */
3742         else {
3743           NODE_STATUS_ADD(node, MARK2);
3744           r = recursive_call_check(NODE_BODY(node));
3745           NODE_STATUS_REMOVE(node, MARK2);
3746         }
3747       }
3748       else if (en->type == BAG_IF_ELSE) {
3749         r = 0;
3750         if (IS_NOT_NULL(en->te.Then)) {
3751           r |= recursive_call_check(en->te.Then);
3752         }
3753         if (IS_NOT_NULL(en->te.Else)) {
3754           r |= recursive_call_check(en->te.Else);
3755         }
3756         r |= recursive_call_check(NODE_BODY(node));
3757       }
3758       else {
3759         r = recursive_call_check(NODE_BODY(node));
3760       }
3761     }
3762     break;
3763 
3764   default:
3765     r = 0;
3766     break;
3767   }
3768 
3769   return r;
3770 }
3771 
3772 #define IN_RECURSION         (1<<0)
3773 #define FOUND_CALLED_NODE    1
3774 
3775 static int
recursive_call_check_trav(Node * node,ScanEnv * env,int state)3776 recursive_call_check_trav(Node* node, ScanEnv* env, int state)
3777 {
3778   int r = 0;
3779 
3780   switch (NODE_TYPE(node)) {
3781   case NODE_LIST:
3782   case NODE_ALT:
3783     {
3784       int ret;
3785       do {
3786         ret = recursive_call_check_trav(NODE_CAR(node), env, state);
3787         if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
3788         else if (ret < 0) return ret;
3789       } while (IS_NOT_NULL(node = NODE_CDR(node)));
3790     }
3791     break;
3792 
3793   case NODE_QUANT:
3794     r = recursive_call_check_trav(NODE_BODY(node), env, state);
3795     if (QUANT_(node)->upper == 0) {
3796       if (r == FOUND_CALLED_NODE)
3797         QUANT_(node)->include_referred = 1;
3798     }
3799     break;
3800 
3801   case NODE_ANCHOR:
3802     {
3803       AnchorNode* an = ANCHOR_(node);
3804       if (ANCHOR_HAS_BODY(an))
3805         r = recursive_call_check_trav(NODE_ANCHOR_BODY(an), env, state);
3806     }
3807     break;
3808 
3809   case NODE_BAG:
3810     {
3811       int ret;
3812       int state1;
3813       BagNode* en = BAG_(node);
3814 
3815       if (en->type == BAG_MEMORY) {
3816         if (NODE_IS_CALLED(node) || (state & IN_RECURSION) != 0) {
3817           if (! NODE_IS_RECURSION(node)) {
3818             NODE_STATUS_ADD(node, MARK1);
3819             r = recursive_call_check(NODE_BODY(node));
3820             if (r != 0) {
3821               NODE_STATUS_ADD(node, RECURSION);
3822               MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
3823             }
3824             NODE_STATUS_REMOVE(node, MARK1);
3825           }
3826 
3827           if (NODE_IS_CALLED(node))
3828             r = FOUND_CALLED_NODE;
3829         }
3830       }
3831 
3832       state1 = state;
3833       if (NODE_IS_RECURSION(node))
3834         state1 |= IN_RECURSION;
3835 
3836       ret = recursive_call_check_trav(NODE_BODY(node), env, state1);
3837       if (ret == FOUND_CALLED_NODE)
3838         r = FOUND_CALLED_NODE;
3839 
3840       if (en->type == BAG_IF_ELSE) {
3841         if (IS_NOT_NULL(en->te.Then)) {
3842           ret = recursive_call_check_trav(en->te.Then, env, state1);
3843           if (ret == FOUND_CALLED_NODE)
3844             r = FOUND_CALLED_NODE;
3845         }
3846         if (IS_NOT_NULL(en->te.Else)) {
3847           ret = recursive_call_check_trav(en->te.Else, env, state1);
3848           if (ret == FOUND_CALLED_NODE)
3849             r = FOUND_CALLED_NODE;
3850         }
3851       }
3852     }
3853     break;
3854 
3855   default:
3856     break;
3857   }
3858 
3859   return r;
3860 }
3861 
3862 #endif
3863 
3864 static void
remove_from_list(Node * prev,Node * a)3865 remove_from_list(Node* prev, Node* a)
3866 {
3867   if (NODE_CDR(prev) != a) return ;
3868 
3869   NODE_CDR(prev) = NODE_CDR(a);
3870   NODE_CDR(a) = NULL_NODE;
3871 }
3872 
3873 static int
reduce_string_list(Node * node)3874 reduce_string_list(Node* node)
3875 {
3876   int r = 0;
3877 
3878   switch (NODE_TYPE(node)) {
3879   case NODE_LIST:
3880     {
3881       Node* prev;
3882       Node* curr;
3883       Node* prev_node;
3884       Node* next_node;
3885 
3886       prev = NULL_NODE;
3887       do {
3888         next_node = NODE_CDR(node);
3889         curr = NODE_CAR(node);
3890         if (NODE_TYPE(curr) == NODE_STRING) {
3891           if (IS_NULL(prev) || STR_(curr)->flag != STR_(prev)->flag) {
3892             prev = curr;
3893             prev_node = node;
3894           }
3895           else {
3896             r = node_str_node_cat(prev, curr);
3897             if (r != 0) return r;
3898             remove_from_list(prev_node, node);
3899             onig_node_free(node);
3900           }
3901         }
3902         else {
3903           prev = NULL_NODE;
3904           prev_node = node;
3905         }
3906 
3907         node = next_node;
3908       } while (r == 0 && IS_NOT_NULL(node));
3909     }
3910     break;
3911 
3912   case NODE_ALT:
3913     do {
3914       r = reduce_string_list(NODE_CAR(node));
3915     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3916     break;
3917 
3918   case NODE_ANCHOR:
3919     if (IS_NULL(NODE_BODY(node)))
3920       break;
3921     /* fall */
3922   case NODE_QUANT:
3923     r = reduce_string_list(NODE_BODY(node));
3924     break;
3925 
3926   case NODE_BAG:
3927     {
3928       BagNode* en = BAG_(node);
3929 
3930       r = reduce_string_list(NODE_BODY(node));
3931       if (r != 0) return r;
3932 
3933       if (en->type == BAG_IF_ELSE) {
3934         if (IS_NOT_NULL(en->te.Then)) {
3935           r = reduce_string_list(en->te.Then);
3936           if (r != 0) return r;
3937         }
3938         if (IS_NOT_NULL(en->te.Else)) {
3939           r = reduce_string_list(en->te.Else);
3940           if (r != 0) return r;
3941         }
3942       }
3943     }
3944     break;
3945 
3946   default:
3947     break;
3948   }
3949 
3950   return r;
3951 }
3952 
3953 
3954 #define IN_ALT          (1<<0)
3955 #define IN_NOT          (1<<1)
3956 #define IN_REAL_REPEAT  (1<<2)
3957 #define IN_VAR_REPEAT   (1<<3)
3958 #define IN_ZERO_REPEAT  (1<<4)
3959 #define IN_MULTI_ENTRY  (1<<5)
3960 #define IN_LOOK_BEHIND  (1<<6)
3961 
3962 
3963 /* divide different length alternatives in look-behind.
3964   (?<=A|B) ==> (?<=A)|(?<=B)
3965   (?<!A|B) ==> (?<!A)(?<!B)
3966 */
3967 static int
divide_look_behind_alternatives(Node * node)3968 divide_look_behind_alternatives(Node* node)
3969 {
3970   Node *head, *np, *insert_node;
3971   AnchorNode* an = ANCHOR_(node);
3972   int anc_type = an->type;
3973 
3974   head = NODE_ANCHOR_BODY(an);
3975   np = NODE_CAR(head);
3976   node_swap(node, head);
3977   NODE_CAR(node) = head;
3978   NODE_BODY(head) = np;
3979 
3980   np = node;
3981   while (IS_NOT_NULL(np = NODE_CDR(np))) {
3982     insert_node = onig_node_new_anchor(anc_type, an->ascii_mode);
3983     CHECK_NULL_RETURN_MEMERR(insert_node);
3984     NODE_BODY(insert_node) = NODE_CAR(np);
3985     NODE_CAR(np) = insert_node;
3986   }
3987 
3988   if (anc_type == ANCR_LOOK_BEHIND_NOT) {
3989     np = node;
3990     do {
3991       NODE_SET_TYPE(np, NODE_LIST);  /* alt -> list */
3992     } while (IS_NOT_NULL(np = NODE_CDR(np)));
3993   }
3994   return 0;
3995 }
3996 
3997 static int
tune_look_behind(Node * node,regex_t * reg,ScanEnv * env)3998 tune_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3999 {
4000   int r, len;
4001   AnchorNode* an = ANCHOR_(node);
4002 
4003   r = get_char_len_node(NODE_ANCHOR_BODY(an), reg, &len);
4004   if (r == 0)
4005     an->char_len = len;
4006   else if (r == GET_CHAR_LEN_VARLEN)
4007     r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4008   else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
4009     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
4010       r = divide_look_behind_alternatives(node);
4011     else
4012       r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4013   }
4014 
4015   return r;
4016 }
4017 
4018 static int
tune_next(Node * node,Node * next_node,regex_t * reg)4019 tune_next(Node* node, Node* next_node, regex_t* reg)
4020 {
4021   NodeType type;
4022 
4023  retry:
4024   type = NODE_TYPE(node);
4025   if (type == NODE_QUANT) {
4026     QuantNode* qn = QUANT_(node);
4027     if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) {
4028 #ifdef USE_QUANT_PEEK_NEXT
4029       Node* n = get_head_value_node(next_node, 1, reg);
4030       /* '\0': for UTF-16BE etc... */
4031       if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') {
4032         qn->next_head_exact = n;
4033       }
4034 #endif
4035       /* automatic posseivation a*b ==> (?>a*)b */
4036       if (qn->lower <= 1) {
4037         if (is_strict_real_node(NODE_BODY(node))) {
4038           Node *x, *y;
4039           x = get_head_value_node(NODE_BODY(node), 0, reg);
4040           if (IS_NOT_NULL(x)) {
4041             y = get_head_value_node(next_node,  0, reg);
4042             if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) {
4043               Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK);
4044               CHECK_NULL_RETURN_MEMERR(en);
4045               NODE_STATUS_ADD(en, STRICT_REAL_REPEAT);
4046               node_swap(node, en);
4047               NODE_BODY(node) = en;
4048             }
4049           }
4050         }
4051       }
4052     }
4053   }
4054   else if (type == NODE_BAG) {
4055     BagNode* en = BAG_(node);
4056     if (en->type == BAG_MEMORY) {
4057       node = NODE_BODY(node);
4058       goto retry;
4059     }
4060   }
4061   return 0;
4062 }
4063 
4064 
4065 static int
is_all_code_len_1_items(int n,OnigCaseFoldCodeItem items[])4066 is_all_code_len_1_items(int n, OnigCaseFoldCodeItem items[])
4067 {
4068   int i;
4069 
4070   for (i = 0; i < n; i++) {
4071     OnigCaseFoldCodeItem* item = items + i;
4072     if (item->code_len != 1) return 0;
4073   }
4074 
4075   return 1;
4076 }
4077 
4078 static int
get_min_max_byte_len_case_fold_items(int n,OnigCaseFoldCodeItem items[],int * rmin,int * rmax)4079 get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[], int* rmin, int* rmax)
4080 {
4081   int i, len, minlen, maxlen;
4082 
4083   minlen = INT_MAX;
4084   maxlen = 0;
4085   for (i = 0; i < n; i++) {
4086     OnigCaseFoldCodeItem* item = items + i;
4087 
4088     len = item->byte_len;
4089     if (len < minlen) minlen = len;
4090     if (len > maxlen) maxlen = len;
4091   }
4092 
4093   *rmin = minlen;
4094   *rmax = maxlen;
4095   return 0;
4096 }
4097 
4098 static int
conv_string_case_fold(OnigEncoding enc,OnigCaseFoldType case_fold_flag,UChar * s,UChar * end,UChar ** rs,UChar ** rend,int * rcase_min_len)4099 conv_string_case_fold(OnigEncoding enc, OnigCaseFoldType case_fold_flag,
4100            UChar* s, UChar* end, UChar** rs, UChar** rend, int* rcase_min_len)
4101 {
4102   UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
4103   UChar *sbuf, *ebuf, *sp;
4104   int i, n, len, sbuf_size;
4105 
4106   *rs = NULL;
4107   sbuf_size = (int )(end - s) * 2;
4108   sbuf = (UChar* )xmalloc(sbuf_size);
4109   CHECK_NULL_RETURN_MEMERR(sbuf);
4110   ebuf = sbuf + sbuf_size;
4111 
4112   n = 0;
4113   sp = sbuf;
4114   p = s;
4115   while (p < end) {
4116     len = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, buf);
4117     for (i = 0; i < len; i++) {
4118       if (sp >= ebuf) {
4119         sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2);
4120         CHECK_NULL_RETURN_MEMERR(sbuf);
4121         sp = sbuf + sbuf_size;
4122         sbuf_size *= 2;
4123         ebuf = sbuf + sbuf_size;
4124       }
4125 
4126       *sp++ = buf[i];
4127     }
4128     n++;
4129   }
4130 
4131   *rs = sbuf;
4132   *rend = sp;
4133   *rcase_min_len = n;
4134   return 0;
4135 }
4136 
4137 static int
make_code_list_to_string(Node ** rnode,OnigEncoding enc,int n,OnigCodePoint codes[])4138 make_code_list_to_string(Node** rnode, OnigEncoding enc,
4139                          int n, OnigCodePoint codes[])
4140 {
4141   int r, i, len;
4142   Node* node;
4143   UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4144 
4145   *rnode = NULL_NODE;
4146   node = onig_node_new_str(NULL, NULL);
4147   CHECK_NULL_RETURN_MEMERR(node);
4148 
4149   for (i = 0; i < n; i++) {
4150     len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf);
4151     if (len < 0) {
4152       r = len;
4153       goto err;
4154     }
4155 
4156     r = onig_node_str_cat(node, buf, buf + len);
4157     if (r != 0) goto err;
4158   }
4159 
4160   *rnode = node;
4161   return 0;
4162 
4163  err:
4164   onig_node_free(node);
4165   return r;
4166 }
4167 
4168 static int
unravel_cf_node_add(Node ** rlist,Node * add)4169 unravel_cf_node_add(Node** rlist, Node* add)
4170 {
4171   Node *list;
4172 
4173   list = *rlist;
4174   if (IS_NULL(list)) {
4175     list = onig_node_new_list(add, NULL);
4176     CHECK_NULL_RETURN_MEMERR(list);
4177     *rlist = list;
4178   }
4179   else {
4180     Node* r = node_list_add(list, add);
4181     CHECK_NULL_RETURN_MEMERR(r);
4182   }
4183 
4184   return 0;
4185 }
4186 
4187 static int
unravel_cf_string_add(Node ** rlist,Node ** rsn,UChar * s,UChar * end,unsigned int flag,int case_min_len)4188 unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end,
4189                       unsigned int flag, int case_min_len)
4190 {
4191   int r;
4192   Node *sn, *list;
4193 
4194   list = *rlist;
4195   sn   = *rsn;
4196 
4197   if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) {
4198     if (NODE_STRING_IS_CASE_FOLD_MATCH(sn))
4199       r = node_str_cat_case_fold(sn, s, end, case_min_len);
4200     else
4201       r = onig_node_str_cat(sn, s, end);
4202   }
4203   else {
4204     sn = onig_node_new_str(s, end);
4205     CHECK_NULL_RETURN_MEMERR(sn);
4206 
4207     STR_(sn)->flag = flag;
4208     STR_(sn)->case_min_len = case_min_len;
4209     r = unravel_cf_node_add(&list, sn);
4210   }
4211 
4212   if (r == 0) {
4213     *rlist = list;
4214     *rsn = sn;
4215   }
4216   return r;
4217 }
4218 
4219 static int
unravel_cf_string_fold_add(Node ** rlist,Node ** rsn,OnigEncoding enc,OnigCaseFoldType case_fold_flag,UChar * s,UChar * end)4220 unravel_cf_string_fold_add(Node** rlist, Node** rsn, OnigEncoding enc,
4221                       OnigCaseFoldType case_fold_flag, UChar* s, UChar* end)
4222 {
4223   int r;
4224   int case_min_len;
4225   UChar *rs, *rend;
4226 
4227   r = conv_string_case_fold(enc, case_fold_flag, s, end,
4228                             &rs, &rend, &case_min_len);
4229   if (r != 0) return r;
4230 
4231   r = unravel_cf_string_add(rlist, rsn, rs, rend,
4232                             NODE_STRING_CASE_FOLD_MATCH, case_min_len);
4233   xfree(rs);
4234 
4235   return r;
4236 }
4237 
4238 static int
unravel_cf_string_alt_or_cc_add(Node ** rlist,int n,OnigCaseFoldCodeItem items[],int byte_len,OnigEncoding enc,OnigCaseFoldType case_fold_flag,UChar * s,UChar * end)4239 unravel_cf_string_alt_or_cc_add(Node** rlist, int n,
4240             OnigCaseFoldCodeItem items[], int byte_len, OnigEncoding enc,
4241             OnigCaseFoldType case_fold_flag, UChar* s, UChar* end)
4242 {
4243   int r, i;
4244   Node* node;
4245 
4246   if (is_all_code_len_1_items(n, items)) {
4247     OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */
4248 
4249     codes[0] = ONIGENC_MBC_TO_CODE(enc, s, end);
4250     for (i = 0; i < n; i++) {
4251       OnigCaseFoldCodeItem* item = items + i;
4252       codes[i+1] = item->code[0];
4253     }
4254     r = onig_new_cclass_with_code_list(&node, enc, n + 1, codes);
4255     if (r != 0) return r;
4256   }
4257   else {
4258     Node *snode, *alt, *curr;
4259 
4260     snode = onig_node_new_str(s, end);
4261     CHECK_NULL_RETURN_MEMERR(snode);
4262     node = curr = onig_node_new_alt(snode, NULL_NODE);
4263     if (IS_NULL(curr)) {
4264       onig_node_free(snode);
4265       return ONIGERR_MEMORY;
4266     }
4267 
4268     r = 0;
4269     for (i = 0; i < n; i++) {
4270       OnigCaseFoldCodeItem* item = items + i;
4271       r = make_code_list_to_string(&snode, enc, item->code_len, item->code);
4272       if (r != 0) {
4273         onig_node_free(node);
4274         return r;
4275       }
4276 
4277       alt = onig_node_new_alt(snode, NULL_NODE);
4278       if (IS_NULL(alt)) {
4279         onig_node_free(snode);
4280         onig_node_free(node);
4281         return ONIGERR_MEMORY;
4282       }
4283 
4284       NODE_CDR(curr) = alt;
4285       curr = alt;
4286     }
4287   }
4288 
4289   r = unravel_cf_node_add(rlist, node);
4290   if (r != 0) onig_node_free(node);
4291   return r;
4292 }
4293 
4294 static int
unravel_cf_look_behind_add(Node ** rlist,Node ** rsn,int n,OnigCaseFoldCodeItem items[],OnigEncoding enc,UChar * s,int one_len)4295 unravel_cf_look_behind_add(Node** rlist, Node** rsn,
4296                 int n, OnigCaseFoldCodeItem items[], OnigEncoding enc,
4297                 UChar* s, int one_len)
4298 {
4299   int r, i, found;
4300 
4301   found = 0;
4302   for (i = 0; i < n; i++) {
4303     OnigCaseFoldCodeItem* item = items + i;
4304     if (item->byte_len == one_len) {
4305       if (item->code_len == 1) {
4306         found = 1;
4307       }
4308     }
4309   }
4310 
4311   if (found == 0) {
4312     r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */, 0);
4313   }
4314   else {
4315     Node* node;
4316     OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */
4317 
4318     found = 0;
4319     codes[found++] = ONIGENC_MBC_TO_CODE(enc, s, s + one_len);
4320     for (i = 0; i < n; i++) {
4321       OnigCaseFoldCodeItem* item = items + i;
4322       if (item->byte_len == one_len) {
4323         if (item->code_len == 1) {
4324           codes[found++] = item->code[0];
4325         }
4326       }
4327     }
4328     r = onig_new_cclass_with_code_list(&node, enc, found, codes);
4329     if (r != 0) return r;
4330 
4331     r = unravel_cf_node_add(rlist, node);
4332     if (r != 0) onig_node_free(node);
4333 
4334     *rsn = NULL_NODE;
4335   }
4336 
4337   return r;
4338 }
4339 
4340 static int
unravel_case_fold_string(Node * node,regex_t * reg,int state)4341 unravel_case_fold_string(Node* node, regex_t* reg, int state)
4342 {
4343   int r, n, one_len, min_len, max_len, in_look_behind;
4344   UChar *start, *end, *p, *q;
4345   StrNode* snode;
4346   Node *sn, *list;
4347   OnigEncoding enc;
4348   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4349 
4350   if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0;
4351 
4352   snode = STR_(node);
4353 
4354   start = snode->s;
4355   end   = snode->end;
4356   if (start >= end) return 0;
4357 
4358   in_look_behind = (state & IN_LOOK_BEHIND) != 0;
4359   enc = reg->enc;
4360 
4361   list = sn = NULL_NODE;
4362   p = start;
4363   while (p < end) {
4364     n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end,
4365                                            items);
4366     if (n < 0) {
4367       r = n;
4368       goto err;
4369     }
4370 
4371     one_len = enclen(enc, p);
4372     if (n == 0) {
4373       q = p + one_len;
4374       r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */, 0);
4375       if (r != 0) goto err;
4376     }
4377     else {
4378       if (in_look_behind != 0) {
4379         q = p + one_len;
4380         r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len);
4381         if (r != 0) goto err;
4382       }
4383       else {
4384         get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len);
4385         q = p + max_len;
4386         if (one_len == max_len && min_len == max_len) {
4387           r = unravel_cf_string_alt_or_cc_add(&list, n, items, max_len, enc,
4388                                               reg->case_fold_flag, p, q);
4389           if (r != 0) goto err;
4390           sn = NULL_NODE;
4391         }
4392         else {
4393           r = unravel_cf_string_fold_add(&list, &sn, enc, reg->case_fold_flag,
4394                                          p, q);
4395           if (r != 0) goto err;
4396         }
4397       }
4398     }
4399 
4400     p = q;
4401   }
4402 
4403   if (IS_NOT_NULL(list)) {
4404     if (node_list_len(list) == 1) {
4405       node_swap(node, NODE_CAR(list));
4406     }
4407     else {
4408       node_swap(node, list);
4409     }
4410     onig_node_free(list);
4411   }
4412   else {
4413     node_swap(node, sn);
4414     onig_node_free(sn);
4415   }
4416   return 0;
4417 
4418  err:
4419   if (IS_NOT_NULL(list))
4420     onig_node_free(list);
4421   else if (IS_NOT_NULL(sn))
4422     onig_node_free(sn);
4423 
4424   return r;
4425 }
4426 
4427 #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
4428 static enum BodyEmptyType
quantifiers_memory_node_info(Node * node)4429 quantifiers_memory_node_info(Node* node)
4430 {
4431   int r = BODY_IS_EMPTY_POSSIBILITY;
4432 
4433   switch (NODE_TYPE(node)) {
4434   case NODE_LIST:
4435   case NODE_ALT:
4436     {
4437       int v;
4438       do {
4439         v = quantifiers_memory_node_info(NODE_CAR(node));
4440         if (v > r) r = v;
4441       } while (IS_NOT_NULL(node = NODE_CDR(node)));
4442     }
4443     break;
4444 
4445 #ifdef USE_CALL
4446   case NODE_CALL:
4447     if (NODE_IS_RECURSION(node)) {
4448       return BODY_IS_EMPTY_POSSIBILITY_REC; /* tiny version */
4449     }
4450     else
4451       r = quantifiers_memory_node_info(NODE_BODY(node));
4452     break;
4453 #endif
4454 
4455   case NODE_QUANT:
4456     {
4457       QuantNode* qn = QUANT_(node);
4458       if (qn->upper != 0) {
4459         r = quantifiers_memory_node_info(NODE_BODY(node));
4460       }
4461     }
4462     break;
4463 
4464   case NODE_BAG:
4465     {
4466       BagNode* en = BAG_(node);
4467       switch (en->type) {
4468       case BAG_MEMORY:
4469         if (NODE_IS_RECURSION(node)) {
4470           return BODY_IS_EMPTY_POSSIBILITY_REC;
4471         }
4472         return BODY_IS_EMPTY_POSSIBILITY_MEM;
4473         break;
4474 
4475       case BAG_OPTION:
4476       case BAG_STOP_BACKTRACK:
4477         r = quantifiers_memory_node_info(NODE_BODY(node));
4478         break;
4479       case BAG_IF_ELSE:
4480         {
4481           int v;
4482           r = quantifiers_memory_node_info(NODE_BODY(node));
4483           if (IS_NOT_NULL(en->te.Then)) {
4484             v = quantifiers_memory_node_info(en->te.Then);
4485             if (v > r) r = v;
4486           }
4487           if (IS_NOT_NULL(en->te.Else)) {
4488             v = quantifiers_memory_node_info(en->te.Else);
4489             if (v > r) r = v;
4490           }
4491         }
4492         break;
4493       }
4494     }
4495     break;
4496 
4497   case NODE_BACKREF:
4498   case NODE_STRING:
4499   case NODE_CTYPE:
4500   case NODE_CCLASS:
4501   case NODE_ANCHOR:
4502   case NODE_GIMMICK:
4503   default:
4504     break;
4505   }
4506 
4507   return r;
4508 }
4509 #endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */
4510 
4511 
4512 #ifdef USE_CALL
4513 
4514 #ifdef __GNUC__
4515 __inline
4516 #endif
4517 static int
tune_call_node_call(CallNode * cn,ScanEnv * env,int state)4518 tune_call_node_call(CallNode* cn, ScanEnv* env, int state)
4519 {
4520   MemEnv* mem_env = SCANENV_MEMENV(env);
4521 
4522   if (cn->by_number != 0) {
4523     int gnum = cn->group_num;
4524 
4525     if (env->num_named > 0 &&
4526         IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
4527         ! ONIG_IS_OPTION_ON(env->options, ONIG_OPTION_CAPTURE_GROUP)) {
4528       return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
4529     }
4530 
4531     if (gnum > env->num_mem) {
4532       onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_GROUP_REFERENCE,
4533                                      cn->name, cn->name_end);
4534       return ONIGERR_UNDEFINED_GROUP_REFERENCE;
4535     }
4536 
4537   set_call_attr:
4538     NODE_CALL_BODY(cn) = mem_env[cn->group_num].mem_node;
4539     if (IS_NULL(NODE_CALL_BODY(cn))) {
4540       onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
4541                                      cn->name, cn->name_end);
4542       return ONIGERR_UNDEFINED_NAME_REFERENCE;
4543     }
4544   }
4545   else {
4546     int *refs;
4547 
4548     int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs);
4549     if (n <= 0) {
4550       onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
4551                                      cn->name, cn->name_end);
4552       return ONIGERR_UNDEFINED_NAME_REFERENCE;
4553     }
4554     else if (n > 1) {
4555       onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL,
4556                                      cn->name, cn->name_end);
4557       return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
4558     }
4559     else {
4560       cn->group_num = refs[0];
4561       goto set_call_attr;
4562     }
4563   }
4564 
4565   return 0;
4566 }
4567 
4568 static void
tune_call2_call(Node * node)4569 tune_call2_call(Node* node)
4570 {
4571   switch (NODE_TYPE(node)) {
4572   case NODE_LIST:
4573   case NODE_ALT:
4574     do {
4575       tune_call2_call(NODE_CAR(node));
4576     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4577     break;
4578 
4579   case NODE_QUANT:
4580     tune_call2_call(NODE_BODY(node));
4581     break;
4582 
4583   case NODE_ANCHOR:
4584     if (ANCHOR_HAS_BODY(ANCHOR_(node)))
4585       tune_call2_call(NODE_BODY(node));
4586     break;
4587 
4588   case NODE_BAG:
4589     {
4590       BagNode* en = BAG_(node);
4591 
4592       if (en->type == BAG_MEMORY) {
4593         if (! NODE_IS_MARK1(node)) {
4594           NODE_STATUS_ADD(node, MARK1);
4595           tune_call2_call(NODE_BODY(node));
4596           NODE_STATUS_REMOVE(node, MARK1);
4597         }
4598       }
4599       else if (en->type == BAG_IF_ELSE) {
4600         tune_call2_call(NODE_BODY(node));
4601         if (IS_NOT_NULL(en->te.Then))
4602           tune_call2_call(en->te.Then);
4603         if (IS_NOT_NULL(en->te.Else))
4604           tune_call2_call(en->te.Else);
4605       }
4606       else {
4607         tune_call2_call(NODE_BODY(node));
4608       }
4609     }
4610     break;
4611 
4612   case NODE_CALL:
4613     if (! NODE_IS_MARK1(node)) {
4614       NODE_STATUS_ADD(node, MARK1);
4615       {
4616         CallNode* cn = CALL_(node);
4617         Node* called = NODE_CALL_BODY(cn);
4618 
4619         cn->entry_count++;
4620 
4621         NODE_STATUS_ADD(called, CALLED);
4622         BAG_(called)->m.entry_count++;
4623         tune_call2_call(called);
4624       }
4625       NODE_STATUS_REMOVE(node, MARK1);
4626     }
4627     break;
4628 
4629   default:
4630     break;
4631   }
4632 }
4633 
4634 static int
tune_call(Node * node,ScanEnv * env,int state)4635 tune_call(Node* node, ScanEnv* env, int state)
4636 {
4637   int r;
4638 
4639   switch (NODE_TYPE(node)) {
4640   case NODE_LIST:
4641   case NODE_ALT:
4642     do {
4643       r = tune_call(NODE_CAR(node), env, state);
4644     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4645     break;
4646 
4647   case NODE_QUANT:
4648     if (QUANT_(node)->upper == 0)
4649       state |= IN_ZERO_REPEAT;
4650 
4651     r = tune_call(NODE_BODY(node), env, state);
4652     break;
4653 
4654   case NODE_ANCHOR:
4655     if (ANCHOR_HAS_BODY(ANCHOR_(node)))
4656       r = tune_call(NODE_BODY(node), env, state);
4657     else
4658       r = 0;
4659     break;
4660 
4661   case NODE_BAG:
4662     {
4663       BagNode* en = BAG_(node);
4664 
4665       if (en->type == BAG_MEMORY) {
4666         if ((state & IN_ZERO_REPEAT) != 0) {
4667           NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
4668           BAG_(node)->m.entry_count--;
4669         }
4670         r = tune_call(NODE_BODY(node), env, state);
4671       }
4672       else if (en->type == BAG_IF_ELSE) {
4673         r = tune_call(NODE_BODY(node), env, state);
4674         if (r != 0) return r;
4675         if (IS_NOT_NULL(en->te.Then)) {
4676           r = tune_call(en->te.Then, env, state);
4677           if (r != 0) return r;
4678         }
4679         if (IS_NOT_NULL(en->te.Else))
4680           r = tune_call(en->te.Else, env, state);
4681       }
4682       else
4683         r = tune_call(NODE_BODY(node), env, state);
4684     }
4685     break;
4686 
4687   case NODE_CALL:
4688     if ((state & IN_ZERO_REPEAT) != 0) {
4689       NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
4690       CALL_(node)->entry_count--;
4691     }
4692 
4693     r = tune_call_node_call(CALL_(node), env, state);
4694     break;
4695 
4696   default:
4697     r = 0;
4698     break;
4699   }
4700 
4701   return r;
4702 }
4703 
4704 static int
tune_call2(Node * node)4705 tune_call2(Node* node)
4706 {
4707   int r = 0;
4708 
4709   switch (NODE_TYPE(node)) {
4710   case NODE_LIST:
4711   case NODE_ALT:
4712     do {
4713       r = tune_call2(NODE_CAR(node));
4714     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4715     break;
4716 
4717   case NODE_QUANT:
4718     if (QUANT_(node)->upper != 0)
4719       r = tune_call2(NODE_BODY(node));
4720     break;
4721 
4722   case NODE_ANCHOR:
4723     if (ANCHOR_HAS_BODY(ANCHOR_(node)))
4724       r = tune_call2(NODE_BODY(node));
4725     break;
4726 
4727   case NODE_BAG:
4728     if (! NODE_IS_IN_ZERO_REPEAT(node))
4729       r = tune_call2(NODE_BODY(node));
4730 
4731     {
4732       BagNode* en = BAG_(node);
4733 
4734       if (r != 0) return r;
4735       if (en->type == BAG_IF_ELSE) {
4736         if (IS_NOT_NULL(en->te.Then)) {
4737           r = tune_call2(en->te.Then);
4738           if (r != 0) return r;
4739         }
4740         if (IS_NOT_NULL(en->te.Else))
4741           r = tune_call2(en->te.Else);
4742       }
4743     }
4744     break;
4745 
4746   case NODE_CALL:
4747     if (! NODE_IS_IN_ZERO_REPEAT(node)) {
4748       tune_call2_call(node);
4749     }
4750     break;
4751 
4752   default:
4753     break;
4754   }
4755 
4756   return r;
4757 }
4758 
4759 
4760 static void
tune_called_state_call(Node * node,int state)4761 tune_called_state_call(Node* node, int state)
4762 {
4763   switch (NODE_TYPE(node)) {
4764   case NODE_ALT:
4765     state |= IN_ALT;
4766     /* fall */
4767   case NODE_LIST:
4768     do {
4769       tune_called_state_call(NODE_CAR(node), state);
4770     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4771     break;
4772 
4773   case NODE_QUANT:
4774     {
4775       QuantNode* qn = QUANT_(node);
4776 
4777       if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
4778         state |= IN_REAL_REPEAT;
4779       if (qn->lower != qn->upper)
4780         state |= IN_VAR_REPEAT;
4781 
4782       tune_called_state_call(NODE_QUANT_BODY(qn), state);
4783     }
4784     break;
4785 
4786   case NODE_ANCHOR:
4787     {
4788       AnchorNode* an = ANCHOR_(node);
4789 
4790       switch (an->type) {
4791       case ANCR_PREC_READ_NOT:
4792       case ANCR_LOOK_BEHIND_NOT:
4793         state |= IN_NOT;
4794         /* fall */
4795       case ANCR_PREC_READ:
4796       case ANCR_LOOK_BEHIND:
4797         tune_called_state_call(NODE_ANCHOR_BODY(an), state);
4798         break;
4799       default:
4800         break;
4801       }
4802     }
4803     break;
4804 
4805   case NODE_BAG:
4806     {
4807       BagNode* en = BAG_(node);
4808 
4809       if (en->type == BAG_MEMORY) {
4810         if (NODE_IS_MARK1(node)) {
4811           if ((~en->m.called_state & state) != 0) {
4812             en->m.called_state |= state;
4813             tune_called_state_call(NODE_BODY(node), state);
4814           }
4815         }
4816         else {
4817           NODE_STATUS_ADD(node, MARK1);
4818           en->m.called_state |= state;
4819           tune_called_state_call(NODE_BODY(node), state);
4820           NODE_STATUS_REMOVE(node, MARK1);
4821         }
4822       }
4823       else if (en->type == BAG_IF_ELSE) {
4824         state |= IN_ALT;
4825         tune_called_state_call(NODE_BODY(node), state);
4826         if (IS_NOT_NULL(en->te.Then)) {
4827           tune_called_state_call(en->te.Then, state);
4828         }
4829         if (IS_NOT_NULL(en->te.Else))
4830           tune_called_state_call(en->te.Else, state);
4831       }
4832       else {
4833         tune_called_state_call(NODE_BODY(node), state);
4834       }
4835     }
4836     break;
4837 
4838   case NODE_CALL:
4839     tune_called_state_call(NODE_BODY(node), state);
4840     break;
4841 
4842   default:
4843     break;
4844   }
4845 }
4846 
4847 static void
tune_called_state(Node * node,int state)4848 tune_called_state(Node* node, int state)
4849 {
4850   switch (NODE_TYPE(node)) {
4851   case NODE_ALT:
4852     state |= IN_ALT;
4853     /* fall */
4854   case NODE_LIST:
4855     do {
4856       tune_called_state(NODE_CAR(node), state);
4857     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4858     break;
4859 
4860 #ifdef USE_CALL
4861   case NODE_CALL:
4862     tune_called_state_call(node, state);
4863     break;
4864 #endif
4865 
4866   case NODE_BAG:
4867     {
4868       BagNode* en = BAG_(node);
4869 
4870       switch (en->type) {
4871       case BAG_MEMORY:
4872         if (en->m.entry_count > 1)
4873           state |= IN_MULTI_ENTRY;
4874 
4875         en->m.called_state |= state;
4876         /* fall */
4877       case BAG_OPTION:
4878       case BAG_STOP_BACKTRACK:
4879         tune_called_state(NODE_BODY(node), state);
4880         break;
4881       case BAG_IF_ELSE:
4882         state |= IN_ALT;
4883         tune_called_state(NODE_BODY(node), state);
4884         if (IS_NOT_NULL(en->te.Then))
4885           tune_called_state(en->te.Then, state);
4886         if (IS_NOT_NULL(en->te.Else))
4887           tune_called_state(en->te.Else, state);
4888         break;
4889       }
4890     }
4891     break;
4892 
4893   case NODE_QUANT:
4894     {
4895       QuantNode* qn = QUANT_(node);
4896 
4897       if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
4898         state |= IN_REAL_REPEAT;
4899       if (qn->lower != qn->upper)
4900         state |= IN_VAR_REPEAT;
4901 
4902       tune_called_state(NODE_QUANT_BODY(qn), state);
4903     }
4904     break;
4905 
4906   case NODE_ANCHOR:
4907     {
4908       AnchorNode* an = ANCHOR_(node);
4909 
4910       switch (an->type) {
4911       case ANCR_PREC_READ_NOT:
4912       case ANCR_LOOK_BEHIND_NOT:
4913         state |= IN_NOT;
4914         /* fall */
4915       case ANCR_PREC_READ:
4916       case ANCR_LOOK_BEHIND:
4917         tune_called_state(NODE_ANCHOR_BODY(an), state);
4918         break;
4919       default:
4920         break;
4921       }
4922     }
4923     break;
4924 
4925   case NODE_BACKREF:
4926   case NODE_STRING:
4927   case NODE_CTYPE:
4928   case NODE_CCLASS:
4929   case NODE_GIMMICK:
4930   default:
4931     break;
4932   }
4933 }
4934 
4935 #endif  /* USE_CALL */
4936 
4937 
4938 static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env);
4939 
4940 #ifdef __GNUC__
4941 __inline
4942 #endif
4943 static int
tune_anchor(Node * node,regex_t * reg,int state,ScanEnv * env)4944 tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env)
4945 {
4946 /* allowed node types in look-behind */
4947 #define ALLOWED_TYPE_IN_LB \
4948   ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \
4949   | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \
4950   | NODE_BIT_CALL | NODE_BIT_GIMMICK)
4951 
4952 #define ALLOWED_BAG_IN_LB       ( 1<<BAG_MEMORY | 1<<BAG_OPTION | 1<<BAG_IF_ELSE )
4953 #define ALLOWED_BAG_IN_LB_NOT   ( 1<<BAG_OPTION | 1<<BAG_IF_ELSE )
4954 
4955 #define ALLOWED_ANCHOR_IN_LB \
4956   ( ANCR_LOOK_BEHIND | ANCR_BEGIN_LINE | ANCR_END_LINE | ANCR_BEGIN_BUF \
4957   | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY | ANCR_NO_WORD_BOUNDARY \
4958   | ANCR_WORD_BEGIN | ANCR_WORD_END \
4959   | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
4960 
4961 #define ALLOWED_ANCHOR_IN_LB_NOT \
4962   ( ANCR_LOOK_BEHIND | ANCR_LOOK_BEHIND_NOT | ANCR_BEGIN_LINE \
4963   | ANCR_END_LINE | ANCR_BEGIN_BUF | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY \
4964   | ANCR_NO_WORD_BOUNDARY | ANCR_WORD_BEGIN | ANCR_WORD_END \
4965   | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
4966 
4967   int r;
4968   AnchorNode* an = ANCHOR_(node);
4969 
4970   switch (an->type) {
4971   case ANCR_PREC_READ:
4972     r = tune_tree(NODE_ANCHOR_BODY(an), reg, state, env);
4973     break;
4974   case ANCR_PREC_READ_NOT:
4975     r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env);
4976     break;
4977 
4978   case ANCR_LOOK_BEHIND:
4979     {
4980       r = check_type_tree(NODE_ANCHOR_BODY(an), ALLOWED_TYPE_IN_LB,
4981                           ALLOWED_BAG_IN_LB, ALLOWED_ANCHOR_IN_LB);
4982       if (r < 0) return r;
4983       if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4984       r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env);
4985       if (r != 0) return r;
4986       r = tune_look_behind(node, reg, env);
4987     }
4988     break;
4989 
4990   case ANCR_LOOK_BEHIND_NOT:
4991     {
4992       r = check_type_tree(NODE_ANCHOR_BODY(an), ALLOWED_TYPE_IN_LB,
4993                           ALLOWED_BAG_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
4994       if (r < 0) return r;
4995       if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4996       r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND),
4997                     env);
4998       if (r != 0) return r;
4999       r = tune_look_behind(node, reg, env);
5000     }
5001     break;
5002 
5003   default:
5004     r = 0;
5005     break;
5006   }
5007 
5008   return r;
5009 }
5010 
5011 #ifdef __GNUC__
5012 __inline
5013 #endif
5014 static int
tune_quant(Node * node,regex_t * reg,int state,ScanEnv * env)5015 tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env)
5016 {
5017   int r;
5018   OnigLen d;
5019   QuantNode* qn = QUANT_(node);
5020   Node* body = NODE_BODY(node);
5021 
5022   if ((state & IN_REAL_REPEAT) != 0) {
5023     NODE_STATUS_ADD(node, IN_REAL_REPEAT);
5024   }
5025   if ((state & IN_MULTI_ENTRY) != 0) {
5026     NODE_STATUS_ADD(node, IN_MULTI_ENTRY);
5027   }
5028 
5029   if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) {
5030     d = tree_min_len(body, env);
5031     if (d == 0) {
5032 #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
5033       qn->emptiness = quantifiers_memory_node_info(body);
5034 #else
5035       qn->emptiness = BODY_IS_EMPTY_POSSIBILITY;
5036 #endif
5037     }
5038   }
5039 
5040   if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5041     state |= IN_REAL_REPEAT;
5042   if (qn->lower != qn->upper)
5043     state |= IN_VAR_REPEAT;
5044 
5045   r = tune_tree(body, reg, state, env);
5046   if (r != 0) return r;
5047 
5048   /* expand string */
5049 #define EXPAND_STRING_MAX_LENGTH  100
5050   if (NODE_TYPE(body) == NODE_STRING) {
5051     if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper &&
5052         qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
5053       int len = NODE_STRING_LEN(body);
5054 
5055       if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
5056         int i, n = qn->lower;
5057         node_conv_to_str_node(node, STR_(body)->flag);
5058         for (i = 0; i < n; i++) {
5059           r = node_str_node_cat(node, body);
5060           if (r != 0) return r;
5061         }
5062         onig_node_free(body);
5063         return r;
5064       }
5065     }
5066   }
5067 
5068   if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) {
5069     if (NODE_TYPE(body) == NODE_QUANT) {
5070       QuantNode* tqn = QUANT_(body);
5071       if (IS_NOT_NULL(tqn->head_exact)) {
5072         qn->head_exact  = tqn->head_exact;
5073         tqn->head_exact = NULL;
5074       }
5075     }
5076     else {
5077       qn->head_exact = get_head_value_node(NODE_BODY(node), 1, reg);
5078     }
5079   }
5080 
5081   return r;
5082 }
5083 
5084 /* tune_tree does the following work.
5085  1. check empty loop. (set qn->emptiness)
5086  2. expand ignore-case in char class.
5087  3. set memory status bit flags. (reg->mem_stats)
5088  4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
5089  5. find invalid patterns in look-behind.
5090  6. expand repeated string.
5091  */
5092 static int
tune_tree(Node * node,regex_t * reg,int state,ScanEnv * env)5093 tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
5094 {
5095   int r = 0;
5096 
5097   switch (NODE_TYPE(node)) {
5098   case NODE_LIST:
5099     {
5100       Node* prev = NULL_NODE;
5101       do {
5102         r = tune_tree(NODE_CAR(node), reg, state, env);
5103         if (IS_NOT_NULL(prev) && r == 0) {
5104           r = tune_next(prev, NODE_CAR(node), reg);
5105         }
5106         prev = NODE_CAR(node);
5107       } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5108     }
5109     break;
5110 
5111   case NODE_ALT:
5112     do {
5113       r = tune_tree(NODE_CAR(node), reg, (state | IN_ALT), env);
5114     } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5115     break;
5116 
5117   case NODE_STRING:
5118     if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_CRUDE(node)) {
5119       r = unravel_case_fold_string(node, reg, state);
5120     }
5121     break;
5122 
5123   case NODE_BACKREF:
5124     {
5125       int i;
5126       int* p;
5127       BackRefNode* br = BACKREF_(node);
5128       p = BACKREFS_P(br);
5129       for (i = 0; i < br->back_num; i++) {
5130         if (p[i] > env->num_mem)  return ONIGERR_INVALID_BACKREF;
5131         MEM_STATUS_ON(env->backrefed_mem, p[i]);
5132 #if 0
5133 #ifdef USE_BACKREF_WITH_LEVEL
5134         if (NODE_IS_NEST_LEVEL(node)) {
5135           MEM_STATUS_ON(env->backtrack_mem, p[i]);
5136         }
5137 #endif
5138 #else
5139         /* More precisely, it should be checked whether alt/repeat exists before
5140            the subject capture node, and then this backreference position
5141            exists before (or in) the capture node. */
5142         MEM_STATUS_ON(env->backtrack_mem, p[i]);
5143 #endif
5144       }
5145     }
5146     break;
5147 
5148   case NODE_BAG:
5149     {
5150       BagNode* en = BAG_(node);
5151 
5152       switch (en->type) {
5153       case BAG_OPTION:
5154         {
5155           OnigOptionType options = reg->options;
5156           reg->options = BAG_(node)->o.options;
5157           r = tune_tree(NODE_BODY(node), reg, state, env);
5158           reg->options = options;
5159         }
5160         break;
5161 
5162       case BAG_MEMORY:
5163 #ifdef USE_CALL
5164         state |= en->m.called_state;
5165 #endif
5166 
5167         if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_MULTI_ENTRY)) != 0
5168             || NODE_IS_RECURSION(node)) {
5169           MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
5170         }
5171         r = tune_tree(NODE_BODY(node), reg, state, env);
5172         break;
5173 
5174       case BAG_STOP_BACKTRACK:
5175         {
5176           Node* target = NODE_BODY(node);
5177           r = tune_tree(target, reg, state, env);
5178           if (NODE_TYPE(target) == NODE_QUANT) {
5179             QuantNode* tqn = QUANT_(target);
5180             if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 &&
5181                 tqn->greedy != 0) {  /* (?>a*), a*+ etc... */
5182               if (is_strict_real_node(NODE_BODY(target)))
5183                 NODE_STATUS_ADD(node, STRICT_REAL_REPEAT);
5184             }
5185           }
5186         }
5187         break;
5188 
5189       case BAG_IF_ELSE:
5190         r = tune_tree(NODE_BODY(node), reg, (state | IN_ALT), env);
5191         if (r != 0) return r;
5192         if (IS_NOT_NULL(en->te.Then)) {
5193           r = tune_tree(en->te.Then, reg, (state | IN_ALT), env);
5194           if (r != 0) return r;
5195         }
5196         if (IS_NOT_NULL(en->te.Else))
5197           r = tune_tree(en->te.Else, reg, (state | IN_ALT), env);
5198         break;
5199       }
5200     }
5201     break;
5202 
5203   case NODE_QUANT:
5204     r = tune_quant(node, reg, state, env);
5205     break;
5206 
5207   case NODE_ANCHOR:
5208     r = tune_anchor(node, reg, state, env);
5209     break;
5210 
5211 #ifdef USE_CALL
5212   case NODE_CALL:
5213 #endif
5214   case NODE_CTYPE:
5215   case NODE_CCLASS:
5216   case NODE_GIMMICK:
5217   default:
5218     break;
5219   }
5220 
5221   return r;
5222 }
5223 
5224 static int
set_sunday_quick_search_or_bmh_skip_table(regex_t * reg,int case_expand,UChar * s,UChar * end,UChar skip[],int * roffset)5225 set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand,
5226                                           UChar* s, UChar* end,
5227                                           UChar skip[], int* roffset)
5228 {
5229   int i, j, k, len, offset;
5230   int n, clen;
5231   UChar* p;
5232   OnigEncoding enc;
5233   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
5234   UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
5235 
5236   enc = reg->enc;
5237   offset = ENC_GET_SKIP_OFFSET(enc);
5238   if (offset == ENC_SKIP_OFFSET_1_OR_0) {
5239     UChar* p = s;
5240     while (1) {
5241       len = enclen(enc, p);
5242       if (p + len >= end) {
5243         if (len == 1) offset = 1;
5244         else          offset = 0;
5245         break;
5246       }
5247       p += len;
5248     }
5249   }
5250 
5251   len = (int )(end - s);
5252   if (len + offset >= UCHAR_MAX)
5253     return ONIGERR_PARSER_BUG;
5254 
5255   *roffset = offset;
5256 
5257   for (i = 0; i < CHAR_MAP_SIZE; i++) {
5258     skip[i] = (UChar )(len + offset);
5259   }
5260 
5261   for (p = s; p < end; ) {
5262     int z;
5263 
5264     clen = enclen(enc, p);
5265     if (p + clen > end) clen = (int )(end - p);
5266 
5267     len = (int )(end - p);
5268     for (j = 0; j < clen; j++) {
5269       z = len - j + (offset - 1);
5270       if (z <= 0) break;
5271       skip[p[j]] = z;
5272     }
5273 
5274     if (case_expand != 0) {
5275       n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
5276                                              p, end, items);
5277       for (k = 0; k < n; k++) {
5278         ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf);
5279         for (j = 0; j < clen; j++) {
5280           z = len - j + (offset - 1);
5281           if (z <= 0) break;
5282           if (skip[buf[j]] > z)
5283             skip[buf[j]] = z;
5284         }
5285       }
5286     }
5287 
5288     p += clen;
5289   }
5290 
5291   return 0;
5292 }
5293 
5294 
5295 #define OPT_EXACT_MAXLEN   24
5296 
5297 #if OPT_EXACT_MAXLEN >= UCHAR_MAX
5298 #error Too big OPT_EXACT_MAXLEN
5299 #endif
5300 
5301 typedef struct {
5302   OnigLen min;  /* min byte length */
5303   OnigLen max;  /* max byte length */
5304 } MinMax;
5305 
5306 typedef struct {
5307   MinMax           mm;
5308   OnigEncoding     enc;
5309   OnigOptionType   options;
5310   OnigCaseFoldType case_fold_flag;
5311   ScanEnv*         scan_env;
5312 } OptEnv;
5313 
5314 typedef struct {
5315   int left;
5316   int right;
5317 } OptAnc;
5318 
5319 typedef struct {
5320   MinMax     mm;   /* position */
5321   OptAnc     anc;
5322   int        reach_end;
5323   int        case_fold;
5324   int        len;
5325   UChar      s[OPT_EXACT_MAXLEN];
5326 } OptStr;
5327 
5328 typedef struct {
5329   MinMax    mm;     /* position */
5330   OptAnc    anc;
5331   int       value;  /* weighted value */
5332   UChar     map[CHAR_MAP_SIZE];
5333 } OptMap;
5334 
5335 typedef struct {
5336   MinMax  len;
5337   OptAnc  anc;
5338   OptStr  sb;     /* boundary */
5339   OptStr  sm;     /* middle */
5340   OptStr  spr;    /* prec read (?=...) */
5341   OptMap  map;    /* boundary */
5342 } OptNode;
5343 
5344 
5345 static int
map_position_value(OnigEncoding enc,int i)5346 map_position_value(OnigEncoding enc, int i)
5347 {
5348   static const short int Vals[] = {
5349      5,  1,  1,  1,  1,  1,  1,  1,  1, 10, 10,  1,  1, 10,  1,  1,
5350      1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
5351     12,  4,  7,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,
5352      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,
5353      5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
5354      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  6,  5,  5,  5,
5355      5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
5356      6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  1
5357   };
5358 
5359   if (i < (int )(sizeof(Vals)/sizeof(Vals[0]))) {
5360     if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
5361       return 20;
5362     else
5363       return (int )Vals[i];
5364   }
5365   else
5366     return 4;   /* Take it easy. */
5367 }
5368 
5369 static int
distance_value(MinMax * mm)5370 distance_value(MinMax* mm)
5371 {
5372   /* 1000 / (min-max-dist + 1) */
5373   static const short int dist_vals[] = {
5374     1000,  500,  333,  250,  200,  167,  143,  125,  111,  100,
5375       91,   83,   77,   71,   67,   63,   59,   56,   53,   50,
5376       48,   45,   43,   42,   40,   38,   37,   36,   34,   33,
5377       32,   31,   30,   29,   29,   28,   27,   26,   26,   25,
5378       24,   24,   23,   23,   22,   22,   21,   21,   20,   20,
5379       20,   19,   19,   19,   18,   18,   18,   17,   17,   17,
5380       16,   16,   16,   16,   15,   15,   15,   15,   14,   14,
5381       14,   14,   14,   14,   13,   13,   13,   13,   13,   13,
5382       12,   12,   12,   12,   12,   12,   11,   11,   11,   11,
5383       11,   11,   11,   11,   11,   10,   10,   10,   10,   10
5384   };
5385 
5386   OnigLen d;
5387 
5388   if (mm->max == INFINITE_LEN) return 0;
5389 
5390   d = mm->max - mm->min;
5391   if (d < (OnigLen )(sizeof(dist_vals)/sizeof(dist_vals[0])))
5392     /* return dist_vals[d] * 16 / (mm->min + 12); */
5393     return (int )dist_vals[d];
5394   else
5395     return 1;
5396 }
5397 
5398 static int
comp_distance_value(MinMax * d1,MinMax * d2,int v1,int v2)5399 comp_distance_value(MinMax* d1, MinMax* d2, int v1, int v2)
5400 {
5401   if (v2 <= 0) return -1;
5402   if (v1 <= 0) return  1;
5403 
5404   v1 *= distance_value(d1);
5405   v2 *= distance_value(d2);
5406 
5407   if (v2 > v1) return  1;
5408   if (v2 < v1) return -1;
5409 
5410   if (d2->min < d1->min) return  1;
5411   if (d2->min > d1->min) return -1;
5412   return 0;
5413 }
5414 
5415 static int
is_equal_mml(MinMax * a,MinMax * b)5416 is_equal_mml(MinMax* a, MinMax* b)
5417 {
5418   return a->min == b->min && a->max == b->max;
5419 }
5420 
5421 static void
set_mml(MinMax * l,OnigLen min,OnigLen max)5422 set_mml(MinMax* l, OnigLen min, OnigLen max)
5423 {
5424   l->min = min;
5425   l->max = max;
5426 }
5427 
5428 static void
clear_mml(MinMax * l)5429 clear_mml(MinMax* l)
5430 {
5431   l->min = l->max = 0;
5432 }
5433 
5434 static void
copy_mml(MinMax * to,MinMax * from)5435 copy_mml(MinMax* to, MinMax* from)
5436 {
5437   to->min = from->min;
5438   to->max = from->max;
5439 }
5440 
5441 static void
add_mml(MinMax * to,MinMax * from)5442 add_mml(MinMax* to, MinMax* from)
5443 {
5444   to->min = distance_add(to->min, from->min);
5445   to->max = distance_add(to->max, from->max);
5446 }
5447 
5448 static void
alt_merge_mml(MinMax * to,MinMax * from)5449 alt_merge_mml(MinMax* to, MinMax* from)
5450 {
5451   if (to->min > from->min) to->min = from->min;
5452   if (to->max < from->max) to->max = from->max;
5453 }
5454 
5455 static void
copy_opt_env(OptEnv * to,OptEnv * from)5456 copy_opt_env(OptEnv* to, OptEnv* from)
5457 {
5458   *to = *from;
5459 }
5460 
5461 static void
clear_opt_anc_info(OptAnc * a)5462 clear_opt_anc_info(OptAnc* a)
5463 {
5464   a->left  = 0;
5465   a->right = 0;
5466 }
5467 
5468 static void
copy_opt_anc_info(OptAnc * to,OptAnc * from)5469 copy_opt_anc_info(OptAnc* to, OptAnc* from)
5470 {
5471   *to = *from;
5472 }
5473 
5474 static void
concat_opt_anc_info(OptAnc * to,OptAnc * left,OptAnc * right,OnigLen left_len,OnigLen right_len)5475 concat_opt_anc_info(OptAnc* to, OptAnc* left, OptAnc* right,
5476                     OnigLen left_len, OnigLen right_len)
5477 {
5478   clear_opt_anc_info(to);
5479 
5480   to->left = left->left;
5481   if (left_len == 0) {
5482     to->left |= right->left;
5483   }
5484 
5485   to->right = right->right;
5486   if (right_len == 0) {
5487     to->right |= left->right;
5488   }
5489   else {
5490     to->right |= (left->right & ANCR_PREC_READ_NOT);
5491   }
5492 }
5493 
5494 static int
is_left(int a)5495 is_left(int a)
5496 {
5497   if (a == ANCR_END_BUF  || a == ANCR_SEMI_END_BUF ||
5498       a == ANCR_END_LINE || a == ANCR_PREC_READ || a == ANCR_PREC_READ_NOT)
5499     return 0;
5500 
5501   return 1;
5502 }
5503 
5504 static int
is_set_opt_anc_info(OptAnc * to,int anc)5505 is_set_opt_anc_info(OptAnc* to, int anc)
5506 {
5507   if ((to->left & anc) != 0) return 1;
5508 
5509   return ((to->right & anc) != 0 ? 1 : 0);
5510 }
5511 
5512 static void
add_opt_anc_info(OptAnc * to,int anc)5513 add_opt_anc_info(OptAnc* to, int anc)
5514 {
5515   if (is_left(anc))
5516     to->left |= anc;
5517   else
5518     to->right |= anc;
5519 }
5520 
5521 static void
remove_opt_anc_info(OptAnc * to,int anc)5522 remove_opt_anc_info(OptAnc* to, int anc)
5523 {
5524   if (is_left(anc))
5525     to->left &= ~anc;
5526   else
5527     to->right &= ~anc;
5528 }
5529 
5530 static void
alt_merge_opt_anc_info(OptAnc * to,OptAnc * add)5531 alt_merge_opt_anc_info(OptAnc* to, OptAnc* add)
5532 {
5533   to->left  &= add->left;
5534   to->right &= add->right;
5535 }
5536 
5537 static int
is_full_opt_exact(OptStr * e)5538 is_full_opt_exact(OptStr* e)
5539 {
5540   return e->len >= OPT_EXACT_MAXLEN;
5541 }
5542 
5543 static void
clear_opt_exact(OptStr * e)5544 clear_opt_exact(OptStr* e)
5545 {
5546   clear_mml(&e->mm);
5547   clear_opt_anc_info(&e->anc);
5548   e->reach_end      = 0;
5549   e->case_fold      = 0;
5550   e->len            = 0;
5551   e->s[0]           = '\0';
5552 }
5553 
5554 static void
copy_opt_exact(OptStr * to,OptStr * from)5555 copy_opt_exact(OptStr* to, OptStr* from)
5556 {
5557   *to = *from;
5558 }
5559 
5560 static int
concat_opt_exact(OptStr * to,OptStr * add,OnigEncoding enc)5561 concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc)
5562 {
5563   int i, j, len, r;
5564   UChar *p, *end;
5565   OptAnc tanc;
5566 
5567   if (add->case_fold != 0) {
5568     if (! to->case_fold) {
5569       if (to->len > 1 || to->len >= add->len) return 0;  /* avoid */
5570 
5571       to->case_fold = 1;
5572     }
5573   }
5574 
5575   r = 0;
5576   p = add->s;
5577   end = p + add->len;
5578   for (i = to->len; p < end; ) {
5579     len = enclen(enc, p);
5580     if (i + len > OPT_EXACT_MAXLEN) {
5581       r = 1; /* 1:full */
5582       break;
5583     }
5584     for (j = 0; j < len && p < end; j++)
5585       to->s[i++] = *p++;
5586   }
5587 
5588   to->len = i;
5589   to->reach_end = (p == end ? add->reach_end : 0);
5590 
5591   concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
5592   if (! to->reach_end) tanc.right = 0;
5593   copy_opt_anc_info(&to->anc, &tanc);
5594 
5595   return r;
5596 }
5597 
5598 static void
concat_opt_exact_str(OptStr * to,UChar * s,UChar * end,OnigEncoding enc)5599 concat_opt_exact_str(OptStr* to, UChar* s, UChar* end, OnigEncoding enc)
5600 {
5601   int i, j, len;
5602   UChar *p;
5603 
5604   for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
5605     len = enclen(enc, p);
5606     if (i + len > OPT_EXACT_MAXLEN) break;
5607     for (j = 0; j < len && p < end; j++)
5608       to->s[i++] = *p++;
5609   }
5610 
5611   to->len = i;
5612 
5613   if (p >= end && to->len == (int )(end - s))
5614     to->reach_end = 1;
5615 }
5616 
5617 static void
alt_merge_opt_exact(OptStr * to,OptStr * add,OptEnv * env)5618 alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env)
5619 {
5620   int i, j, len;
5621 
5622   if (add->len == 0 || to->len == 0) {
5623     clear_opt_exact(to);
5624     return ;
5625   }
5626 
5627   if (! is_equal_mml(&to->mm, &add->mm)) {
5628     clear_opt_exact(to);
5629     return ;
5630   }
5631 
5632   for (i = 0; i < to->len && i < add->len; ) {
5633     if (to->s[i] != add->s[i]) break;
5634     len = enclen(env->enc, to->s + i);
5635 
5636     for (j = 1; j < len; j++) {
5637       if (to->s[i+j] != add->s[i+j]) break;
5638     }
5639     if (j < len) break;
5640     i += len;
5641   }
5642 
5643   if (! add->reach_end || i < add->len || i < to->len) {
5644     to->reach_end = 0;
5645   }
5646   to->len = i;
5647   if (add->case_fold != 0)
5648     to->case_fold = 1;
5649 
5650   alt_merge_opt_anc_info(&to->anc, &add->anc);
5651   if (! to->reach_end) to->anc.right = 0;
5652 }
5653 
5654 static void
select_opt_exact(OnigEncoding enc,OptStr * now,OptStr * alt)5655 select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt)
5656 {
5657   int vn, va;
5658 
5659   vn = now->len;
5660   va = alt->len;
5661 
5662   if (va == 0) {
5663     return ;
5664   }
5665   else if (vn == 0) {
5666     copy_opt_exact(now, alt);
5667     return ;
5668   }
5669   else if (vn <= 2 && va <= 2) {
5670     /* ByteValTable[x] is big value --> low price */
5671     va = map_position_value(enc, now->s[0]);
5672     vn = map_position_value(enc, alt->s[0]);
5673 
5674     if (now->len > 1) vn += 5;
5675     if (alt->len > 1) va += 5;
5676   }
5677 
5678   if (now->case_fold == 0) vn *= 2;
5679   if (alt->case_fold == 0) va *= 2;
5680 
5681   if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0)
5682     copy_opt_exact(now, alt);
5683 }
5684 
5685 static void
clear_opt_map(OptMap * map)5686 clear_opt_map(OptMap* map)
5687 {
5688   static const OptMap clean_info = {
5689     {0, 0}, {0, 0}, 0,
5690     {
5691       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5692       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5693       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5694       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5695       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5696       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5697       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5698       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5699       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5700       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5701       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5702       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5703       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5704       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5705       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5706       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
5707     }
5708   };
5709 
5710   xmemcpy(map, &clean_info, sizeof(OptMap));
5711 }
5712 
5713 static void
copy_opt_map(OptMap * to,OptMap * from)5714 copy_opt_map(OptMap* to, OptMap* from)
5715 {
5716   *to = *from;
5717 }
5718 
5719 static void
add_char_opt_map(OptMap * m,UChar c,OnigEncoding enc)5720 add_char_opt_map(OptMap* m, UChar c, OnigEncoding enc)
5721 {
5722   if (m->map[c] == 0) {
5723     m->map[c] = 1;
5724     m->value += map_position_value(enc, c);
5725   }
5726 }
5727 
5728 static int
add_char_amb_opt_map(OptMap * map,UChar * p,UChar * end,OnigEncoding enc,OnigCaseFoldType fold_flag)5729 add_char_amb_opt_map(OptMap* map, UChar* p, UChar* end,
5730                      OnigEncoding enc, OnigCaseFoldType fold_flag)
5731 {
5732   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
5733   UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5734   int i, n;
5735 
5736   add_char_opt_map(map, p[0], enc);
5737 
5738   fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(fold_flag);
5739   n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, fold_flag, p, end, items);
5740   if (n < 0) return n;
5741 
5742   for (i = 0; i < n; i++) {
5743     ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
5744     add_char_opt_map(map, buf[0], enc);
5745   }
5746 
5747   return 0;
5748 }
5749 
5750 static void
select_opt_map(OptMap * now,OptMap * alt)5751 select_opt_map(OptMap* now, OptMap* alt)
5752 {
5753   static int z = 1<<15; /* 32768: something big value */
5754 
5755   int vn, va;
5756 
5757   if (alt->value == 0) return ;
5758   if (now->value == 0) {
5759     copy_opt_map(now, alt);
5760     return ;
5761   }
5762 
5763   vn = z / now->value;
5764   va = z / alt->value;
5765   if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0)
5766     copy_opt_map(now, alt);
5767 }
5768 
5769 static int
comp_opt_exact_or_map(OptStr * e,OptMap * m)5770 comp_opt_exact_or_map(OptStr* e, OptMap* m)
5771 {
5772 #define COMP_EM_BASE  20
5773   int ae, am;
5774   int case_value;
5775 
5776   if (m->value <= 0) return -1;
5777 
5778   if (e->case_fold != 0) {
5779     case_value = 1;
5780   }
5781   else
5782     case_value = 3;
5783 
5784   ae = COMP_EM_BASE * e->len * case_value;
5785   am = COMP_EM_BASE * 5 * 2 / m->value;
5786   return comp_distance_value(&e->mm, &m->mm, ae, am);
5787 }
5788 
5789 static void
alt_merge_opt_map(OnigEncoding enc,OptMap * to,OptMap * add)5790 alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add)
5791 {
5792   int i, val;
5793 
5794   /* if (! is_equal_mml(&to->mm, &add->mm)) return ; */
5795   if (to->value == 0) return ;
5796   if (add->value == 0 || to->mm.max < add->mm.min) {
5797     clear_opt_map(to);
5798     return ;
5799   }
5800 
5801   alt_merge_mml(&to->mm, &add->mm);
5802 
5803   val = 0;
5804   for (i = 0; i < CHAR_MAP_SIZE; i++) {
5805     if (add->map[i])
5806       to->map[i] = 1;
5807 
5808     if (to->map[i])
5809       val += map_position_value(enc, i);
5810   }
5811   to->value = val;
5812 
5813   alt_merge_opt_anc_info(&to->anc, &add->anc);
5814 }
5815 
5816 static void
set_bound_node_opt_info(OptNode * opt,MinMax * plen)5817 set_bound_node_opt_info(OptNode* opt, MinMax* plen)
5818 {
5819   copy_mml(&(opt->sb.mm),  plen);
5820   copy_mml(&(opt->spr.mm), plen);
5821   copy_mml(&(opt->map.mm), plen);
5822 }
5823 
5824 static void
clear_node_opt_info(OptNode * opt)5825 clear_node_opt_info(OptNode* opt)
5826 {
5827   clear_mml(&opt->len);
5828   clear_opt_anc_info(&opt->anc);
5829   clear_opt_exact(&opt->sb);
5830   clear_opt_exact(&opt->sm);
5831   clear_opt_exact(&opt->spr);
5832   clear_opt_map(&opt->map);
5833 }
5834 
5835 static void
copy_node_opt_info(OptNode * to,OptNode * from)5836 copy_node_opt_info(OptNode* to, OptNode* from)
5837 {
5838   *to = *from;
5839 }
5840 
5841 static void
concat_left_node_opt_info(OnigEncoding enc,OptNode * to,OptNode * add)5842 concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add)
5843 {
5844   int sb_reach, sm_reach;
5845   OptAnc tanc;
5846 
5847   concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
5848   copy_opt_anc_info(&to->anc, &tanc);
5849 
5850   if (add->sb.len > 0 && to->len.max == 0) {
5851     concat_opt_anc_info(&tanc, &to->anc, &add->sb.anc, to->len.max, add->len.max);
5852     copy_opt_anc_info(&add->sb.anc, &tanc);
5853   }
5854 
5855   if (add->map.value > 0 && to->len.max == 0) {
5856     if (add->map.mm.max == 0)
5857       add->map.anc.left |= to->anc.left;
5858   }
5859 
5860   sb_reach = to->sb.reach_end;
5861   sm_reach = to->sm.reach_end;
5862 
5863   if (add->len.max != 0)
5864     to->sb.reach_end = to->sm.reach_end = 0;
5865 
5866   if (add->sb.len > 0) {
5867     if (sb_reach) {
5868       concat_opt_exact(&to->sb, &add->sb, enc);
5869       clear_opt_exact(&add->sb);
5870     }
5871     else if (sm_reach) {
5872       concat_opt_exact(&to->sm, &add->sb, enc);
5873       clear_opt_exact(&add->sb);
5874     }
5875   }
5876   select_opt_exact(enc, &to->sm, &add->sb);
5877   select_opt_exact(enc, &to->sm, &add->sm);
5878 
5879   if (to->spr.len > 0) {
5880     if (add->len.max > 0) {
5881       if (to->spr.mm.max == 0)
5882         select_opt_exact(enc, &to->sb, &to->spr);
5883       else
5884         select_opt_exact(enc, &to->sm, &to->spr);
5885     }
5886   }
5887   else if (add->spr.len > 0) {
5888     copy_opt_exact(&to->spr, &add->spr);
5889   }
5890 
5891   select_opt_map(&to->map, &add->map);
5892   add_mml(&to->len, &add->len);
5893 }
5894 
5895 static void
alt_merge_node_opt_info(OptNode * to,OptNode * add,OptEnv * env)5896 alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env)
5897 {
5898   alt_merge_opt_anc_info(&to->anc, &add->anc);
5899   alt_merge_opt_exact(&to->sb,  &add->sb, env);
5900   alt_merge_opt_exact(&to->sm,  &add->sm, env);
5901   alt_merge_opt_exact(&to->spr, &add->spr, env);
5902   alt_merge_opt_map(env->enc, &to->map, &add->map);
5903 
5904   alt_merge_mml(&to->len, &add->len);
5905 }
5906 
5907 
5908 #define MAX_NODE_OPT_INFO_REF_COUNT    5
5909 
5910 static int
optimize_nodes(Node * node,OptNode * opt,OptEnv * env)5911 optimize_nodes(Node* node, OptNode* opt, OptEnv* env)
5912 {
5913   int i;
5914   int r;
5915   OptNode xo;
5916   OnigEncoding enc;
5917 
5918   r = 0;
5919   enc = env->enc;
5920   clear_node_opt_info(opt);
5921   set_bound_node_opt_info(opt, &env->mm);
5922 
5923   switch (NODE_TYPE(node)) {
5924   case NODE_LIST:
5925     {
5926       OptEnv nenv;
5927       Node* nd = node;
5928 
5929       copy_opt_env(&nenv, env);
5930       do {
5931         r = optimize_nodes(NODE_CAR(nd), &xo, &nenv);
5932         if (r == 0) {
5933           add_mml(&nenv.mm, &xo.len);
5934           concat_left_node_opt_info(enc, opt, &xo);
5935         }
5936       } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd)));
5937     }
5938     break;
5939 
5940   case NODE_ALT:
5941     {
5942       Node* nd = node;
5943 
5944       do {
5945         r = optimize_nodes(NODE_CAR(nd), &xo, env);
5946         if (r == 0) {
5947           if (nd == node) copy_node_opt_info(opt, &xo);
5948           else            alt_merge_node_opt_info(opt, &xo, env);
5949         }
5950       } while ((r == 0) && IS_NOT_NULL(nd = NODE_CDR(nd)));
5951     }
5952     break;
5953 
5954   case NODE_STRING:
5955     {
5956       StrNode* sn = STR_(node);
5957       int slen = (int )(sn->end - sn->s);
5958 
5959       if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) {
5960         concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc);
5961         if (slen > 0) {
5962           add_char_opt_map(&opt->map, *(sn->s), enc);
5963         }
5964         set_mml(&opt->len, slen, slen);
5965       }
5966       else {
5967         int max, min;
5968 
5969         concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc);
5970         opt->sb.case_fold = 1;
5971 
5972         if (slen > 0) {
5973           r = add_char_amb_opt_map(&opt->map, sn->s, sn->end,
5974                                    enc, env->case_fold_flag);
5975           if (r != 0) break;
5976         }
5977 
5978         max = slen;
5979         min = sn->case_min_len * ONIGENC_MBC_MINLEN(enc);
5980         set_mml(&opt->len, min, max);
5981       }
5982     }
5983     break;
5984 
5985   case NODE_CCLASS:
5986     {
5987       int z;
5988       CClassNode* cc = CCLASS_(node);
5989 
5990       /* no need to check ignore case. (set in tune_tree()) */
5991 
5992       if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
5993         OnigLen min = ONIGENC_MBC_MINLEN(enc);
5994         OnigLen max = ONIGENC_MBC_MAXLEN_DIST(enc);
5995 
5996         set_mml(&opt->len, min, max);
5997       }
5998       else {
5999         for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
6000           z = BITSET_AT(cc->bs, i);
6001           if ((z && ! IS_NCCLASS_NOT(cc)) || (! z && IS_NCCLASS_NOT(cc))) {
6002             add_char_opt_map(&opt->map, (UChar )i, enc);
6003           }
6004         }
6005         set_mml(&opt->len, 1, 1);
6006       }
6007     }
6008     break;
6009 
6010   case NODE_CTYPE:
6011     {
6012       int min, max;
6013       int range;
6014 
6015       max = ONIGENC_MBC_MAXLEN_DIST(enc);
6016 
6017       if (max == 1) {
6018         min = 1;
6019 
6020         switch (CTYPE_(node)->ctype) {
6021         case CTYPE_ANYCHAR:
6022           break;
6023 
6024         case ONIGENC_CTYPE_WORD:
6025           range = CTYPE_(node)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
6026           if (CTYPE_(node)->not != 0) {
6027             for (i = 0; i < range; i++) {
6028               if (! ONIGENC_IS_CODE_WORD(enc, i)) {
6029                 add_char_opt_map(&opt->map, (UChar )i, enc);
6030               }
6031             }
6032             for (i = range; i < SINGLE_BYTE_SIZE; i++) {
6033               add_char_opt_map(&opt->map, (UChar )i, enc);
6034             }
6035           }
6036           else {
6037             for (i = 0; i < range; i++) {
6038               if (ONIGENC_IS_CODE_WORD(enc, i)) {
6039                 add_char_opt_map(&opt->map, (UChar )i, enc);
6040               }
6041             }
6042           }
6043           break;
6044         }
6045       }
6046       else {
6047         min = ONIGENC_MBC_MINLEN(enc);
6048       }
6049       set_mml(&opt->len, min, max);
6050     }
6051     break;
6052 
6053   case NODE_ANCHOR:
6054     switch (ANCHOR_(node)->type) {
6055     case ANCR_BEGIN_BUF:
6056     case ANCR_BEGIN_POSITION:
6057     case ANCR_BEGIN_LINE:
6058     case ANCR_END_BUF:
6059     case ANCR_SEMI_END_BUF:
6060     case ANCR_END_LINE:
6061     case ANCR_PREC_READ_NOT:
6062     case ANCR_LOOK_BEHIND:
6063       add_opt_anc_info(&opt->anc, ANCHOR_(node)->type);
6064       break;
6065 
6066     case ANCR_PREC_READ:
6067       {
6068         r = optimize_nodes(NODE_BODY(node), &xo, env);
6069         if (r == 0) {
6070           if (xo.sb.len > 0)
6071             copy_opt_exact(&opt->spr, &xo.sb);
6072           else if (xo.sm.len > 0)
6073             copy_opt_exact(&opt->spr, &xo.sm);
6074 
6075           opt->spr.reach_end = 0;
6076 
6077           if (xo.map.value > 0)
6078             copy_opt_map(&opt->map, &xo.map);
6079         }
6080       }
6081       break;
6082 
6083     case ANCR_LOOK_BEHIND_NOT:
6084       break;
6085     }
6086     break;
6087 
6088   case NODE_BACKREF:
6089     if (! NODE_IS_CHECKER(node)) {
6090       int* backs;
6091       OnigLen min, max, tmin, tmax;
6092       MemEnv* mem_env = SCANENV_MEMENV(env->scan_env);
6093       BackRefNode* br = BACKREF_(node);
6094 
6095       if (NODE_IS_RECURSION(node)) {
6096         set_mml(&opt->len, 0, INFINITE_LEN);
6097         break;
6098       }
6099       backs = BACKREFS_P(br);
6100       min = tree_min_len(mem_env[backs[0]].mem_node, env->scan_env);
6101       max = tree_max_len(mem_env[backs[0]].mem_node, env->scan_env);
6102       for (i = 1; i < br->back_num; i++) {
6103         tmin = tree_min_len(mem_env[backs[i]].mem_node, env->scan_env);
6104         tmax = tree_max_len(mem_env[backs[i]].mem_node, env->scan_env);
6105         if (min > tmin) min = tmin;
6106         if (max < tmax) max = tmax;
6107       }
6108       set_mml(&opt->len, min, max);
6109     }
6110     break;
6111 
6112 #ifdef USE_CALL
6113   case NODE_CALL:
6114     if (NODE_IS_RECURSION(node))
6115       set_mml(&opt->len, 0, INFINITE_LEN);
6116     else {
6117       OnigOptionType save = env->options;
6118       env->options = BAG_(NODE_BODY(node))->o.options;
6119       r = optimize_nodes(NODE_BODY(node), opt, env);
6120       env->options = save;
6121     }
6122     break;
6123 #endif
6124 
6125   case NODE_QUANT:
6126     {
6127       OnigLen min, max;
6128       QuantNode* qn = QUANT_(node);
6129 
6130       r = optimize_nodes(NODE_BODY(node), &xo, env);
6131       if (r != 0) break;
6132 
6133       if (qn->lower > 0) {
6134         copy_node_opt_info(opt, &xo);
6135         if (xo.sb.len > 0) {
6136           if (xo.sb.reach_end) {
6137             for (i = 2; i <= qn->lower && ! is_full_opt_exact(&opt->sb); i++) {
6138               int rc = concat_opt_exact(&opt->sb, &xo.sb, enc);
6139               if (rc > 0) break;
6140             }
6141             if (i < qn->lower) opt->sb.reach_end = 0;
6142           }
6143         }
6144 
6145         if (qn->lower != qn->upper) {
6146           opt->sb.reach_end = 0;
6147           opt->sm.reach_end = 0;
6148         }
6149         if (qn->lower > 1)
6150           opt->sm.reach_end = 0;
6151       }
6152 
6153       if (IS_INFINITE_REPEAT(qn->upper)) {
6154         if (env->mm.max == 0 &&
6155             NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) {
6156           if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env)))
6157             add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML);
6158           else
6159             add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF);
6160         }
6161 
6162         max = (xo.len.max > 0 ? INFINITE_LEN : 0);
6163       }
6164       else {
6165         max = distance_multiply(xo.len.max, qn->upper);
6166       }
6167 
6168       min = distance_multiply(xo.len.min, qn->lower);
6169       set_mml(&opt->len, min, max);
6170     }
6171     break;
6172 
6173   case NODE_BAG:
6174     {
6175       BagNode* en = BAG_(node);
6176 
6177       switch (en->type) {
6178       case BAG_OPTION:
6179         {
6180           OnigOptionType save = env->options;
6181 
6182           env->options = en->o.options;
6183           r = optimize_nodes(NODE_BODY(node), opt, env);
6184           env->options = save;
6185         }
6186         break;
6187 
6188       case BAG_MEMORY:
6189 #ifdef USE_CALL
6190         en->opt_count++;
6191         if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
6192           OnigLen min, max;
6193 
6194           min = 0;
6195           max = INFINITE_LEN;
6196           if (NODE_IS_MIN_FIXED(node)) min = en->min_len;
6197           if (NODE_IS_MAX_FIXED(node)) max = en->max_len;
6198           set_mml(&opt->len, min, max);
6199         }
6200         else
6201 #endif
6202           {
6203             r = optimize_nodes(NODE_BODY(node), opt, env);
6204             if (is_set_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK)) {
6205               if (MEM_STATUS_AT0(env->scan_env->backrefed_mem, en->m.regnum))
6206                 remove_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK);
6207             }
6208           }
6209         break;
6210 
6211       case BAG_STOP_BACKTRACK:
6212         r = optimize_nodes(NODE_BODY(node), opt, env);
6213         break;
6214 
6215       case BAG_IF_ELSE:
6216         {
6217           OptEnv nenv;
6218 
6219           copy_opt_env(&nenv, env);
6220           r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv);
6221           if (r == 0) {
6222             add_mml(&nenv.mm, &xo.len);
6223             concat_left_node_opt_info(enc, opt, &xo);
6224             if (IS_NOT_NULL(en->te.Then)) {
6225               r = optimize_nodes(en->te.Then, &xo, &nenv);
6226               if (r == 0) {
6227                 concat_left_node_opt_info(enc, opt, &xo);
6228               }
6229             }
6230 
6231             if (IS_NOT_NULL(en->te.Else)) {
6232               r = optimize_nodes(en->te.Else, &xo, env);
6233               if (r == 0)
6234                 alt_merge_node_opt_info(opt, &xo, env);
6235             }
6236           }
6237         }
6238         break;
6239       }
6240     }
6241     break;
6242 
6243   case NODE_GIMMICK:
6244     break;
6245 
6246   default:
6247 #ifdef ONIG_DEBUG
6248     fprintf(stderr, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node));
6249 #endif
6250     r = ONIGERR_TYPE_BUG;
6251     break;
6252   }
6253 
6254   return r;
6255 }
6256 
6257 static int
set_optimize_exact(regex_t * reg,OptStr * e)6258 set_optimize_exact(regex_t* reg, OptStr* e)
6259 {
6260   int r;
6261 
6262   if (e->len == 0) return 0;
6263 
6264   reg->exact = (UChar* )xmalloc(e->len);
6265   CHECK_NULL_RETURN_MEMERR(reg->exact);
6266   xmemcpy(reg->exact, e->s, e->len);
6267   reg->exact_end = reg->exact + e->len;
6268 
6269   if (e->case_fold) {
6270     reg->optimize = OPTIMIZE_STR_CASE_FOLD;
6271   }
6272   else {
6273     int allow_reverse;
6274 
6275     allow_reverse =
6276       ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
6277 
6278     if (e->len >= 2 || (e->len >= 1 && allow_reverse)) {
6279       r = set_sunday_quick_search_or_bmh_skip_table(reg, 0,
6280                                          reg->exact, reg->exact_end,
6281                                          reg->map, &(reg->map_offset));
6282       if (r != 0) return r;
6283 
6284       reg->optimize = (allow_reverse != 0
6285                        ? OPTIMIZE_STR_FAST
6286                        : OPTIMIZE_STR_FAST_STEP_FORWARD);
6287     }
6288     else {
6289       reg->optimize = OPTIMIZE_STR;
6290     }
6291   }
6292 
6293   reg->dist_min = e->mm.min;
6294   reg->dist_max = e->mm.max;
6295 
6296   if (reg->dist_min != INFINITE_LEN) {
6297     int n;
6298     if (e->case_fold != 0)
6299       n = 1;
6300     else
6301       n = (int )(reg->exact_end - reg->exact);
6302 
6303     reg->threshold_len = reg->dist_min + n;
6304   }
6305 
6306   return 0;
6307 }
6308 
6309 static void
set_optimize_map(regex_t * reg,OptMap * m)6310 set_optimize_map(regex_t* reg, OptMap* m)
6311 {
6312   int i;
6313 
6314   for (i = 0; i < CHAR_MAP_SIZE; i++)
6315     reg->map[i] = m->map[i];
6316 
6317   reg->optimize   = OPTIMIZE_MAP;
6318   reg->dist_min   = m->mm.min;
6319   reg->dist_max   = m->mm.max;
6320 
6321   if (reg->dist_min != INFINITE_LEN) {
6322     reg->threshold_len = reg->dist_min + 1;
6323   }
6324 }
6325 
6326 static void
set_sub_anchor(regex_t * reg,OptAnc * anc)6327 set_sub_anchor(regex_t* reg, OptAnc* anc)
6328 {
6329   reg->sub_anchor |= anc->left  & ANCR_BEGIN_LINE;
6330   reg->sub_anchor |= anc->right & ANCR_END_LINE;
6331 }
6332 
6333 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6334 static void print_optimize_info(FILE* f, regex_t* reg);
6335 #endif
6336 
6337 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)6338 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
6339 {
6340   int r;
6341   OptNode opt;
6342   OptEnv env;
6343 
6344   env.enc            = reg->enc;
6345   env.options        = reg->options;
6346   env.case_fold_flag = reg->case_fold_flag;
6347   env.scan_env       = scan_env;
6348   clear_mml(&env.mm);
6349 
6350   r = optimize_nodes(node, &opt, &env);
6351   if (r != 0) return r;
6352 
6353   reg->anchor = opt.anc.left & (ANCR_BEGIN_BUF |
6354         ANCR_BEGIN_POSITION | ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML |
6355         ANCR_LOOK_BEHIND);
6356 
6357   if ((opt.anc.left & (ANCR_LOOK_BEHIND | ANCR_PREC_READ_NOT)) != 0)
6358     reg->anchor &= ~ANCR_ANYCHAR_INF_ML;
6359 
6360   reg->anchor |= opt.anc.right & (ANCR_END_BUF | ANCR_SEMI_END_BUF |
6361                                   ANCR_PREC_READ_NOT);
6362 
6363   if (reg->anchor & (ANCR_END_BUF | ANCR_SEMI_END_BUF)) {
6364     reg->anc_dist_min = opt.len.min;
6365     reg->anc_dist_max = opt.len.max;
6366   }
6367 
6368   if (opt.sb.len > 0 || opt.sm.len > 0) {
6369     select_opt_exact(reg->enc, &opt.sb, &opt.sm);
6370     if (opt.map.value > 0 && comp_opt_exact_or_map(&opt.sb, &opt.map) > 0) {
6371       goto set_map;
6372     }
6373     else {
6374       r = set_optimize_exact(reg, &opt.sb);
6375       set_sub_anchor(reg, &opt.sb.anc);
6376     }
6377   }
6378   else if (opt.map.value > 0) {
6379   set_map:
6380     set_optimize_map(reg, &opt.map);
6381     set_sub_anchor(reg, &opt.map.anc);
6382   }
6383   else {
6384     reg->sub_anchor |= opt.anc.left & ANCR_BEGIN_LINE;
6385     if (opt.len.max == 0)
6386       reg->sub_anchor |= opt.anc.right & ANCR_END_LINE;
6387   }
6388 
6389 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6390   print_optimize_info(stderr, reg);
6391 #endif
6392   return r;
6393 }
6394 
6395 static void
clear_optimize_info(regex_t * reg)6396 clear_optimize_info(regex_t* reg)
6397 {
6398   reg->optimize      = OPTIMIZE_NONE;
6399   reg->anchor        = 0;
6400   reg->anc_dist_min  = 0;
6401   reg->anc_dist_max  = 0;
6402   reg->sub_anchor    = 0;
6403   reg->exact_end     = (UChar* )NULL;
6404   reg->map_offset    = 0;
6405   reg->threshold_len = 0;
6406   if (IS_NOT_NULL(reg->exact)) {
6407     xfree(reg->exact);
6408     reg->exact = (UChar* )NULL;
6409   }
6410 }
6411 
6412 #ifdef ONIG_DEBUG
6413 
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)6414 static void print_enc_string(FILE* fp, OnigEncoding enc,
6415                              const UChar *s, const UChar *end)
6416 {
6417   fprintf(fp, "\nPATTERN: /");
6418 
6419   if (ONIGENC_MBC_MINLEN(enc) > 1) {
6420     const UChar *p;
6421     OnigCodePoint code;
6422 
6423     p = s;
6424     while (p < end) {
6425       code = ONIGENC_MBC_TO_CODE(enc, p, end);
6426       if (code >= 0x80) {
6427         fprintf(fp, " 0x%04x ", (int )code);
6428       }
6429       else {
6430         fputc((int )code, fp);
6431       }
6432 
6433       p += enclen(enc, p);
6434     }
6435   }
6436   else {
6437     while (s < end) {
6438       fputc((int )*s, fp);
6439       s++;
6440     }
6441   }
6442 
6443   fprintf(fp, "/\n");
6444 }
6445 
6446 #endif /* ONIG_DEBUG */
6447 
6448 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6449 
6450 static void
print_distance_range(FILE * f,OnigLen a,OnigLen b)6451 print_distance_range(FILE* f, OnigLen a, OnigLen b)
6452 {
6453   if (a == INFINITE_LEN)
6454     fputs("inf", f);
6455   else
6456     fprintf(f, "(%u)", a);
6457 
6458   fputs("-", f);
6459 
6460   if (b == INFINITE_LEN)
6461     fputs("inf", f);
6462   else
6463     fprintf(f, "(%u)", b);
6464 }
6465 
6466 static void
print_anchor(FILE * f,int anchor)6467 print_anchor(FILE* f, int anchor)
6468 {
6469   int q = 0;
6470 
6471   fprintf(f, "[");
6472 
6473   if (anchor & ANCR_BEGIN_BUF) {
6474     fprintf(f, "begin-buf");
6475     q = 1;
6476   }
6477   if (anchor & ANCR_BEGIN_LINE) {
6478     if (q) fprintf(f, ", ");
6479     q = 1;
6480     fprintf(f, "begin-line");
6481   }
6482   if (anchor & ANCR_BEGIN_POSITION) {
6483     if (q) fprintf(f, ", ");
6484     q = 1;
6485     fprintf(f, "begin-pos");
6486   }
6487   if (anchor & ANCR_END_BUF) {
6488     if (q) fprintf(f, ", ");
6489     q = 1;
6490     fprintf(f, "end-buf");
6491   }
6492   if (anchor & ANCR_SEMI_END_BUF) {
6493     if (q) fprintf(f, ", ");
6494     q = 1;
6495     fprintf(f, "semi-end-buf");
6496   }
6497   if (anchor & ANCR_END_LINE) {
6498     if (q) fprintf(f, ", ");
6499     q = 1;
6500     fprintf(f, "end-line");
6501   }
6502   if (anchor & ANCR_ANYCHAR_INF) {
6503     if (q) fprintf(f, ", ");
6504     q = 1;
6505     fprintf(f, "anychar-inf");
6506   }
6507   if (anchor & ANCR_ANYCHAR_INF_ML) {
6508     if (q) fprintf(f, ", ");
6509     fprintf(f, "anychar-inf-ml");
6510   }
6511 
6512   fprintf(f, "]");
6513 }
6514 
6515 static void
print_optimize_info(FILE * f,regex_t * reg)6516 print_optimize_info(FILE* f, regex_t* reg)
6517 {
6518   static const char* on[] = { "NONE", "STR",
6519                               "STR_FAST", "STR_FAST_STEP_FORWARD",
6520                               "STR_CASE_FOLD", "MAP" };
6521 
6522   fprintf(f, "optimize: %s\n", on[reg->optimize]);
6523   fprintf(f, "  anchor: "); print_anchor(f, reg->anchor);
6524   if ((reg->anchor & ANCR_END_BUF_MASK) != 0)
6525     print_distance_range(f, reg->anc_dist_min, reg->anc_dist_max);
6526   fprintf(f, "\n");
6527 
6528   if (reg->optimize) {
6529     fprintf(f, "  sub anchor: "); print_anchor(f, reg->sub_anchor);
6530     fprintf(f, "\n");
6531   }
6532   fprintf(f, "\n");
6533 
6534   if (reg->exact) {
6535     UChar *p;
6536     fprintf(f, "exact: [");
6537     for (p = reg->exact; p < reg->exact_end; p++) {
6538       fputc(*p, f);
6539     }
6540     fprintf(f, "]: length: %ld\n", (reg->exact_end - reg->exact));
6541   }
6542   else if (reg->optimize & OPTIMIZE_MAP) {
6543     int c, i, n = 0;
6544 
6545     for (i = 0; i < CHAR_MAP_SIZE; i++)
6546       if (reg->map[i]) n++;
6547 
6548     fprintf(f, "map: n=%d\n", n);
6549     if (n > 0) {
6550       c = 0;
6551       fputc('[', f);
6552       for (i = 0; i < CHAR_MAP_SIZE; i++) {
6553         if (reg->map[i] != 0) {
6554           if (c > 0)  fputs(", ", f);
6555           c++;
6556           if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
6557               ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
6558             fputc(i, f);
6559           else
6560             fprintf(f, "%d", i);
6561         }
6562       }
6563       fprintf(f, "]\n");
6564     }
6565   }
6566 }
6567 #endif
6568 
6569 
6570 extern RegexExt*
onig_get_regex_ext(regex_t * reg)6571 onig_get_regex_ext(regex_t* reg)
6572 {
6573   if (IS_NULL(reg->extp)) {
6574     RegexExt* ext = (RegexExt* )xmalloc(sizeof(*ext));
6575     if (IS_NULL(ext)) return 0;
6576 
6577     ext->pattern      = 0;
6578     ext->pattern_end  = 0;
6579 #ifdef USE_CALLOUT
6580     ext->tag_table    = 0;
6581     ext->callout_num  = 0;
6582     ext->callout_list_alloc = 0;
6583     ext->callout_list = 0;
6584 #endif
6585 
6586     reg->extp = ext;
6587   }
6588 
6589   return reg->extp;
6590 }
6591 
6592 static void
free_regex_ext(RegexExt * ext)6593 free_regex_ext(RegexExt* ext)
6594 {
6595   if (IS_NOT_NULL(ext)) {
6596     if (IS_NOT_NULL(ext->pattern))
6597       xfree((void* )ext->pattern);
6598 
6599 #ifdef USE_CALLOUT
6600     if (IS_NOT_NULL(ext->tag_table))
6601       onig_callout_tag_table_free(ext->tag_table);
6602 
6603     if (IS_NOT_NULL(ext->callout_list))
6604       onig_free_reg_callout_list(ext->callout_num, ext->callout_list);
6605 #endif
6606 
6607     xfree(ext);
6608   }
6609 }
6610 
6611 extern int
onig_ext_set_pattern(regex_t * reg,const UChar * pattern,const UChar * pattern_end)6612 onig_ext_set_pattern(regex_t* reg, const UChar* pattern, const UChar* pattern_end)
6613 {
6614   RegexExt* ext;
6615   UChar* s;
6616 
6617   ext = onig_get_regex_ext(reg);
6618   CHECK_NULL_RETURN_MEMERR(ext);
6619 
6620   s = onigenc_strdup(reg->enc, pattern, pattern_end);
6621   CHECK_NULL_RETURN_MEMERR(s);
6622 
6623   ext->pattern     = s;
6624   ext->pattern_end = s + (pattern_end - pattern);
6625 
6626   return ONIG_NORMAL;
6627 }
6628 
6629 extern void
onig_free_body(regex_t * reg)6630 onig_free_body(regex_t* reg)
6631 {
6632   if (IS_NOT_NULL(reg)) {
6633     ops_free(reg);
6634     if (IS_NOT_NULL(reg->string_pool)) {
6635       xfree(reg->string_pool);
6636       reg->string_pool_end = reg->string_pool = 0;
6637     }
6638     if (IS_NOT_NULL(reg->exact))            xfree(reg->exact);
6639     if (IS_NOT_NULL(reg->repeat_range))     xfree(reg->repeat_range);
6640     if (IS_NOT_NULL(reg->extp)) {
6641       free_regex_ext(reg->extp);
6642       reg->extp = 0;
6643     }
6644 
6645     onig_names_free(reg);
6646   }
6647 }
6648 
6649 extern void
onig_free(regex_t * reg)6650 onig_free(regex_t* reg)
6651 {
6652   if (IS_NOT_NULL(reg)) {
6653     onig_free_body(reg);
6654     xfree(reg);
6655   }
6656 }
6657 
6658 
6659 #ifdef ONIG_DEBUG_PARSE
6660 static void print_tree P_((FILE* f, Node* node));
6661 #endif
6662 
6663 extern int onig_init_for_match_at(regex_t* reg);
6664 
6665 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)6666 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
6667              OnigErrorInfo* einfo)
6668 {
6669   int r;
6670   Node*  root;
6671   ScanEnv  scan_env;
6672 #ifdef USE_CALL
6673   UnsetAddrList  uslist = {0};
6674 #endif
6675 
6676   root = 0;
6677   if (IS_NOT_NULL(einfo)) {
6678     einfo->enc = reg->enc;
6679     einfo->par = (UChar* )NULL;
6680   }
6681 
6682 #ifdef ONIG_DEBUG
6683   print_enc_string(stderr, reg->enc, pattern, pattern_end);
6684 #endif
6685 
6686   if (reg->ops_alloc == 0) {
6687     r = ops_init(reg, OPS_INIT_SIZE);
6688     if (r != 0) goto end;
6689   }
6690   else
6691     reg->ops_used = 0;
6692 
6693   reg->string_pool        = 0;
6694   reg->string_pool_end    = 0;
6695   reg->num_mem            = 0;
6696   reg->num_repeat         = 0;
6697   reg->num_empty_check    = 0;
6698   reg->repeat_range_alloc = 0;
6699   reg->repeat_range       = (RepeatRange* )NULL;
6700   reg->empty_status_mem   = 0;
6701 
6702   r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env);
6703   if (r != 0) goto err;
6704 
6705   r = reduce_string_list(root);
6706   if (r != 0) goto err;
6707 
6708   /* mixed use named group and no-named group */
6709   if (scan_env.num_named > 0 &&
6710       IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
6711       ! ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
6712     if (scan_env.num_named != scan_env.num_mem)
6713       r = disable_noname_group_capture(&root, reg, &scan_env);
6714     else
6715       r = numbered_ref_check(root);
6716 
6717     if (r != 0) goto err;
6718   }
6719 
6720   r = check_backrefs(root, &scan_env);
6721   if (r != 0) goto err;
6722 
6723 #ifdef USE_CALL
6724   if (scan_env.num_call > 0) {
6725     r = unset_addr_list_init(&uslist, scan_env.num_call);
6726     if (r != 0) goto err;
6727     scan_env.unset_addr_list = &uslist;
6728     r = tune_call(root, &scan_env, 0);
6729     if (r != 0) goto err_unset;
6730     r = tune_call2(root);
6731     if (r != 0) goto err_unset;
6732     r = recursive_call_check_trav(root, &scan_env, 0);
6733     if (r  < 0) goto err_unset;
6734     r = infinite_recursive_call_check_trav(root, &scan_env);
6735     if (r != 0) goto err_unset;
6736 
6737     tune_called_state(root, 0);
6738   }
6739 
6740   reg->num_call = scan_env.num_call;
6741 #endif
6742 
6743 #ifdef ONIG_DEBUG_PARSE
6744   fprintf(stderr, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth);
6745   fprintf(stderr, "TREE (parsed)\n");
6746   print_tree(stderr, root);
6747   fprintf(stderr, "\n");
6748 #endif
6749 
6750   r = tune_tree(root, reg, 0, &scan_env);
6751   if (r != 0) goto err_unset;
6752 
6753   if (scan_env.backref_num != 0) {
6754     set_parent_node_trav(root, NULL_NODE);
6755     r = set_empty_repeat_node_trav(root, NULL_NODE, &scan_env);
6756     if (r != 0) goto err_unset;
6757     set_empty_status_check_trav(root, &scan_env);
6758   }
6759 
6760 #ifdef ONIG_DEBUG_PARSE
6761   fprintf(stderr, "TREE (after tune)\n");
6762   print_tree(stderr, root);
6763   fprintf(stderr, "\n");
6764 #endif
6765 
6766   reg->capture_history  = scan_env.cap_history;
6767   reg->push_mem_start   = scan_env.backtrack_mem | scan_env.cap_history;
6768 
6769 #ifdef USE_CALLOUT
6770   if (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) {
6771     reg->push_mem_end = reg->push_mem_start;
6772   }
6773   else {
6774     if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start))
6775       reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history;
6776     else
6777       reg->push_mem_end = reg->push_mem_start &
6778                         (scan_env.backrefed_mem | scan_env.cap_history);
6779   }
6780 #else
6781   if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start))
6782     reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history;
6783   else
6784     reg->push_mem_end = reg->push_mem_start &
6785                       (scan_env.backrefed_mem | scan_env.cap_history);
6786 #endif
6787 
6788   clear_optimize_info(reg);
6789 #ifndef ONIG_DONT_OPTIMIZE
6790   r = set_optimize_info_from_tree(root, reg, &scan_env);
6791   if (r != 0) goto err_unset;
6792 #endif
6793 
6794   if (IS_NOT_NULL(scan_env.mem_env_dynamic)) {
6795     xfree(scan_env.mem_env_dynamic);
6796     scan_env.mem_env_dynamic = (MemEnv* )NULL;
6797   }
6798 
6799   r = compile_tree(root, reg, &scan_env);
6800   if (r == 0) {
6801     if (scan_env.keep_num > 0) {
6802       r = add_op(reg, OP_UPDATE_VAR);
6803       if (r != 0) goto err;
6804 
6805       COP(reg)->update_var.type = UPDATE_VAR_KEEP_FROM_STACK_LAST;
6806       COP(reg)->update_var.id   = 0; /* not used */
6807     }
6808 
6809     r = add_op(reg, OP_END);
6810     if (r != 0) goto err;
6811 
6812 #ifdef USE_CALL
6813     if (scan_env.num_call > 0) {
6814       r = fix_unset_addr_list(&uslist, reg);
6815       unset_addr_list_end(&uslist);
6816       if (r != 0) goto err;
6817     }
6818 #endif
6819 
6820     set_addr_in_repeat_range(reg);
6821 
6822     if ((reg->push_mem_end != 0)
6823 #ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR
6824         || (reg->num_repeat      != 0)
6825         || (reg->num_empty_check != 0)
6826 #endif
6827 #ifdef USE_CALLOUT
6828         || (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0)
6829 #endif
6830         )
6831       reg->stack_pop_level = STACK_POP_LEVEL_ALL;
6832     else {
6833       if (reg->push_mem_start != 0)
6834         reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
6835       else
6836         reg->stack_pop_level = STACK_POP_LEVEL_FREE;
6837     }
6838 
6839     r = ops_make_string_pool(reg);
6840     if (r != 0) goto err;
6841   }
6842 #ifdef USE_CALL
6843   else if (scan_env.num_call > 0) {
6844     unset_addr_list_end(&uslist);
6845   }
6846 #endif
6847   onig_node_free(root);
6848 
6849 #ifdef ONIG_DEBUG_COMPILE
6850   onig_print_names(stderr, reg);
6851   onig_print_compiled_byte_code_list(stderr, reg);
6852 #endif
6853 
6854 #ifdef USE_DIRECT_THREADED_CODE
6855   /* opcode -> opaddr */
6856   onig_init_for_match_at(reg);
6857 #endif
6858 
6859  end:
6860   return r;
6861 
6862  err_unset:
6863 #ifdef USE_CALL
6864   if (scan_env.num_call > 0) {
6865     unset_addr_list_end(&uslist);
6866   }
6867 #endif
6868  err:
6869   if (IS_NOT_NULL(scan_env.error)) {
6870     if (IS_NOT_NULL(einfo)) {
6871       einfo->par     = scan_env.error;
6872       einfo->par_end = scan_env.error_end;
6873     }
6874   }
6875 
6876   onig_node_free(root);
6877   if (IS_NOT_NULL(scan_env.mem_env_dynamic))
6878       xfree(scan_env.mem_env_dynamic);
6879   return r;
6880 }
6881 
6882 
6883 static int onig_inited = 0;
6884 
6885 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)6886 onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_flag,
6887               OnigEncoding enc, OnigSyntaxType* syntax)
6888 {
6889   int r;
6890 
6891   xmemset(reg, 0, sizeof(*reg));
6892 
6893   if (onig_inited == 0) {
6894 #if 0
6895     return ONIGERR_LIBRARY_IS_NOT_INITIALIZED;
6896 #else
6897     r = onig_initialize(&enc, 1);
6898     if (r != 0)
6899       return ONIGERR_FAIL_TO_INITIALIZE;
6900 
6901     onig_warning("You didn't call onig_initialize() explicitly");
6902 #endif
6903   }
6904 
6905   if (IS_NULL(reg))
6906     return ONIGERR_INVALID_ARGUMENT;
6907 
6908   if (ONIGENC_IS_UNDEF(enc))
6909     return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
6910 
6911   if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
6912       == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
6913     return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
6914   }
6915 
6916   if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
6917     option |= syntax->options;
6918     option &= ~ONIG_OPTION_SINGLELINE;
6919   }
6920   else
6921     option |= syntax->options;
6922 
6923   (reg)->enc              = enc;
6924   (reg)->options          = option;
6925   (reg)->syntax           = syntax;
6926   (reg)->optimize         = 0;
6927   (reg)->exact            = (UChar* )NULL;
6928   (reg)->extp             = (RegexExt* )NULL;
6929 
6930   (reg)->ops              = (Operation* )NULL;
6931   (reg)->ops_curr         = (Operation* )NULL;
6932   (reg)->ops_used         = 0;
6933   (reg)->ops_alloc        = 0;
6934   (reg)->name_table       = (void* )NULL;
6935 
6936   (reg)->case_fold_flag   = case_fold_flag;
6937   return 0;
6938 }
6939 
6940 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)6941 onig_new_without_alloc(regex_t* reg,
6942                        const UChar* pattern, const UChar* pattern_end,
6943                        OnigOptionType option, OnigEncoding enc,
6944                        OnigSyntaxType* syntax, OnigErrorInfo* einfo)
6945 {
6946   int r;
6947 
6948   r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
6949   if (r != 0) return r;
6950 
6951   r = onig_compile(reg, pattern, pattern_end, einfo);
6952   return r;
6953 }
6954 
6955 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)6956 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
6957          OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
6958          OnigErrorInfo* einfo)
6959 {
6960   int r;
6961 
6962   *reg = (regex_t* )xmalloc(sizeof(regex_t));
6963   if (IS_NULL(*reg)) return ONIGERR_MEMORY;
6964 
6965   r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
6966   if (r != 0) {
6967     xfree(*reg);
6968     *reg = NULL;
6969     return r;
6970   }
6971 
6972   r = onig_compile(*reg, pattern, pattern_end, einfo);
6973   if (r != 0) {
6974     onig_free(*reg);
6975     *reg = NULL;
6976   }
6977   return r;
6978 }
6979 
6980 extern int
onig_initialize(OnigEncoding encodings[],int n)6981 onig_initialize(OnigEncoding encodings[], int n)
6982 {
6983   int i;
6984   int r;
6985 
6986   if (onig_inited != 0)
6987     return 0;
6988 
6989   onigenc_init();
6990 
6991   onig_inited = 1;
6992 
6993   for (i = 0; i < n; i++) {
6994     OnigEncoding enc = encodings[i];
6995     r = onig_initialize_encoding(enc);
6996     if (r != 0)
6997       return r;
6998   }
6999 
7000   return ONIG_NORMAL;
7001 }
7002 
7003 typedef struct EndCallListItem {
7004   struct EndCallListItem* next;
7005   void (*func)(void);
7006 } EndCallListItemType;
7007 
7008 static EndCallListItemType* EndCallTop;
7009 
onig_add_end_call(void (* func)(void))7010 extern void onig_add_end_call(void (*func)(void))
7011 {
7012   EndCallListItemType* item;
7013 
7014   item = (EndCallListItemType* )xmalloc(sizeof(*item));
7015   if (item == 0) return ;
7016 
7017   item->next = EndCallTop;
7018   item->func = func;
7019 
7020   EndCallTop = item;
7021 }
7022 
7023 static void
exec_end_call_list(void)7024 exec_end_call_list(void)
7025 {
7026   EndCallListItemType* prev;
7027   void (*func)(void);
7028 
7029   while (EndCallTop != 0) {
7030     func = EndCallTop->func;
7031     (*func)();
7032 
7033     prev = EndCallTop;
7034     EndCallTop = EndCallTop->next;
7035     xfree(prev);
7036   }
7037 }
7038 
7039 extern int
onig_end(void)7040 onig_end(void)
7041 {
7042   exec_end_call_list();
7043 
7044 #ifdef USE_CALLOUT
7045   onig_global_callout_names_free();
7046 #endif
7047 
7048   onigenc_end();
7049 
7050   onig_inited = 0;
7051 
7052   return 0;
7053 }
7054 
7055 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)7056 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
7057 {
7058   OnigCodePoint n, *data;
7059   OnigCodePoint low, high, x;
7060 
7061   GET_CODE_POINT(n, p);
7062   data = (OnigCodePoint* )p;
7063   data++;
7064 
7065   for (low = 0, high = n; low < high; ) {
7066     x = (low + high) >> 1;
7067     if (code > data[x * 2 + 1])
7068       low = x + 1;
7069     else
7070       high = x;
7071   }
7072 
7073   return ((low < n && code >= data[low * 2]) ? 1 : 0);
7074 }
7075 
7076 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,void * cc_arg)7077 onig_is_code_in_cc_len(int elen, OnigCodePoint code, /* CClassNode* */ void* cc_arg)
7078 {
7079   int found;
7080   CClassNode* cc = (CClassNode* )cc_arg;
7081 
7082   if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
7083     if (IS_NULL(cc->mbuf)) {
7084       found = 0;
7085     }
7086     else {
7087       found = onig_is_in_code_range(cc->mbuf->p, code) != 0;
7088     }
7089   }
7090   else {
7091     found = BITSET_AT(cc->bs, code) != 0;
7092   }
7093 
7094   if (IS_NCCLASS_NOT(cc))
7095     return !found;
7096   else
7097     return found;
7098 }
7099 
7100 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)7101 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
7102 {
7103   int len;
7104 
7105   if (ONIGENC_MBC_MINLEN(enc) > 1) {
7106     len = 2;
7107   }
7108   else {
7109     len = ONIGENC_CODE_TO_MBCLEN(enc, code);
7110     if (len < 0) return 0;
7111   }
7112   return onig_is_code_in_cc_len(len, code, cc);
7113 }
7114 
7115 
7116 #ifdef ONIG_DEBUG_PARSE
7117 
7118 #ifdef USE_CALL
7119 static void
p_string(FILE * f,int len,UChar * s)7120 p_string(FILE* f, int len, UChar* s)
7121 {
7122   fputs(":", f);
7123   while (len-- > 0) { fputc(*s++, f); }
7124 }
7125 #endif
7126 
7127 static void
Indent(FILE * f,int indent)7128 Indent(FILE* f, int indent)
7129 {
7130   int i;
7131   for (i = 0; i < indent; i++) putc(' ', f);
7132 }
7133 
7134 static void
print_indent_tree(FILE * f,Node * node,int indent)7135 print_indent_tree(FILE* f, Node* node, int indent)
7136 {
7137   int i;
7138   NodeType type;
7139   UChar* p;
7140   int add = 3;
7141 
7142   Indent(f, indent);
7143   if (IS_NULL(node)) {
7144     fprintf(f, "ERROR: null node!!!\n");
7145     exit(0);
7146   }
7147 
7148   type = NODE_TYPE(node);
7149   switch (type) {
7150   case NODE_LIST:
7151   case NODE_ALT:
7152     if (type == NODE_LIST)
7153       fprintf(f, "<list:%p>\n", node);
7154     else
7155       fprintf(f, "<alt:%p>\n", node);
7156 
7157     print_indent_tree(f, NODE_CAR(node), indent + add);
7158     while (IS_NOT_NULL(node = NODE_CDR(node))) {
7159       if (NODE_TYPE(node) != type) {
7160         fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NODE_TYPE(node));
7161         exit(0);
7162       }
7163       print_indent_tree(f, NODE_CAR(node), indent + add);
7164     }
7165     break;
7166 
7167   case NODE_STRING:
7168     {
7169       char* str;
7170       char* mode;
7171 
7172       if (NODE_STRING_IS_CRUDE(node))
7173         mode = "-crude";
7174       else if (NODE_STRING_IS_CASE_FOLD_MATCH(node))
7175         mode = "-case_fold_match";
7176       else
7177         mode = "";
7178 
7179       if (STR_(node)->s == STR_(node)->end)
7180         str = "empty-string";
7181       else
7182         str = "string";
7183 
7184       fprintf(f, "<%s%s:%p>", str, mode, node);
7185       for (p = STR_(node)->s; p < STR_(node)->end; p++) {
7186         if (*p >= 0x20 && *p < 0x7f)
7187           fputc(*p, f);
7188         else {
7189           fprintf(f, " 0x%02x", *p);
7190         }
7191       }
7192     }
7193     break;
7194 
7195   case NODE_CCLASS:
7196     fprintf(f, "<cclass:%p>", node);
7197     if (IS_NCCLASS_NOT(CCLASS_(node))) fputs(" not", f);
7198     if (CCLASS_(node)->mbuf) {
7199       BBuf* bbuf = CCLASS_(node)->mbuf;
7200       for (i = 0; i < bbuf->used; i++) {
7201         if (i > 0) fprintf(f, ",");
7202         fprintf(f, "%0x", bbuf->p[i]);
7203       }
7204     }
7205     break;
7206 
7207   case NODE_CTYPE:
7208     fprintf(f, "<ctype:%p> ", node);
7209     switch (CTYPE_(node)->ctype) {
7210     case CTYPE_ANYCHAR:
7211       fprintf(f, "<anychar:%p>", node);
7212       break;
7213 
7214     case ONIGENC_CTYPE_WORD:
7215       if (CTYPE_(node)->not != 0)
7216         fputs("not word", f);
7217       else
7218         fputs("word",     f);
7219 
7220       if (CTYPE_(node)->ascii_mode != 0)
7221         fputs(" (ascii)", f);
7222 
7223       break;
7224 
7225     default:
7226       fprintf(f, "ERROR: undefined ctype.\n");
7227       exit(0);
7228     }
7229     break;
7230 
7231   case NODE_ANCHOR:
7232     fprintf(f, "<anchor:%p> ", node);
7233     switch (ANCHOR_(node)->type) {
7234     case ANCR_BEGIN_BUF:        fputs("begin buf",      f); break;
7235     case ANCR_END_BUF:          fputs("end buf",        f); break;
7236     case ANCR_BEGIN_LINE:       fputs("begin line",     f); break;
7237     case ANCR_END_LINE:         fputs("end line",       f); break;
7238     case ANCR_SEMI_END_BUF:     fputs("semi end buf",   f); break;
7239     case ANCR_BEGIN_POSITION:   fputs("begin position", f); break;
7240 
7241     case ANCR_WORD_BOUNDARY:    fputs("word boundary",     f); break;
7242     case ANCR_NO_WORD_BOUNDARY: fputs("not word boundary", f); break;
7243 #ifdef USE_WORD_BEGIN_END
7244     case ANCR_WORD_BEGIN:       fputs("word begin", f);     break;
7245     case ANCR_WORD_END:         fputs("word end", f);       break;
7246 #endif
7247     case ANCR_TEXT_SEGMENT_BOUNDARY:
7248       fputs("text-segment boundary", f); break;
7249     case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
7250       fputs("no text-segment boundary", f); break;
7251     case ANCR_PREC_READ:
7252       fprintf(f, "prec read\n");
7253       print_indent_tree(f, NODE_BODY(node), indent + add);
7254       break;
7255     case ANCR_PREC_READ_NOT:
7256       fprintf(f, "prec read not\n");
7257       print_indent_tree(f, NODE_BODY(node), indent + add);
7258       break;
7259     case ANCR_LOOK_BEHIND:
7260       fprintf(f, "look behind\n");
7261       print_indent_tree(f, NODE_BODY(node), indent + add);
7262       break;
7263     case ANCR_LOOK_BEHIND_NOT:
7264       fprintf(f, "look behind not\n");
7265       print_indent_tree(f, NODE_BODY(node), indent + add);
7266       break;
7267 
7268     default:
7269       fprintf(f, "ERROR: undefined anchor type.\n");
7270       break;
7271     }
7272     break;
7273 
7274   case NODE_BACKREF:
7275     {
7276       int* p;
7277       BackRefNode* br = BACKREF_(node);
7278       p = BACKREFS_P(br);
7279       fprintf(f, "<backref%s:%p>", NODE_IS_CHECKER(node) ? "-checker" : "", node);
7280       for (i = 0; i < br->back_num; i++) {
7281         if (i > 0) fputs(", ", f);
7282         fprintf(f, "%d", p[i]);
7283       }
7284     }
7285     break;
7286 
7287 #ifdef USE_CALL
7288   case NODE_CALL:
7289     {
7290       CallNode* cn = CALL_(node);
7291       fprintf(f, "<call:%p>", node);
7292       p_string(f, cn->name_end - cn->name, cn->name);
7293     }
7294     break;
7295 #endif
7296 
7297   case NODE_QUANT:
7298     fprintf(f, "<quantifier:%p>{%d,%d}%s\n", node,
7299             QUANT_(node)->lower, QUANT_(node)->upper,
7300             (QUANT_(node)->greedy ? "" : "?"));
7301     print_indent_tree(f, NODE_BODY(node), indent + add);
7302     break;
7303 
7304   case NODE_BAG:
7305     fprintf(f, "<bag:%p> ", node);
7306     if (BAG_(node)->type == BAG_IF_ELSE) {
7307       Node* Then;
7308       Node* Else;
7309       BagNode* bn;
7310 
7311       bn = BAG_(node);
7312       fprintf(f, "if-else\n");
7313       print_indent_tree(f, NODE_BODY(node), indent + add);
7314 
7315       Then = bn->te.Then;
7316       Else = bn->te.Else;
7317       if (IS_NULL(Then)) {
7318         Indent(f, indent + add);
7319         fprintf(f, "THEN empty\n");
7320       }
7321       else
7322         print_indent_tree(f, Then, indent + add);
7323 
7324       if (IS_NULL(Else)) {
7325         Indent(f, indent + add);
7326         fprintf(f, "ELSE empty\n");
7327       }
7328       else
7329         print_indent_tree(f, Else, indent + add);
7330 
7331       break;
7332     }
7333 
7334     switch (BAG_(node)->type) {
7335     case BAG_OPTION:
7336       fprintf(f, "option:%d", BAG_(node)->o.options);
7337       break;
7338     case BAG_MEMORY:
7339       fprintf(f, "memory:%d", BAG_(node)->m.regnum);
7340       break;
7341     case BAG_STOP_BACKTRACK:
7342       fprintf(f, "stop-bt");
7343       break;
7344     default:
7345       break;
7346     }
7347     fprintf(f, "\n");
7348     print_indent_tree(f, NODE_BODY(node), indent + add);
7349     break;
7350 
7351   case NODE_GIMMICK:
7352     fprintf(f, "<gimmick:%p> ", node);
7353     switch (GIMMICK_(node)->type) {
7354     case GIMMICK_FAIL:
7355       fprintf(f, "fail");
7356       break;
7357     case GIMMICK_SAVE:
7358       fprintf(f, "save:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
7359       break;
7360     case GIMMICK_UPDATE_VAR:
7361       fprintf(f, "update_var:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
7362       break;
7363 #ifdef USE_CALLOUT
7364     case GIMMICK_CALLOUT:
7365       switch (GIMMICK_(node)->detail_type) {
7366       case ONIG_CALLOUT_OF_CONTENTS:
7367         fprintf(f, "callout:contents:%d", GIMMICK_(node)->num);
7368         break;
7369       case ONIG_CALLOUT_OF_NAME:
7370         fprintf(f, "callout:name:%d:%d", GIMMICK_(node)->id, GIMMICK_(node)->num);
7371         break;
7372       }
7373 #endif
7374     }
7375     break;
7376 
7377   default:
7378     fprintf(f, "print_indent_tree: undefined node type %d\n", NODE_TYPE(node));
7379     break;
7380   }
7381 
7382   if (type != NODE_LIST && type != NODE_ALT && type != NODE_QUANT &&
7383       type != NODE_BAG)
7384     fprintf(f, "\n");
7385   fflush(f);
7386 }
7387 
7388 static void
print_tree(FILE * f,Node * node)7389 print_tree(FILE* f, Node* node)
7390 {
7391   print_indent_tree(f, node, 0);
7392 }
7393 #endif
7394