1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2019 K.Kosako
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31
32 #define OPS_INIT_SIZE 8
33
34 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
35
36 #if 0
37 typedef struct {
38 int n;
39 int alloc;
40 int* v;
41 } int_stack;
42
43 static int
44 make_int_stack(int_stack** rs, int init_size)
45 {
46 int_stack* s;
47 int* v;
48
49 *rs = 0;
50
51 s = xmalloc(sizeof(*s));
52 if (IS_NULL(s)) return ONIGERR_MEMORY;
53
54 v = (int* )xmalloc(sizeof(int) * init_size);
55 if (IS_NULL(v)) {
56 xfree(s);
57 return ONIGERR_MEMORY;
58 }
59
60 s->n = 0;
61 s->alloc = init_size;
62 s->v = v;
63
64 *rs = s;
65 return ONIG_NORMAL;
66 }
67
68 static void
69 free_int_stack(int_stack* s)
70 {
71 if (IS_NOT_NULL(s)) {
72 if (IS_NOT_NULL(s->v))
73 xfree(s->v);
74 xfree(s);
75 }
76 }
77
78 static int
79 int_stack_push(int_stack* s, int v)
80 {
81 if (s->n >= s->alloc) {
82 int new_size = s->alloc * 2;
83 int* nv = (int* )xrealloc(s->v, sizeof(int) * new_size);
84 if (IS_NULL(nv)) return ONIGERR_MEMORY;
85
86 s->alloc = new_size;
87 s->v = nv;
88 }
89
90 s->v[s->n] = v;
91 s->n++;
92 return ONIG_NORMAL;
93 }
94
95 static int
96 int_stack_pop(int_stack* s)
97 {
98 int v;
99
100 #ifdef ONIG_DEBUG
101 if (s->n <= 0) {
102 fprintf(stderr, "int_stack_pop: fail empty. %p\n", s);
103 return 0;
104 }
105 #endif
106
107 v = s->v[s->n];
108 s->n--;
109 return v;
110 }
111 #endif
112
113 static int
ops_init(regex_t * reg,int init_alloc_size)114 ops_init(regex_t* reg, int init_alloc_size)
115 {
116 Operation* p;
117 size_t size;
118
119 if (init_alloc_size > 0) {
120 size = sizeof(Operation) * init_alloc_size;
121 p = (Operation* )xrealloc(reg->ops, size);
122 CHECK_NULL_RETURN_MEMERR(p);
123 #ifdef USE_DIRECT_THREADED_CODE
124 {
125 enum OpCode* cp;
126 size = sizeof(enum OpCode) * init_alloc_size;
127 cp = (enum OpCode* )xrealloc(reg->ocs, size);
128 CHECK_NULL_RETURN_MEMERR(cp);
129 reg->ocs = cp;
130 }
131 #endif
132 }
133 else {
134 p = (Operation* )0;
135 #ifdef USE_DIRECT_THREADED_CODE
136 reg->ocs = (enum OpCode* )0;
137 #endif
138 }
139
140 reg->ops = p;
141 reg->ops_curr = 0; /* !!! not yet done ops_new() */
142 reg->ops_alloc = init_alloc_size;
143 reg->ops_used = 0;
144
145 return ONIG_NORMAL;
146 }
147
148 static int
ops_expand(regex_t * reg,int n)149 ops_expand(regex_t* reg, int n)
150 {
151 #define MIN_OPS_EXPAND_SIZE 4
152
153 #ifdef USE_DIRECT_THREADED_CODE
154 enum OpCode* cp;
155 #endif
156 Operation* p;
157 size_t size;
158
159 if (n <= 0) n = MIN_OPS_EXPAND_SIZE;
160
161 n += reg->ops_alloc;
162
163 size = sizeof(Operation) * n;
164 p = (Operation* )xrealloc(reg->ops, size);
165 CHECK_NULL_RETURN_MEMERR(p);
166
167 #ifdef USE_DIRECT_THREADED_CODE
168 size = sizeof(enum OpCode) * n;
169 cp = (enum OpCode* )xrealloc(reg->ocs, size);
170 CHECK_NULL_RETURN_MEMERR(cp);
171 reg->ocs = cp;
172 #endif
173
174 reg->ops = p;
175 reg->ops_alloc = n;
176 if (reg->ops_used == 0)
177 reg->ops_curr = 0;
178 else
179 reg->ops_curr = reg->ops + (reg->ops_used - 1);
180
181 return ONIG_NORMAL;
182 }
183
184 static int
ops_new(regex_t * reg)185 ops_new(regex_t* reg)
186 {
187 int r;
188
189 if (reg->ops_used >= reg->ops_alloc) {
190 r = ops_expand(reg, reg->ops_alloc);
191 if (r != ONIG_NORMAL) return r;
192 }
193
194 reg->ops_curr = reg->ops + reg->ops_used;
195 reg->ops_used++;
196
197 xmemset(reg->ops_curr, 0, sizeof(Operation));
198 return ONIG_NORMAL;
199 }
200
201 static int
is_in_string_pool(regex_t * reg,UChar * s)202 is_in_string_pool(regex_t* reg, UChar* s)
203 {
204 return (s >= reg->string_pool && s < reg->string_pool_end);
205 }
206
207 static void
ops_free(regex_t * reg)208 ops_free(regex_t* reg)
209 {
210 int i;
211
212 if (IS_NULL(reg->ops)) return ;
213
214 for (i = 0; i < (int )reg->ops_used; i++) {
215 enum OpCode opcode;
216 Operation* op;
217
218 op = reg->ops + i;
219
220 #ifdef USE_DIRECT_THREADED_CODE
221 opcode = *(reg->ocs + i);
222 #else
223 opcode = op->opcode;
224 #endif
225
226 switch (opcode) {
227 case OP_STR_MBN:
228 if (! is_in_string_pool(reg, op->exact_len_n.s))
229 xfree(op->exact_len_n.s);
230 break;
231 case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N: case OP_STR_N_IC:
232 if (! is_in_string_pool(reg, op->exact_n.s))
233 xfree(op->exact_n.s);
234 break;
235 case OP_STR_1: case OP_STR_2: case OP_STR_3: case OP_STR_4:
236 case OP_STR_5: case OP_STR_MB2N1: case OP_STR_MB2N2:
237 case OP_STR_MB2N3: case OP_STR_1_IC:
238 break;
239
240 case OP_CCLASS_NOT: case OP_CCLASS:
241 xfree(op->cclass.bsp);
242 break;
243
244 case OP_CCLASS_MB_NOT: case OP_CCLASS_MB:
245 xfree(op->cclass_mb.mb);
246 break;
247 case OP_CCLASS_MIX_NOT: case OP_CCLASS_MIX:
248 xfree(op->cclass_mix.mb);
249 xfree(op->cclass_mix.bsp);
250 break;
251
252 case OP_BACKREF1: case OP_BACKREF2: case OP_BACKREF_N: case OP_BACKREF_N_IC:
253 break;
254 case OP_BACKREF_MULTI: case OP_BACKREF_MULTI_IC:
255 case OP_BACKREF_WITH_LEVEL:
256 case OP_BACKREF_WITH_LEVEL_IC:
257 case OP_BACKREF_CHECK:
258 case OP_BACKREF_CHECK_WITH_LEVEL:
259 if (op->backref_general.num != 1)
260 xfree(op->backref_general.ns);
261 break;
262
263 default:
264 break;
265 }
266 }
267
268 xfree(reg->ops);
269 #ifdef USE_DIRECT_THREADED_CODE
270 xfree(reg->ocs);
271 reg->ocs = 0;
272 #endif
273
274 reg->ops = 0;
275 reg->ops_curr = 0;
276 reg->ops_alloc = 0;
277 reg->ops_used = 0;
278 }
279
280 static int
ops_calc_size_of_string_pool(regex_t * reg)281 ops_calc_size_of_string_pool(regex_t* reg)
282 {
283 int i;
284 int total;
285
286 if (IS_NULL(reg->ops)) return 0;
287
288 total = 0;
289 for (i = 0; i < (int )reg->ops_used; i++) {
290 enum OpCode opcode;
291 Operation* op;
292
293 op = reg->ops + i;
294 #ifdef USE_DIRECT_THREADED_CODE
295 opcode = *(reg->ocs + i);
296 #else
297 opcode = op->opcode;
298 #endif
299
300 switch (opcode) {
301 case OP_STR_MBN:
302 total += op->exact_len_n.len * op->exact_len_n.n;
303 break;
304 case OP_STR_N:
305 case OP_STR_N_IC:
306 total += op->exact_n.n;
307 break;
308 case OP_STR_MB2N:
309 total += op->exact_n.n * 2;
310 break;
311 case OP_STR_MB3N:
312 total += op->exact_n.n * 3;
313 break;
314
315 default:
316 break;
317 }
318 }
319
320 return total;
321 }
322
323 static int
ops_make_string_pool(regex_t * reg)324 ops_make_string_pool(regex_t* reg)
325 {
326 int i;
327 int len;
328 int size;
329 UChar* pool;
330 UChar* curr;
331
332 size = ops_calc_size_of_string_pool(reg);
333 if (size <= 0) {
334 return 0;
335 }
336
337 curr = pool = (UChar* )xmalloc((size_t )size);
338 CHECK_NULL_RETURN_MEMERR(pool);
339
340 for (i = 0; i < (int )reg->ops_used; i++) {
341 enum OpCode opcode;
342 Operation* op;
343
344 op = reg->ops + i;
345 #ifdef USE_DIRECT_THREADED_CODE
346 opcode = *(reg->ocs + i);
347 #else
348 opcode = op->opcode;
349 #endif
350
351 switch (opcode) {
352 case OP_STR_MBN:
353 len = op->exact_len_n.len * op->exact_len_n.n;
354 xmemcpy(curr, op->exact_len_n.s, len);
355 xfree(op->exact_len_n.s);
356 op->exact_len_n.s = curr;
357 curr += len;
358 break;
359 case OP_STR_N:
360 case OP_STR_N_IC:
361 len = op->exact_n.n;
362 copy:
363 xmemcpy(curr, op->exact_n.s, len);
364 xfree(op->exact_n.s);
365 op->exact_n.s = curr;
366 curr += len;
367 break;
368 case OP_STR_MB2N:
369 len = op->exact_n.n * 2;
370 goto copy;
371 break;
372 case OP_STR_MB3N:
373 len = op->exact_n.n * 3;
374 goto copy;
375 break;
376
377 default:
378 break;
379 }
380 }
381
382 reg->string_pool = pool;
383 reg->string_pool_end = pool + size;
384 return 0;
385 }
386
387 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)388 onig_get_default_case_fold_flag(void)
389 {
390 return OnigDefaultCaseFoldFlag;
391 }
392
393 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)394 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
395 {
396 OnigDefaultCaseFoldFlag = case_fold_flag;
397 return 0;
398 }
399
400 static int
int_multiply_cmp(int x,int y,int v)401 int_multiply_cmp(int x, int y, int v)
402 {
403 if (x == 0 || y == 0) return -1;
404
405 if (x < INT_MAX / y) {
406 int xy = x * y;
407 if (xy > v) return 1;
408 else {
409 if (xy == v) return 0;
410 else return -1;
411 }
412 }
413 else
414 return 1;
415 }
416
417 extern int
onig_positive_int_multiply(int x,int y)418 onig_positive_int_multiply(int x, int y)
419 {
420 if (x == 0 || y == 0) return 0;
421
422 if (x < INT_MAX / y)
423 return x * y;
424 else
425 return -1;
426 }
427
428
429 static void
node_swap(Node * a,Node * b)430 node_swap(Node* a, Node* b)
431 {
432 Node c;
433
434 c = *a; *a = *b; *b = c;
435
436 if (NODE_TYPE(a) == NODE_STRING) {
437 StrNode* sn = STR_(a);
438 if (sn->capacity == 0) {
439 int len = (int )(sn->end - sn->s);
440 sn->s = sn->buf;
441 sn->end = sn->s + len;
442 }
443 }
444
445 if (NODE_TYPE(b) == NODE_STRING) {
446 StrNode* sn = STR_(b);
447 if (sn->capacity == 0) {
448 int len = (int )(sn->end - sn->s);
449 sn->s = sn->buf;
450 sn->end = sn->s + len;
451 }
452 }
453 }
454
455 static int
node_list_len(Node * list)456 node_list_len(Node* list)
457 {
458 int len;
459
460 len = 1;
461 while (IS_NOT_NULL(NODE_CDR(list))) {
462 list = NODE_CDR(list);
463 len++;
464 }
465
466 return len;
467 }
468
469 static Node*
node_list_add(Node * list,Node * x)470 node_list_add(Node* list, Node* x)
471 {
472 Node *n;
473
474 n = onig_node_new_list(x, NULL);
475 if (IS_NULL(n)) return NULL_NODE;
476
477 if (IS_NOT_NULL(list)) {
478 while (IS_NOT_NULL(NODE_CDR(list)))
479 list = NODE_CDR(list);
480
481 NODE_CDR(list) = n;
482 }
483
484 return n;
485 }
486
487 static int
node_str_node_cat(Node * node,Node * add)488 node_str_node_cat(Node* node, Node* add)
489 {
490 int r;
491
492 if (STR_(node)->flag != STR_(add)->flag)
493 return ONIGERR_TYPE_BUG;
494
495 r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end);
496 if (r != 0) return r;
497
498 if (NODE_STRING_IS_CASE_FOLD_MATCH(node))
499 STR_(node)->case_min_len += STR_(add)->case_min_len;
500
501 return 0;
502 }
503
504 static int
node_str_cat_case_fold(Node * node,const UChar * s,const UChar * end,int case_min_len)505 node_str_cat_case_fold(Node* node, const UChar* s, const UChar* end, int case_min_len)
506 {
507 int r;
508
509 if (! NODE_STRING_IS_CASE_FOLD_MATCH(node))
510 return ONIGERR_TYPE_BUG;
511
512 r = onig_node_str_cat(node, s, end);
513 if (r != 0) return r;
514
515 STR_(node)->case_min_len += case_min_len;
516 return 0;
517 }
518
519 static void
node_conv_to_str_node(Node * node,int flag)520 node_conv_to_str_node(Node* node, int flag)
521 {
522 NODE_SET_TYPE(node, NODE_STRING);
523 STR_(node)->flag = flag;
524 STR_(node)->s = STR_(node)->buf;
525 STR_(node)->end = STR_(node)->buf;
526 STR_(node)->capacity = 0;
527 STR_(node)->case_min_len = 0;
528 }
529
530 static OnigLen
distance_add(OnigLen d1,OnigLen d2)531 distance_add(OnigLen d1, OnigLen d2)
532 {
533 if (d1 == INFINITE_LEN || d2 == INFINITE_LEN)
534 return INFINITE_LEN;
535 else {
536 if (d1 <= INFINITE_LEN - d2) return d1 + d2;
537 else return INFINITE_LEN;
538 }
539 }
540
541 static OnigLen
distance_multiply(OnigLen d,int m)542 distance_multiply(OnigLen d, int m)
543 {
544 if (m == 0) return 0;
545
546 if (d < INFINITE_LEN / m)
547 return d * m;
548 else
549 return INFINITE_LEN;
550 }
551
552 static int
bitset_is_empty(BitSetRef bs)553 bitset_is_empty(BitSetRef bs)
554 {
555 int i;
556
557 for (i = 0; i < (int )BITSET_SIZE; i++) {
558 if (bs[i] != 0) return 0;
559 }
560 return 1;
561 }
562
563 #ifdef USE_CALL
564
565 static int
unset_addr_list_init(UnsetAddrList * list,int size)566 unset_addr_list_init(UnsetAddrList* list, int size)
567 {
568 UnsetAddr* p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
569 CHECK_NULL_RETURN_MEMERR(p);
570
571 list->num = 0;
572 list->alloc = size;
573 list->us = p;
574 return 0;
575 }
576
577 static void
unset_addr_list_end(UnsetAddrList * list)578 unset_addr_list_end(UnsetAddrList* list)
579 {
580 if (IS_NOT_NULL(list->us))
581 xfree(list->us);
582 }
583
584 static int
unset_addr_list_add(UnsetAddrList * list,int offset,struct _Node * node)585 unset_addr_list_add(UnsetAddrList* list, int offset, struct _Node* node)
586 {
587 UnsetAddr* p;
588 int size;
589
590 if (list->num >= list->alloc) {
591 size = list->alloc * 2;
592 p = (UnsetAddr* )xrealloc(list->us, sizeof(UnsetAddr) * size);
593 CHECK_NULL_RETURN_MEMERR(p);
594 list->alloc = size;
595 list->us = p;
596 }
597
598 list->us[list->num].offset = offset;
599 list->us[list->num].target = node;
600 list->num++;
601 return 0;
602 }
603 #endif /* USE_CALL */
604
605
606 static int
add_op(regex_t * reg,int opcode)607 add_op(regex_t* reg, int opcode)
608 {
609 int r;
610
611 r = ops_new(reg);
612 if (r != ONIG_NORMAL) return r;
613
614 #ifdef USE_DIRECT_THREADED_CODE
615 *(reg->ocs + (reg->ops_curr - reg->ops)) = opcode;
616 #else
617 reg->ops_curr->opcode = opcode;
618 #endif
619
620 return 0;
621 }
622
623 static int compile_length_tree(Node* node, regex_t* reg);
624 static int compile_tree(Node* node, regex_t* reg, ScanEnv* env);
625
626
627 #define IS_NEED_STR_LEN_OP(op) \
628 ((op) == OP_STR_N || (op) == OP_STR_MB2N ||\
629 (op) == OP_STR_MB3N || (op) == OP_STR_MBN || (op) == OP_STR_N_IC)
630
631 static int
select_str_opcode(int mb_len,int str_len)632 select_str_opcode(int mb_len, int str_len)
633 {
634 int op;
635
636 switch (mb_len) {
637 case 1:
638 switch (str_len) {
639 case 1: op = OP_STR_1; break;
640 case 2: op = OP_STR_2; break;
641 case 3: op = OP_STR_3; break;
642 case 4: op = OP_STR_4; break;
643 case 5: op = OP_STR_5; break;
644 default: op = OP_STR_N; break;
645 }
646 break;
647
648 case 2:
649 switch (str_len) {
650 case 1: op = OP_STR_MB2N1; break;
651 case 2: op = OP_STR_MB2N2; break;
652 case 3: op = OP_STR_MB2N3; break;
653 default: op = OP_STR_MB2N; break;
654 }
655 break;
656
657 case 3:
658 op = OP_STR_MB3N;
659 break;
660
661 default:
662 op = OP_STR_MBN;
663 break;
664 }
665
666 return op;
667 }
668
669 static int
is_strict_real_node(Node * node)670 is_strict_real_node(Node* node)
671 {
672 switch (NODE_TYPE(node)) {
673 case NODE_STRING:
674 {
675 StrNode* sn = STR_(node);
676 return (sn->end != sn->s);
677 }
678 break;
679
680 case NODE_CCLASS:
681 case NODE_CTYPE:
682 return 1;
683 break;
684
685 default:
686 return 0;
687 break;
688 }
689 }
690
691 static int
compile_quant_body_with_empty_check(QuantNode * qn,regex_t * reg,ScanEnv * env)692 compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env)
693 {
694 int r;
695 int saved_num_empty_check;
696 int emptiness;
697 Node* body;
698
699 body = NODE_BODY((Node* )qn);
700 emptiness = qn->emptiness;
701 saved_num_empty_check = reg->num_empty_check;
702
703 if (emptiness != BODY_IS_NOT_EMPTY) {
704 r = add_op(reg, OP_EMPTY_CHECK_START);
705 if (r != 0) return r;
706 COP(reg)->empty_check_start.mem = reg->num_empty_check; /* NULL CHECK ID */
707 reg->num_empty_check++;
708 }
709
710 r = compile_tree(body, reg, env);
711 if (r != 0) return r;
712
713 if (emptiness != BODY_IS_NOT_EMPTY) {
714 if (emptiness == BODY_IS_EMPTY_POSSIBILITY)
715 r = add_op(reg, OP_EMPTY_CHECK_END);
716 else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) {
717 if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0)
718 r = add_op(reg, OP_EMPTY_CHECK_END_MEMST);
719 else
720 r = add_op(reg, OP_EMPTY_CHECK_END);
721 }
722 #ifdef USE_CALL
723 else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC)
724 r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH);
725 #endif
726
727 if (r != 0) return r;
728 COP(reg)->empty_check_end.mem = saved_num_empty_check; /* NULL CHECK ID */
729 }
730 return r;
731 }
732
733 #ifdef USE_CALL
734 static int
compile_call(CallNode * node,regex_t * reg,ScanEnv * env)735 compile_call(CallNode* node, regex_t* reg, ScanEnv* env)
736 {
737 int r;
738 int offset;
739
740 r = add_op(reg, OP_CALL);
741 if (r != 0) return r;
742
743 COP(reg)->call.addr = 0; /* dummy addr. */
744
745 offset = COP_CURR_OFFSET_BYTES(reg, call.addr);
746 r = unset_addr_list_add(env->unset_addr_list, offset, NODE_CALL_BODY(node));
747 return r;
748 }
749 #endif
750
751 static int
compile_tree_n_times(Node * node,int n,regex_t * reg,ScanEnv * env)752 compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env)
753 {
754 int i, r;
755
756 for (i = 0; i < n; i++) {
757 r = compile_tree(node, reg, env);
758 if (r != 0) return r;
759 }
760 return 0;
761 }
762
763 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED)764 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
765 regex_t* reg ARG_UNUSED)
766 {
767 return 1;
768 }
769
770 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg)771 add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg)
772 {
773 int op;
774 int r;
775 int byte_len;
776 UChar* p;
777 UChar* end;
778
779 op = select_str_opcode(mb_len, str_len);
780 r = add_op(reg, op);
781 if (r != 0) return r;
782
783 byte_len = mb_len * str_len;
784 end = s + byte_len;
785
786 if (op == OP_STR_MBN) {
787 p = onigenc_strdup(reg->enc, s, end);
788 CHECK_NULL_RETURN_MEMERR(p);
789
790 COP(reg)->exact_len_n.len = mb_len;
791 COP(reg)->exact_len_n.n = str_len;
792 COP(reg)->exact_len_n.s = p;
793 }
794 else if (IS_NEED_STR_LEN_OP(op)) {
795 p = onigenc_strdup(reg->enc, s, end);
796 CHECK_NULL_RETURN_MEMERR(p);
797
798 if (op == OP_STR_N_IC)
799 COP(reg)->exact_n.n = byte_len;
800 else
801 COP(reg)->exact_n.n = str_len;
802
803 COP(reg)->exact_n.s = p;
804 }
805 else {
806 xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s));
807 xmemcpy(COP(reg)->exact.s, s, (size_t )byte_len);
808 }
809
810 return 0;
811 }
812
813 static int
compile_length_string_node(Node * node,regex_t * reg)814 compile_length_string_node(Node* node, regex_t* reg)
815 {
816 int rlen, r, len, prev_len, slen;
817 UChar *p, *prev;
818 StrNode* sn;
819 OnigEncoding enc = reg->enc;
820
821 sn = STR_(node);
822 if (sn->end <= sn->s)
823 return 0;
824
825 if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) return 1;
826
827 p = prev = sn->s;
828 prev_len = enclen(enc, p);
829 p += prev_len;
830 slen = 1;
831 rlen = 0;
832
833 for (; p < sn->end; ) {
834 len = enclen(enc, p);
835 if (len == prev_len) {
836 slen++;
837 }
838 else {
839 r = add_compile_string_length(prev, prev_len, slen, reg);
840 rlen += r;
841 prev = p;
842 slen = 1;
843 prev_len = len;
844 }
845 p += len;
846 }
847
848 r = add_compile_string_length(prev, prev_len, slen, reg);
849 rlen += r;
850 return rlen;
851 }
852
853 static int
compile_length_string_crude_node(StrNode * sn,regex_t * reg)854 compile_length_string_crude_node(StrNode* sn, regex_t* reg)
855 {
856 if (sn->end <= sn->s)
857 return 0;
858
859 return add_compile_string_length(sn->s, 1 /* sb */, (int )(sn->end - sn->s),
860 reg);
861 }
862
863 static int
compile_ambig_string_node(Node * node,regex_t * reg)864 compile_ambig_string_node(Node* node, regex_t* reg)
865 {
866 int r;
867 int len;
868 int byte_len;
869 UChar* p;
870 StrNode* sn;
871 OnigEncoding enc = reg->enc;
872
873 sn = STR_(node);
874 len = enclen(enc, sn->s);
875 byte_len = (int )(sn->end - sn->s);
876 if (len == byte_len) {
877 r = add_op(reg, OP_STR_1_IC);
878 if (r != 0) return r;
879
880 xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s));
881 xmemcpy(COP(reg)->exact.s, sn->s, (size_t )byte_len);
882 }
883 else {
884 r = add_op(reg, OP_STR_N_IC);
885 if (r != 0) return r;
886
887 p = onigenc_strdup(enc, sn->s, sn->end);
888 CHECK_NULL_RETURN_MEMERR(p);
889
890 COP(reg)->exact_n.s = p;
891 COP(reg)->exact_n.n = byte_len;
892 }
893
894 return 0;
895 }
896
897 static int
compile_string_node(Node * node,regex_t * reg)898 compile_string_node(Node* node, regex_t* reg)
899 {
900 int r, len, prev_len, slen;
901 UChar *p, *prev, *end;
902 StrNode* sn;
903 OnigEncoding enc = reg->enc;
904
905 sn = STR_(node);
906 if (sn->end <= sn->s)
907 return 0;
908
909 end = sn->end;
910 if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) {
911 return compile_ambig_string_node(node, reg);
912 }
913
914 p = prev = sn->s;
915 prev_len = enclen(enc, p);
916 p += prev_len;
917 slen = 1;
918
919 for (; p < end; ) {
920 len = enclen(enc, p);
921 if (len == prev_len) {
922 slen++;
923 }
924 else {
925 r = add_compile_string(prev, prev_len, slen, reg);
926 if (r != 0) return r;
927
928 prev = p;
929 slen = 1;
930 prev_len = len;
931 }
932
933 p += len;
934 }
935
936 return add_compile_string(prev, prev_len, slen, reg);
937 }
938
939 static int
compile_string_crude_node(StrNode * sn,regex_t * reg)940 compile_string_crude_node(StrNode* sn, regex_t* reg)
941 {
942 if (sn->end <= sn->s)
943 return 0;
944
945 return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg);
946 }
947
948 static void*
set_multi_byte_cclass(BBuf * mbuf,regex_t * reg)949 set_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
950 {
951 size_t len;
952 void* p;
953
954 len = (size_t )mbuf->used;
955 p = xmalloc(len);
956 if (IS_NULL(p)) return NULL;
957
958 xmemcpy(p, mbuf->p, len);
959 return p;
960 }
961
962 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)963 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
964 {
965 return 1;
966 }
967
968 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)969 compile_cclass_node(CClassNode* cc, regex_t* reg)
970 {
971 int r;
972
973 if (IS_NULL(cc->mbuf)) {
974 r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_NOT : OP_CCLASS);
975 if (r != 0) return r;
976
977 COP(reg)->cclass.bsp = xmalloc(SIZE_BITSET);
978 CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass.bsp);
979 xmemcpy(COP(reg)->cclass.bsp, cc->bs, SIZE_BITSET);
980 }
981 else {
982 void* p;
983
984 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
985 r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MB_NOT : OP_CCLASS_MB);
986 if (r != 0) return r;
987
988 p = set_multi_byte_cclass(cc->mbuf, reg);
989 CHECK_NULL_RETURN_MEMERR(p);
990 COP(reg)->cclass_mb.mb = p;
991 }
992 else {
993 r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MIX_NOT : OP_CCLASS_MIX);
994 if (r != 0) return r;
995
996 COP(reg)->cclass_mix.bsp = xmalloc(SIZE_BITSET);
997 CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass_mix.bsp);
998 xmemcpy(COP(reg)->cclass_mix.bsp, cc->bs, SIZE_BITSET);
999
1000 p = set_multi_byte_cclass(cc->mbuf, reg);
1001 CHECK_NULL_RETURN_MEMERR(p);
1002 COP(reg)->cclass_mix.mb = p;
1003 }
1004 }
1005
1006 return 0;
1007 }
1008
1009 static void
set_addr_in_repeat_range(regex_t * reg)1010 set_addr_in_repeat_range(regex_t* reg)
1011 {
1012 int i;
1013
1014 for (i = 0; i < reg->num_repeat; i++) {
1015 RepeatRange* p = reg->repeat_range + i;
1016 int offset = p->u.offset;
1017 p->u.pcode = reg->ops + offset;
1018 }
1019 }
1020
1021 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper,int ops_index)1022 entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index)
1023 {
1024 #define REPEAT_RANGE_ALLOC 4
1025
1026 RepeatRange* p;
1027
1028 if (reg->repeat_range_alloc == 0) {
1029 p = (RepeatRange* )xmalloc(sizeof(RepeatRange) * REPEAT_RANGE_ALLOC);
1030 CHECK_NULL_RETURN_MEMERR(p);
1031 reg->repeat_range = p;
1032 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
1033 }
1034 else if (reg->repeat_range_alloc <= id) {
1035 int n;
1036 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
1037 p = (RepeatRange* )xrealloc(reg->repeat_range, sizeof(RepeatRange) * n);
1038 CHECK_NULL_RETURN_MEMERR(p);
1039 reg->repeat_range = p;
1040 reg->repeat_range_alloc = n;
1041 }
1042 else {
1043 p = reg->repeat_range;
1044 }
1045
1046 p[id].lower = lower;
1047 p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper);
1048 p[id].u.offset = ops_index;
1049 return 0;
1050 }
1051
1052 static int
compile_range_repeat_node(QuantNode * qn,int target_len,int emptiness,regex_t * reg,ScanEnv * env)1053 compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness,
1054 regex_t* reg, ScanEnv* env)
1055 {
1056 int r;
1057 int num_repeat = reg->num_repeat++;
1058
1059 r = add_op(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
1060 if (r != 0) return r;
1061
1062 COP(reg)->repeat.id = num_repeat;
1063 COP(reg)->repeat.addr = SIZE_INC + target_len + OPSIZE_REPEAT_INC;
1064
1065 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper,
1066 COP_CURR_OFFSET(reg) + OPSIZE_REPEAT);
1067 if (r != 0) return r;
1068
1069 r = compile_quant_body_with_empty_check(qn, reg, env);
1070 if (r != 0) return r;
1071
1072 r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
1073 if (r != 0) return r;
1074
1075 COP(reg)->repeat_inc.id = num_repeat;
1076 return r;
1077 }
1078
1079 static int
is_anychar_infinite_greedy(QuantNode * qn)1080 is_anychar_infinite_greedy(QuantNode* qn)
1081 {
1082 if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) &&
1083 NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn)))
1084 return 1;
1085 else
1086 return 0;
1087 }
1088
1089 #define QUANTIFIER_EXPAND_LIMIT_SIZE 10
1090 #define CKN_ON (ckn > 0)
1091
1092 static int
compile_length_quantifier_node(QuantNode * qn,regex_t * reg)1093 compile_length_quantifier_node(QuantNode* qn, regex_t* reg)
1094 {
1095 int len, mod_tlen;
1096 int infinite = IS_INFINITE_REPEAT(qn->upper);
1097 enum BodyEmptyType emptiness = qn->emptiness;
1098 int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1099
1100 if (tlen < 0) return tlen;
1101 if (tlen == 0) return 0;
1102
1103 /* anychar repeat */
1104 if (is_anychar_infinite_greedy(qn)) {
1105 if (qn->lower <= 1 ||
1106 int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) {
1107 if (IS_NOT_NULL(qn->next_head_exact))
1108 return OPSIZE_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
1109 else
1110 return OPSIZE_ANYCHAR_STAR + tlen * qn->lower;
1111 }
1112 }
1113
1114 mod_tlen = tlen;
1115 if (emptiness != BODY_IS_NOT_EMPTY)
1116 mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END;
1117
1118 if (infinite &&
1119 (qn->lower <= 1 ||
1120 int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1121 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1122 len = OPSIZE_JUMP;
1123 }
1124 else {
1125 len = tlen * qn->lower;
1126 }
1127
1128 if (qn->greedy) {
1129 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1130 if (IS_NOT_NULL(qn->head_exact))
1131 len += OPSIZE_PUSH_OR_JUMP_EXACT1 + mod_tlen + OPSIZE_JUMP;
1132 else
1133 #endif
1134 if (IS_NOT_NULL(qn->next_head_exact))
1135 len += OPSIZE_PUSH_IF_PEEK_NEXT + mod_tlen + OPSIZE_JUMP;
1136 else
1137 len += OPSIZE_PUSH + mod_tlen + OPSIZE_JUMP;
1138 }
1139 else
1140 len += OPSIZE_JUMP + mod_tlen + OPSIZE_PUSH;
1141 }
1142 else if (qn->upper == 0) {
1143 if (qn->include_referred != 0) { /* /(?<n>..){0}/ */
1144 len = OPSIZE_JUMP + tlen;
1145 }
1146 else
1147 len = 0;
1148 }
1149 else if (!infinite && qn->greedy &&
1150 (qn->upper == 1 ||
1151 int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper,
1152 QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1153 len = tlen * qn->lower;
1154 len += (OPSIZE_PUSH + tlen) * (qn->upper - qn->lower);
1155 }
1156 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1157 len = OPSIZE_PUSH + OPSIZE_JUMP + tlen;
1158 }
1159 else {
1160 len = OPSIZE_REPEAT_INC + mod_tlen + OPSIZE_REPEAT;
1161 }
1162
1163 return len;
1164 }
1165
1166 static int
compile_quantifier_node(QuantNode * qn,regex_t * reg,ScanEnv * env)1167 compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)
1168 {
1169 int i, r, mod_tlen;
1170 int infinite = IS_INFINITE_REPEAT(qn->upper);
1171 enum BodyEmptyType emptiness = qn->emptiness;
1172 int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1173
1174 if (tlen < 0) return tlen;
1175 if (tlen == 0) return 0;
1176
1177 if (is_anychar_infinite_greedy(qn) &&
1178 (qn->lower <= 1 ||
1179 int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1180 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1181 if (r != 0) return r;
1182 if (IS_NOT_NULL(qn->next_head_exact)) {
1183 r = add_op(reg,
1184 IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), reg)) ?
1185 OP_ANYCHAR_ML_STAR_PEEK_NEXT : OP_ANYCHAR_STAR_PEEK_NEXT);
1186 if (r != 0) return r;
1187
1188 COP(reg)->anychar_star_peek_next.c = STR_(qn->next_head_exact)->s[0];
1189 return 0;
1190 }
1191 else {
1192 r = add_op(reg,
1193 IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), reg)) ?
1194 OP_ANYCHAR_ML_STAR : OP_ANYCHAR_STAR);
1195 return r;
1196 }
1197 }
1198
1199 mod_tlen = tlen;
1200 if (emptiness != BODY_IS_NOT_EMPTY)
1201 mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END;
1202
1203 if (infinite &&
1204 (qn->lower <= 1 ||
1205 int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1206 int addr;
1207
1208 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1209 r = add_op(reg, OP_JUMP);
1210 if (r != 0) return r;
1211 if (qn->greedy) {
1212 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1213 if (IS_NOT_NULL(qn->head_exact))
1214 COP(reg)->jump.addr = OPSIZE_PUSH_OR_JUMP_EXACT1 + SIZE_INC;
1215 else
1216 #endif
1217 if (IS_NOT_NULL(qn->next_head_exact))
1218 COP(reg)->jump.addr = OPSIZE_PUSH_IF_PEEK_NEXT + SIZE_INC;
1219 else
1220 COP(reg)->jump.addr = OPSIZE_PUSH + SIZE_INC;
1221 }
1222 else {
1223 COP(reg)->jump.addr = OPSIZE_JUMP + SIZE_INC;
1224 }
1225 }
1226 else {
1227 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1228 if (r != 0) return r;
1229 }
1230
1231 if (qn->greedy) {
1232 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1233 if (IS_NOT_NULL(qn->head_exact)) {
1234 r = add_op(reg, OP_PUSH_OR_JUMP_EXACT1);
1235 if (r != 0) return r;
1236 COP(reg)->push_or_jump_exact1.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1237 COP(reg)->push_or_jump_exact1.c = STR_(qn->head_exact)->s[0];
1238
1239 r = compile_quant_body_with_empty_check(qn, reg, env);
1240 if (r != 0) return r;
1241
1242 addr = -(mod_tlen + (int )OPSIZE_PUSH_OR_JUMP_EXACT1);
1243 }
1244 else
1245 #endif
1246 if (IS_NOT_NULL(qn->next_head_exact)) {
1247 r = add_op(reg, OP_PUSH_IF_PEEK_NEXT);
1248 if (r != 0) return r;
1249 COP(reg)->push_if_peek_next.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1250 COP(reg)->push_if_peek_next.c = STR_(qn->next_head_exact)->s[0];
1251
1252 r = compile_quant_body_with_empty_check(qn, reg, env);
1253 if (r != 0) return r;
1254
1255 addr = -(mod_tlen + (int )OPSIZE_PUSH_IF_PEEK_NEXT);
1256 }
1257 else {
1258 r = add_op(reg, OP_PUSH);
1259 if (r != 0) return r;
1260 COP(reg)->push.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1261
1262 r = compile_quant_body_with_empty_check(qn, reg, env);
1263 if (r != 0) return r;
1264
1265 addr = -(mod_tlen + (int )OPSIZE_PUSH);
1266 }
1267
1268 r = add_op(reg, OP_JUMP);
1269 if (r != 0) return r;
1270 COP(reg)->jump.addr = addr;
1271 }
1272 else {
1273 r = add_op(reg, OP_JUMP);
1274 if (r != 0) return r;
1275 COP(reg)->jump.addr = mod_tlen + SIZE_INC;
1276
1277 r = compile_quant_body_with_empty_check(qn, reg, env);
1278 if (r != 0) return r;
1279
1280 r = add_op(reg, OP_PUSH);
1281 if (r != 0) return r;
1282 COP(reg)->push.addr = -mod_tlen;
1283 }
1284 }
1285 else if (qn->upper == 0) {
1286 if (qn->include_referred != 0) { /* /(?<n>..){0}/ */
1287 r = add_op(reg, OP_JUMP);
1288 if (r != 0) return r;
1289 COP(reg)->jump.addr = tlen + SIZE_INC;
1290
1291 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1292 }
1293 else {
1294 /* Nothing output */
1295 r = 0;
1296 }
1297 }
1298 else if (! infinite && qn->greedy &&
1299 (qn->upper == 1 ||
1300 int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper,
1301 QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1302 int n = qn->upper - qn->lower;
1303
1304 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1305 if (r != 0) return r;
1306
1307 for (i = 0; i < n; i++) {
1308 int v = onig_positive_int_multiply(n - i, tlen + OPSIZE_PUSH);
1309 if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1310
1311 r = add_op(reg, OP_PUSH);
1312 if (r != 0) return r;
1313 COP(reg)->push.addr = v;
1314
1315 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1316 if (r != 0) return r;
1317 }
1318 }
1319 else if (! qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1320 r = add_op(reg, OP_PUSH);
1321 if (r != 0) return r;
1322 COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP;
1323
1324 r = add_op(reg, OP_JUMP);
1325 if (r != 0) return r;
1326 COP(reg)->jump.addr = tlen + SIZE_INC;
1327
1328 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1329 }
1330 else {
1331 r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env);
1332 }
1333 return r;
1334 }
1335
1336 static int
compile_length_option_node(BagNode * node,regex_t * reg)1337 compile_length_option_node(BagNode* node, regex_t* reg)
1338 {
1339 int tlen;
1340 OnigOptionType prev = reg->options;
1341
1342 reg->options = node->o.options;
1343 tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1344 reg->options = prev;
1345
1346 return tlen;
1347 }
1348
1349 static int
compile_option_node(BagNode * node,regex_t * reg,ScanEnv * env)1350 compile_option_node(BagNode* node, regex_t* reg, ScanEnv* env)
1351 {
1352 int r;
1353 OnigOptionType prev = reg->options;
1354
1355 reg->options = node->o.options;
1356 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1357 reg->options = prev;
1358
1359 return r;
1360 }
1361
1362 static int
compile_length_bag_node(BagNode * node,regex_t * reg)1363 compile_length_bag_node(BagNode* node, regex_t* reg)
1364 {
1365 int len;
1366 int tlen;
1367
1368 if (node->type == BAG_OPTION)
1369 return compile_length_option_node(node, reg);
1370
1371 if (NODE_BAG_BODY(node)) {
1372 tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1373 if (tlen < 0) return tlen;
1374 }
1375 else
1376 tlen = 0;
1377
1378 switch (node->type) {
1379 case BAG_MEMORY:
1380 #ifdef USE_CALL
1381
1382 if (node->m.regnum == 0 && NODE_IS_CALLED(node)) {
1383 len = tlen + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN;
1384 return len;
1385 }
1386
1387 if (NODE_IS_CALLED(node)) {
1388 len = OPSIZE_MEM_START_PUSH + tlen
1389 + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN;
1390 if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1391 len += (NODE_IS_RECURSION(node)
1392 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH);
1393 else
1394 len += (NODE_IS_RECURSION(node)
1395 ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END);
1396 }
1397 else if (NODE_IS_RECURSION(node)) {
1398 len = OPSIZE_MEM_START_PUSH;
1399 len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)
1400 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_REC);
1401 }
1402 else
1403 #endif
1404 {
1405 if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum))
1406 len = OPSIZE_MEM_START_PUSH;
1407 else
1408 len = OPSIZE_MEM_START;
1409
1410 len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)
1411 ? OPSIZE_MEM_END_PUSH : OPSIZE_MEM_END);
1412 }
1413 break;
1414
1415 case BAG_STOP_BACKTRACK:
1416 if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1417 int v;
1418 QuantNode* qn;
1419
1420 qn = QUANT_(NODE_BAG_BODY(node));
1421 tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1422 if (tlen < 0) return tlen;
1423
1424 v = onig_positive_int_multiply(qn->lower, tlen);
1425 if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1426 len = v + OPSIZE_PUSH + tlen + OPSIZE_POP_OUT + OPSIZE_JUMP;
1427 }
1428 else {
1429 len = OPSIZE_ATOMIC_START + tlen + OPSIZE_ATOMIC_END;
1430 }
1431 break;
1432
1433 case BAG_IF_ELSE:
1434 {
1435 Node* cond = NODE_BAG_BODY(node);
1436 Node* Then = node->te.Then;
1437 Node* Else = node->te.Else;
1438
1439 len = compile_length_tree(cond, reg);
1440 if (len < 0) return len;
1441 len += OPSIZE_PUSH;
1442 len += OPSIZE_ATOMIC_START + OPSIZE_ATOMIC_END;
1443
1444 if (IS_NOT_NULL(Then)) {
1445 tlen = compile_length_tree(Then, reg);
1446 if (tlen < 0) return tlen;
1447 len += tlen;
1448 }
1449
1450 len += OPSIZE_JUMP + OPSIZE_ATOMIC_END;
1451
1452 if (IS_NOT_NULL(Else)) {
1453 tlen = compile_length_tree(Else, reg);
1454 if (tlen < 0) return tlen;
1455 len += tlen;
1456 }
1457 }
1458 break;
1459
1460 case BAG_OPTION:
1461 /* never come here, but set for escape warning */
1462 len = 0;
1463 break;
1464 }
1465
1466 return len;
1467 }
1468
1469 static int get_char_len_node(Node* node, regex_t* reg, int* len);
1470
1471 static int
compile_bag_memory_node(BagNode * node,regex_t * reg,ScanEnv * env)1472 compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env)
1473 {
1474 int r;
1475
1476 #ifdef USE_CALL
1477 if (NODE_IS_CALLED(node)) {
1478 int len;
1479
1480 r = add_op(reg, OP_CALL);
1481 if (r != 0) return r;
1482
1483 node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + OPSIZE_JUMP;
1484 NODE_STATUS_ADD(node, ADDR_FIXED);
1485 COP(reg)->call.addr = (int )node->m.called_addr;
1486
1487 if (node->m.regnum == 0) {
1488 len = compile_length_tree(NODE_BAG_BODY(node), reg);
1489 len += OPSIZE_RETURN;
1490
1491 r = add_op(reg, OP_JUMP);
1492 if (r != 0) return r;
1493 COP(reg)->jump.addr = len + SIZE_INC;
1494
1495 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1496 if (r != 0) return r;
1497
1498 r = add_op(reg, OP_RETURN);
1499 return r;
1500 }
1501 else {
1502 len = compile_length_tree(NODE_BAG_BODY(node), reg);
1503 len += (OPSIZE_MEM_START_PUSH + OPSIZE_RETURN);
1504 if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1505 len += (NODE_IS_RECURSION(node)
1506 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH);
1507 else
1508 len += (NODE_IS_RECURSION(node) ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END);
1509
1510 r = add_op(reg, OP_JUMP);
1511 if (r != 0) return r;
1512 COP(reg)->jump.addr = len + SIZE_INC;
1513 }
1514 }
1515 #endif
1516
1517 if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum))
1518 r = add_op(reg, OP_MEM_START_PUSH);
1519 else
1520 r = add_op(reg, OP_MEM_START);
1521 if (r != 0) return r;
1522 COP(reg)->memory_start.num = node->m.regnum;
1523
1524 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1525 if (r != 0) return r;
1526
1527 #ifdef USE_CALL
1528 if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1529 r = add_op(reg, (NODE_IS_RECURSION(node)
1530 ? OP_MEM_END_PUSH_REC : OP_MEM_END_PUSH));
1531 else
1532 r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEM_END_REC : OP_MEM_END));
1533 if (r != 0) return r;
1534 COP(reg)->memory_end.num = node->m.regnum;
1535
1536 if (NODE_IS_CALLED(node)) {
1537 if (r != 0) return r;
1538 r = add_op(reg, OP_RETURN);
1539 }
1540 #else
1541 if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1542 r = add_op(reg, OP_MEM_END_PUSH);
1543 else
1544 r = add_op(reg, OP_MEM_END);
1545 if (r != 0) return r;
1546 COP(reg)->memory_end.num = node->m.regnum;
1547 #endif
1548
1549 return r;
1550 }
1551
1552 static int
compile_bag_node(BagNode * node,regex_t * reg,ScanEnv * env)1553 compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)
1554 {
1555 int r, len;
1556
1557 switch (node->type) {
1558 case BAG_MEMORY:
1559 r = compile_bag_memory_node(node, reg, env);
1560 break;
1561
1562 case BAG_OPTION:
1563 r = compile_option_node(node, reg, env);
1564 break;
1565
1566 case BAG_STOP_BACKTRACK:
1567 if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1568 QuantNode* qn = QUANT_(NODE_BAG_BODY(node));
1569 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1570 if (r != 0) return r;
1571
1572 len = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1573 if (len < 0) return len;
1574
1575 r = add_op(reg, OP_PUSH);
1576 if (r != 0) return r;
1577 COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP_OUT + OPSIZE_JUMP;
1578
1579 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1580 if (r != 0) return r;
1581 r = add_op(reg, OP_POP_OUT);
1582 if (r != 0) return r;
1583
1584 r = add_op(reg, OP_JUMP);
1585 if (r != 0) return r;
1586 COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP_OUT);
1587 }
1588 else {
1589 r = add_op(reg, OP_ATOMIC_START);
1590 if (r != 0) return r;
1591 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1592 if (r != 0) return r;
1593 r = add_op(reg, OP_ATOMIC_END);
1594 }
1595 break;
1596
1597 case BAG_IF_ELSE:
1598 {
1599 int cond_len, then_len, else_len, jump_len;
1600 Node* cond = NODE_BAG_BODY(node);
1601 Node* Then = node->te.Then;
1602 Node* Else = node->te.Else;
1603
1604 r = add_op(reg, OP_ATOMIC_START);
1605 if (r != 0) return r;
1606
1607 cond_len = compile_length_tree(cond, reg);
1608 if (cond_len < 0) return cond_len;
1609 if (IS_NOT_NULL(Then)) {
1610 then_len = compile_length_tree(Then, reg);
1611 if (then_len < 0) return then_len;
1612 }
1613 else
1614 then_len = 0;
1615
1616 jump_len = cond_len + then_len + OPSIZE_ATOMIC_END + OPSIZE_JUMP;
1617
1618 r = add_op(reg, OP_PUSH);
1619 if (r != 0) return r;
1620 COP(reg)->push.addr = SIZE_INC + jump_len;
1621
1622 r = compile_tree(cond, reg, env);
1623 if (r != 0) return r;
1624 r = add_op(reg, OP_ATOMIC_END);
1625 if (r != 0) return r;
1626
1627 if (IS_NOT_NULL(Then)) {
1628 r = compile_tree(Then, reg, env);
1629 if (r != 0) return r;
1630 }
1631
1632 if (IS_NOT_NULL(Else)) {
1633 else_len = compile_length_tree(Else, reg);
1634 if (else_len < 0) return else_len;
1635 }
1636 else
1637 else_len = 0;
1638
1639 r = add_op(reg, OP_JUMP);
1640 if (r != 0) return r;
1641 COP(reg)->jump.addr = OPSIZE_ATOMIC_END + else_len + SIZE_INC;
1642
1643 r = add_op(reg, OP_ATOMIC_END);
1644 if (r != 0) return r;
1645
1646 if (IS_NOT_NULL(Else)) {
1647 r = compile_tree(Else, reg, env);
1648 }
1649 }
1650 break;
1651 }
1652
1653 return r;
1654 }
1655
1656 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1657 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1658 {
1659 int len;
1660 int tlen = 0;
1661
1662 if (IS_NOT_NULL(NODE_ANCHOR_BODY(node))) {
1663 tlen = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1664 if (tlen < 0) return tlen;
1665 }
1666
1667 switch (node->type) {
1668 case ANCR_PREC_READ:
1669 len = OPSIZE_PREC_READ_START + tlen + OPSIZE_PREC_READ_END;
1670 break;
1671 case ANCR_PREC_READ_NOT:
1672 len = OPSIZE_PREC_READ_NOT_START + tlen + OPSIZE_PREC_READ_NOT_END;
1673 break;
1674 case ANCR_LOOK_BEHIND:
1675 len = OPSIZE_LOOK_BEHIND + tlen;
1676 break;
1677 case ANCR_LOOK_BEHIND_NOT:
1678 len = OPSIZE_LOOK_BEHIND_NOT_START + tlen + OPSIZE_LOOK_BEHIND_NOT_END;
1679 break;
1680
1681 case ANCR_WORD_BOUNDARY:
1682 case ANCR_NO_WORD_BOUNDARY:
1683 #ifdef USE_WORD_BEGIN_END
1684 case ANCR_WORD_BEGIN:
1685 case ANCR_WORD_END:
1686 #endif
1687 len = OPSIZE_WORD_BOUNDARY;
1688 break;
1689
1690 case ANCR_TEXT_SEGMENT_BOUNDARY:
1691 case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
1692 len = SIZE_OPCODE;
1693 break;
1694
1695 default:
1696 len = SIZE_OPCODE;
1697 break;
1698 }
1699
1700 return len;
1701 }
1702
1703 static int
compile_anchor_node(AnchorNode * node,regex_t * reg,ScanEnv * env)1704 compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env)
1705 {
1706 int r, len;
1707 enum OpCode op;
1708
1709 switch (node->type) {
1710 case ANCR_BEGIN_BUF: r = add_op(reg, OP_BEGIN_BUF); break;
1711 case ANCR_END_BUF: r = add_op(reg, OP_END_BUF); break;
1712 case ANCR_BEGIN_LINE: r = add_op(reg, OP_BEGIN_LINE); break;
1713 case ANCR_END_LINE: r = add_op(reg, OP_END_LINE); break;
1714 case ANCR_SEMI_END_BUF: r = add_op(reg, OP_SEMI_END_BUF); break;
1715 case ANCR_BEGIN_POSITION: r = add_op(reg, OP_BEGIN_POSITION); break;
1716
1717 case ANCR_WORD_BOUNDARY:
1718 op = OP_WORD_BOUNDARY;
1719 word:
1720 r = add_op(reg, op);
1721 if (r != 0) return r;
1722 COP(reg)->word_boundary.mode = (ModeType )node->ascii_mode;
1723 break;
1724
1725 case ANCR_NO_WORD_BOUNDARY:
1726 op = OP_NO_WORD_BOUNDARY; goto word;
1727 break;
1728 #ifdef USE_WORD_BEGIN_END
1729 case ANCR_WORD_BEGIN:
1730 op = OP_WORD_BEGIN; goto word;
1731 break;
1732 case ANCR_WORD_END:
1733 op = OP_WORD_END; goto word;
1734 break;
1735 #endif
1736
1737 case ANCR_TEXT_SEGMENT_BOUNDARY:
1738 case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
1739 {
1740 enum TextSegmentBoundaryType type;
1741
1742 r = add_op(reg, OP_TEXT_SEGMENT_BOUNDARY);
1743 if (r != 0) return r;
1744
1745 type = EXTENDED_GRAPHEME_CLUSTER_BOUNDARY;
1746 #ifdef USE_UNICODE_WORD_BREAK
1747 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_TEXT_SEGMENT_WORD))
1748 type = WORD_BOUNDARY;
1749 #endif
1750
1751 COP(reg)->text_segment_boundary.type = type;
1752 COP(reg)->text_segment_boundary.not =
1753 (node->type == ANCR_NO_TEXT_SEGMENT_BOUNDARY ? 1 : 0);
1754 }
1755 break;
1756
1757 case ANCR_PREC_READ:
1758 r = add_op(reg, OP_PREC_READ_START);
1759 if (r != 0) return r;
1760 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
1761 if (r != 0) return r;
1762 r = add_op(reg, OP_PREC_READ_END);
1763 break;
1764
1765 case ANCR_PREC_READ_NOT:
1766 len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1767 if (len < 0) return len;
1768
1769 r = add_op(reg, OP_PREC_READ_NOT_START);
1770 if (r != 0) return r;
1771 COP(reg)->prec_read_not_start.addr = SIZE_INC + len + OPSIZE_PREC_READ_NOT_END;
1772 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
1773 if (r != 0) return r;
1774 r = add_op(reg, OP_PREC_READ_NOT_END);
1775 break;
1776
1777 case ANCR_LOOK_BEHIND:
1778 {
1779 int n;
1780 r = add_op(reg, OP_LOOK_BEHIND);
1781 if (r != 0) return r;
1782 if (node->char_len < 0) {
1783 r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n);
1784 if (r != 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1785 }
1786 else
1787 n = node->char_len;
1788
1789 COP(reg)->look_behind.len = n;
1790 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
1791 }
1792 break;
1793
1794 case ANCR_LOOK_BEHIND_NOT:
1795 {
1796 int n;
1797
1798 len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1799 r = add_op(reg, OP_LOOK_BEHIND_NOT_START);
1800 if (r != 0) return r;
1801 COP(reg)->look_behind_not_start.addr = SIZE_INC + len + OPSIZE_LOOK_BEHIND_NOT_END;
1802
1803 if (node->char_len < 0) {
1804 r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n);
1805 if (r != 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1806 }
1807 else
1808 n = node->char_len;
1809
1810 COP(reg)->look_behind_not_start.len = n;
1811
1812 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
1813 if (r != 0) return r;
1814 r = add_op(reg, OP_LOOK_BEHIND_NOT_END);
1815 }
1816 break;
1817
1818 default:
1819 return ONIGERR_TYPE_BUG;
1820 break;
1821 }
1822
1823 return r;
1824 }
1825
1826 static int
compile_gimmick_node(GimmickNode * node,regex_t * reg)1827 compile_gimmick_node(GimmickNode* node, regex_t* reg)
1828 {
1829 int r;
1830
1831 switch (node->type) {
1832 case GIMMICK_FAIL:
1833 r = add_op(reg, OP_FAIL);
1834 break;
1835
1836 case GIMMICK_SAVE:
1837 r = add_op(reg, OP_PUSH_SAVE_VAL);
1838 if (r != 0) return r;
1839 COP(reg)->push_save_val.type = node->detail_type;
1840 COP(reg)->push_save_val.id = node->id;
1841 break;
1842
1843 case GIMMICK_UPDATE_VAR:
1844 r = add_op(reg, OP_UPDATE_VAR);
1845 if (r != 0) return r;
1846 COP(reg)->update_var.type = node->detail_type;
1847 COP(reg)->update_var.id = node->id;
1848 break;
1849
1850 #ifdef USE_CALLOUT
1851 case GIMMICK_CALLOUT:
1852 switch (node->detail_type) {
1853 case ONIG_CALLOUT_OF_CONTENTS:
1854 case ONIG_CALLOUT_OF_NAME:
1855 {
1856 if (node->detail_type == ONIG_CALLOUT_OF_NAME) {
1857 r = add_op(reg, OP_CALLOUT_NAME);
1858 if (r != 0) return r;
1859 COP(reg)->callout_name.id = node->id;
1860 COP(reg)->callout_name.num = node->num;
1861 }
1862 else {
1863 r = add_op(reg, OP_CALLOUT_CONTENTS);
1864 if (r != 0) return r;
1865 COP(reg)->callout_contents.num = node->num;
1866 }
1867 }
1868 break;
1869
1870 default:
1871 r = ONIGERR_TYPE_BUG;
1872 break;
1873 }
1874 #endif
1875 }
1876
1877 return r;
1878 }
1879
1880 static int
compile_length_gimmick_node(GimmickNode * node,regex_t * reg)1881 compile_length_gimmick_node(GimmickNode* node, regex_t* reg)
1882 {
1883 int len;
1884
1885 switch (node->type) {
1886 case GIMMICK_FAIL:
1887 len = OPSIZE_FAIL;
1888 break;
1889
1890 case GIMMICK_SAVE:
1891 len = OPSIZE_PUSH_SAVE_VAL;
1892 break;
1893
1894 case GIMMICK_UPDATE_VAR:
1895 len = OPSIZE_UPDATE_VAR;
1896 break;
1897
1898 #ifdef USE_CALLOUT
1899 case GIMMICK_CALLOUT:
1900 switch (node->detail_type) {
1901 case ONIG_CALLOUT_OF_CONTENTS:
1902 len = OPSIZE_CALLOUT_CONTENTS;
1903 break;
1904 case ONIG_CALLOUT_OF_NAME:
1905 len = OPSIZE_CALLOUT_NAME;
1906 break;
1907
1908 default:
1909 len = ONIGERR_TYPE_BUG;
1910 break;
1911 }
1912 break;
1913 #endif
1914 }
1915
1916 return len;
1917 }
1918
1919 static int
compile_length_tree(Node * node,regex_t * reg)1920 compile_length_tree(Node* node, regex_t* reg)
1921 {
1922 int len, r;
1923
1924 switch (NODE_TYPE(node)) {
1925 case NODE_LIST:
1926 len = 0;
1927 do {
1928 r = compile_length_tree(NODE_CAR(node), reg);
1929 if (r < 0) return r;
1930 len += r;
1931 } while (IS_NOT_NULL(node = NODE_CDR(node)));
1932 r = len;
1933 break;
1934
1935 case NODE_ALT:
1936 {
1937 int n;
1938
1939 n = r = 0;
1940 do {
1941 r += compile_length_tree(NODE_CAR(node), reg);
1942 n++;
1943 } while (IS_NOT_NULL(node = NODE_CDR(node)));
1944 r += (OPSIZE_PUSH + OPSIZE_JUMP) * (n - 1);
1945 }
1946 break;
1947
1948 case NODE_STRING:
1949 if (NODE_STRING_IS_CRUDE(node))
1950 r = compile_length_string_crude_node(STR_(node), reg);
1951 else
1952 r = compile_length_string_node(node, reg);
1953 break;
1954
1955 case NODE_CCLASS:
1956 r = compile_length_cclass_node(CCLASS_(node), reg);
1957 break;
1958
1959 case NODE_CTYPE:
1960 r = SIZE_OPCODE;
1961 break;
1962
1963 case NODE_BACKREF:
1964 r = OPSIZE_BACKREF;
1965 break;
1966
1967 #ifdef USE_CALL
1968 case NODE_CALL:
1969 r = OPSIZE_CALL;
1970 break;
1971 #endif
1972
1973 case NODE_QUANT:
1974 r = compile_length_quantifier_node(QUANT_(node), reg);
1975 break;
1976
1977 case NODE_BAG:
1978 r = compile_length_bag_node(BAG_(node), reg);
1979 break;
1980
1981 case NODE_ANCHOR:
1982 r = compile_length_anchor_node(ANCHOR_(node), reg);
1983 break;
1984
1985 case NODE_GIMMICK:
1986 r = compile_length_gimmick_node(GIMMICK_(node), reg);
1987 break;
1988
1989 default:
1990 return ONIGERR_TYPE_BUG;
1991 break;
1992 }
1993
1994 return r;
1995 }
1996
1997 static int
compile_tree(Node * node,regex_t * reg,ScanEnv * env)1998 compile_tree(Node* node, regex_t* reg, ScanEnv* env)
1999 {
2000 int n, len, pos, r = 0;
2001
2002 switch (NODE_TYPE(node)) {
2003 case NODE_LIST:
2004 do {
2005 r = compile_tree(NODE_CAR(node), reg, env);
2006 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2007 break;
2008
2009 case NODE_ALT:
2010 {
2011 Node* x = node;
2012 len = 0;
2013 do {
2014 len += compile_length_tree(NODE_CAR(x), reg);
2015 if (IS_NOT_NULL(NODE_CDR(x))) {
2016 len += OPSIZE_PUSH + OPSIZE_JUMP;
2017 }
2018 } while (IS_NOT_NULL(x = NODE_CDR(x)));
2019 pos = COP_CURR_OFFSET(reg) + 1 + len; /* goal position */
2020
2021 do {
2022 len = compile_length_tree(NODE_CAR(node), reg);
2023 if (IS_NOT_NULL(NODE_CDR(node))) {
2024 enum OpCode push = NODE_IS_SUPER(node) ? OP_PUSH_SUPER : OP_PUSH;
2025 r = add_op(reg, push);
2026 if (r != 0) break;
2027 COP(reg)->push.addr = SIZE_INC + len + OPSIZE_JUMP;
2028 }
2029 r = compile_tree(NODE_CAR(node), reg, env);
2030 if (r != 0) break;
2031 if (IS_NOT_NULL(NODE_CDR(node))) {
2032 len = pos - (COP_CURR_OFFSET(reg) + 1);
2033 r = add_op(reg, OP_JUMP);
2034 if (r != 0) break;
2035 COP(reg)->jump.addr = len;
2036 }
2037 } while (IS_NOT_NULL(node = NODE_CDR(node)));
2038 }
2039 break;
2040
2041 case NODE_STRING:
2042 if (NODE_STRING_IS_CRUDE(node))
2043 r = compile_string_crude_node(STR_(node), reg);
2044 else
2045 r = compile_string_node(node, reg);
2046 break;
2047
2048 case NODE_CCLASS:
2049 r = compile_cclass_node(CCLASS_(node), reg);
2050 break;
2051
2052 case NODE_CTYPE:
2053 {
2054 int op;
2055
2056 switch (CTYPE_(node)->ctype) {
2057 case CTYPE_ANYCHAR:
2058 r = add_op(reg, IS_MULTILINE(CTYPE_OPTION(node, reg)) ?
2059 OP_ANYCHAR_ML : OP_ANYCHAR);
2060 break;
2061
2062 case ONIGENC_CTYPE_WORD:
2063 if (CTYPE_(node)->ascii_mode == 0) {
2064 op = CTYPE_(node)->not != 0 ? OP_NO_WORD : OP_WORD;
2065 }
2066 else {
2067 op = CTYPE_(node)->not != 0 ? OP_NO_WORD_ASCII : OP_WORD_ASCII;
2068 }
2069 r = add_op(reg, op);
2070 break;
2071
2072 default:
2073 return ONIGERR_TYPE_BUG;
2074 break;
2075 }
2076 }
2077 break;
2078
2079 case NODE_BACKREF:
2080 {
2081 BackRefNode* br = BACKREF_(node);
2082
2083 if (NODE_IS_CHECKER(node)) {
2084 #ifdef USE_BACKREF_WITH_LEVEL
2085 if (NODE_IS_NEST_LEVEL(node)) {
2086 r = add_op(reg, OP_BACKREF_CHECK_WITH_LEVEL);
2087 if (r != 0) return r;
2088 COP(reg)->backref_general.nest_level = br->nest_level;
2089 }
2090 else
2091 #endif
2092 {
2093 r = add_op(reg, OP_BACKREF_CHECK);
2094 if (r != 0) return r;
2095 }
2096 goto add_bacref_mems;
2097 }
2098 else {
2099 #ifdef USE_BACKREF_WITH_LEVEL
2100 if (NODE_IS_NEST_LEVEL(node)) {
2101 if ((reg->options & ONIG_OPTION_IGNORECASE) != 0)
2102 r = add_op(reg, OP_BACKREF_WITH_LEVEL_IC);
2103 else
2104 r = add_op(reg, OP_BACKREF_WITH_LEVEL);
2105
2106 if (r != 0) return r;
2107 COP(reg)->backref_general.nest_level = br->nest_level;
2108 goto add_bacref_mems;
2109 }
2110 else
2111 #endif
2112 if (br->back_num == 1) {
2113 n = br->back_static[0];
2114 if (IS_IGNORECASE(reg->options)) {
2115 r = add_op(reg, OP_BACKREF_N_IC);
2116 if (r != 0) return r;
2117 COP(reg)->backref_n.n1 = n;
2118 }
2119 else {
2120 switch (n) {
2121 case 1: r = add_op(reg, OP_BACKREF1); break;
2122 case 2: r = add_op(reg, OP_BACKREF2); break;
2123 default:
2124 r = add_op(reg, OP_BACKREF_N);
2125 if (r != 0) return r;
2126 COP(reg)->backref_n.n1 = n;
2127 break;
2128 }
2129 }
2130 }
2131 else {
2132 int num;
2133 int* p;
2134
2135 r = add_op(reg, IS_IGNORECASE(reg->options) ?
2136 OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI);
2137 if (r != 0) return r;
2138
2139 add_bacref_mems:
2140 num = br->back_num;
2141 COP(reg)->backref_general.num = num;
2142 if (num == 1) {
2143 COP(reg)->backref_general.n1 = br->back_static[0];
2144 }
2145 else {
2146 int i, j;
2147 MemNumType* ns;
2148
2149 ns = xmalloc(sizeof(MemNumType) * num);
2150 CHECK_NULL_RETURN_MEMERR(ns);
2151 COP(reg)->backref_general.ns = ns;
2152 p = BACKREFS_P(br);
2153 for (i = num - 1, j = 0; i >= 0; i--, j++) {
2154 ns[j] = p[i];
2155 }
2156 }
2157 }
2158 }
2159 }
2160 break;
2161
2162 #ifdef USE_CALL
2163 case NODE_CALL:
2164 r = compile_call(CALL_(node), reg, env);
2165 break;
2166 #endif
2167
2168 case NODE_QUANT:
2169 r = compile_quantifier_node(QUANT_(node), reg, env);
2170 break;
2171
2172 case NODE_BAG:
2173 r = compile_bag_node(BAG_(node), reg, env);
2174 break;
2175
2176 case NODE_ANCHOR:
2177 r = compile_anchor_node(ANCHOR_(node), reg, env);
2178 break;
2179
2180 case NODE_GIMMICK:
2181 r = compile_gimmick_node(GIMMICK_(node), reg);
2182 break;
2183
2184 default:
2185 #ifdef ONIG_DEBUG
2186 fprintf(stderr, "compile_tree: undefined node type %d\n", NODE_TYPE(node));
2187 #endif
2188 break;
2189 }
2190
2191 return r;
2192 }
2193
2194 static int
noname_disable_map(Node ** plink,GroupNumRemap * map,int * counter)2195 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
2196 {
2197 int r = 0;
2198 Node* node = *plink;
2199
2200 switch (NODE_TYPE(node)) {
2201 case NODE_LIST:
2202 case NODE_ALT:
2203 do {
2204 r = noname_disable_map(&(NODE_CAR(node)), map, counter);
2205 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2206 break;
2207
2208 case NODE_QUANT:
2209 {
2210 Node** ptarget = &(NODE_BODY(node));
2211 Node* old = *ptarget;
2212 r = noname_disable_map(ptarget, map, counter);
2213 if (r != 0) return r;
2214 if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) {
2215 r = onig_reduce_nested_quantifier(node);
2216 }
2217 }
2218 break;
2219
2220 case NODE_BAG:
2221 {
2222 BagNode* en = BAG_(node);
2223 if (en->type == BAG_MEMORY) {
2224 if (NODE_IS_NAMED_GROUP(node)) {
2225 (*counter)++;
2226 map[en->m.regnum].new_val = *counter;
2227 en->m.regnum = *counter;
2228 r = noname_disable_map(&(NODE_BODY(node)), map, counter);
2229 }
2230 else {
2231 *plink = NODE_BODY(node);
2232 NODE_BODY(node) = NULL_NODE;
2233 onig_node_free(node);
2234 r = noname_disable_map(plink, map, counter);
2235 }
2236 }
2237 else if (en->type == BAG_IF_ELSE) {
2238 r = noname_disable_map(&(NODE_BAG_BODY(en)), map, counter);
2239 if (r != 0) return r;
2240 if (IS_NOT_NULL(en->te.Then)) {
2241 r = noname_disable_map(&(en->te.Then), map, counter);
2242 if (r != 0) return r;
2243 }
2244 if (IS_NOT_NULL(en->te.Else)) {
2245 r = noname_disable_map(&(en->te.Else), map, counter);
2246 if (r != 0) return r;
2247 }
2248 }
2249 else
2250 r = noname_disable_map(&(NODE_BODY(node)), map, counter);
2251 }
2252 break;
2253
2254 case NODE_ANCHOR:
2255 if (IS_NOT_NULL(NODE_BODY(node)))
2256 r = noname_disable_map(&(NODE_BODY(node)), map, counter);
2257 break;
2258
2259 default:
2260 break;
2261 }
2262
2263 return r;
2264 }
2265
2266 static int
renumber_node_backref(Node * node,GroupNumRemap * map)2267 renumber_node_backref(Node* node, GroupNumRemap* map)
2268 {
2269 int i, pos, n, old_num;
2270 int *backs;
2271 BackRefNode* bn = BACKREF_(node);
2272
2273 if (! NODE_IS_BY_NAME(node))
2274 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2275
2276 old_num = bn->back_num;
2277 if (IS_NULL(bn->back_dynamic))
2278 backs = bn->back_static;
2279 else
2280 backs = bn->back_dynamic;
2281
2282 for (i = 0, pos = 0; i < old_num; i++) {
2283 n = map[backs[i]].new_val;
2284 if (n > 0) {
2285 backs[pos] = n;
2286 pos++;
2287 }
2288 }
2289
2290 bn->back_num = pos;
2291 return 0;
2292 }
2293
2294 static int
renumber_by_map(Node * node,GroupNumRemap * map)2295 renumber_by_map(Node* node, GroupNumRemap* map)
2296 {
2297 int r = 0;
2298
2299 switch (NODE_TYPE(node)) {
2300 case NODE_LIST:
2301 case NODE_ALT:
2302 do {
2303 r = renumber_by_map(NODE_CAR(node), map);
2304 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2305 break;
2306
2307 case NODE_QUANT:
2308 r = renumber_by_map(NODE_BODY(node), map);
2309 break;
2310
2311 case NODE_BAG:
2312 {
2313 BagNode* en = BAG_(node);
2314
2315 r = renumber_by_map(NODE_BODY(node), map);
2316 if (r != 0) return r;
2317
2318 if (en->type == BAG_IF_ELSE) {
2319 if (IS_NOT_NULL(en->te.Then)) {
2320 r = renumber_by_map(en->te.Then, map);
2321 if (r != 0) return r;
2322 }
2323 if (IS_NOT_NULL(en->te.Else)) {
2324 r = renumber_by_map(en->te.Else, map);
2325 if (r != 0) return r;
2326 }
2327 }
2328 }
2329 break;
2330
2331 case NODE_BACKREF:
2332 r = renumber_node_backref(node, map);
2333 break;
2334
2335 case NODE_ANCHOR:
2336 if (IS_NOT_NULL(NODE_BODY(node)))
2337 r = renumber_by_map(NODE_BODY(node), map);
2338 break;
2339
2340 default:
2341 break;
2342 }
2343
2344 return r;
2345 }
2346
2347 static int
numbered_ref_check(Node * node)2348 numbered_ref_check(Node* node)
2349 {
2350 int r = 0;
2351
2352 switch (NODE_TYPE(node)) {
2353 case NODE_LIST:
2354 case NODE_ALT:
2355 do {
2356 r = numbered_ref_check(NODE_CAR(node));
2357 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2358 break;
2359
2360 case NODE_ANCHOR:
2361 if (IS_NULL(NODE_BODY(node)))
2362 break;
2363 /* fall */
2364 case NODE_QUANT:
2365 r = numbered_ref_check(NODE_BODY(node));
2366 break;
2367
2368 case NODE_BAG:
2369 {
2370 BagNode* en = BAG_(node);
2371
2372 r = numbered_ref_check(NODE_BODY(node));
2373 if (r != 0) return r;
2374
2375 if (en->type == BAG_IF_ELSE) {
2376 if (IS_NOT_NULL(en->te.Then)) {
2377 r = numbered_ref_check(en->te.Then);
2378 if (r != 0) return r;
2379 }
2380 if (IS_NOT_NULL(en->te.Else)) {
2381 r = numbered_ref_check(en->te.Else);
2382 if (r != 0) return r;
2383 }
2384 }
2385 }
2386
2387 break;
2388
2389 case NODE_BACKREF:
2390 if (! NODE_IS_BY_NAME(node))
2391 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2392 break;
2393
2394 default:
2395 break;
2396 }
2397
2398 return r;
2399 }
2400
2401 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)2402 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
2403 {
2404 int r, i, pos, counter;
2405 MemStatusType loc;
2406 GroupNumRemap* map;
2407
2408 map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
2409 CHECK_NULL_RETURN_MEMERR(map);
2410 for (i = 1; i <= env->num_mem; i++) {
2411 map[i].new_val = 0;
2412 }
2413 counter = 0;
2414 r = noname_disable_map(root, map, &counter);
2415 if (r != 0) return r;
2416
2417 r = renumber_by_map(*root, map);
2418 if (r != 0) return r;
2419
2420 for (i = 1, pos = 1; i <= env->num_mem; i++) {
2421 if (map[i].new_val > 0) {
2422 SCANENV_MEMENV(env)[pos] = SCANENV_MEMENV(env)[i];
2423 pos++;
2424 }
2425 }
2426
2427 loc = env->cap_history;
2428 MEM_STATUS_CLEAR(env->cap_history);
2429 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
2430 if (MEM_STATUS_AT(loc, i)) {
2431 MEM_STATUS_ON_SIMPLE(env->cap_history, map[i].new_val);
2432 }
2433 }
2434
2435 env->num_mem = env->num_named;
2436 reg->num_mem = env->num_named;
2437
2438 return onig_renumber_name_table(reg, map);
2439 }
2440
2441 #ifdef USE_CALL
2442 static int
fix_unset_addr_list(UnsetAddrList * uslist,regex_t * reg)2443 fix_unset_addr_list(UnsetAddrList* uslist, regex_t* reg)
2444 {
2445 int i, offset;
2446 BagNode* en;
2447 AbsAddrType addr;
2448 AbsAddrType* paddr;
2449
2450 for (i = 0; i < uslist->num; i++) {
2451 if (! NODE_IS_ADDR_FIXED(uslist->us[i].target))
2452 return ONIGERR_PARSER_BUG;
2453
2454 en = BAG_(uslist->us[i].target);
2455 addr = en->m.called_addr;
2456 offset = uslist->us[i].offset;
2457
2458 paddr = (AbsAddrType* )((char* )reg->ops + offset);
2459 *paddr = addr;
2460 }
2461 return 0;
2462 }
2463 #endif
2464
2465
2466 #define GET_CHAR_LEN_VARLEN -1
2467 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2468
2469 /* fixed size pattern node only */
2470 static int
get_char_len_node1(Node * node,regex_t * reg,int * len,int level)2471 get_char_len_node1(Node* node, regex_t* reg, int* len, int level)
2472 {
2473 int tlen;
2474 int r = 0;
2475
2476 level++;
2477 *len = 0;
2478 switch (NODE_TYPE(node)) {
2479 case NODE_LIST:
2480 do {
2481 r = get_char_len_node1(NODE_CAR(node), reg, &tlen, level);
2482 if (r == 0)
2483 *len = distance_add(*len, tlen);
2484 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2485 break;
2486
2487 case NODE_ALT:
2488 {
2489 int tlen2;
2490 int varlen = 0;
2491
2492 r = get_char_len_node1(NODE_CAR(node), reg, &tlen, level);
2493 while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))) {
2494 r = get_char_len_node1(NODE_CAR(node), reg, &tlen2, level);
2495 if (r == 0) {
2496 if (tlen != tlen2)
2497 varlen = 1;
2498 }
2499 }
2500 if (r == 0) {
2501 if (varlen != 0) {
2502 if (level == 1)
2503 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2504 else
2505 r = GET_CHAR_LEN_VARLEN;
2506 }
2507 else
2508 *len = tlen;
2509 }
2510 }
2511 break;
2512
2513 case NODE_STRING:
2514 {
2515 StrNode* sn = STR_(node);
2516 UChar *s = sn->s;
2517
2518 while (s < sn->end) {
2519 s += enclen(reg->enc, s);
2520 (*len)++;
2521 }
2522 }
2523 break;
2524
2525 case NODE_QUANT:
2526 {
2527 QuantNode* qn = QUANT_(node);
2528
2529 if (qn->lower == qn->upper) {
2530 if (qn->upper == 0) {
2531 *len = 0;
2532 }
2533 else {
2534 r = get_char_len_node1(NODE_BODY(node), reg, &tlen, level);
2535 if (r == 0)
2536 *len = distance_multiply(tlen, qn->lower);
2537 }
2538 }
2539 else
2540 r = GET_CHAR_LEN_VARLEN;
2541 }
2542 break;
2543
2544 #ifdef USE_CALL
2545 case NODE_CALL:
2546 if (! NODE_IS_RECURSION(node))
2547 r = get_char_len_node1(NODE_BODY(node), reg, len, level);
2548 else
2549 r = GET_CHAR_LEN_VARLEN;
2550 break;
2551 #endif
2552
2553 case NODE_CTYPE:
2554 case NODE_CCLASS:
2555 *len = 1;
2556 break;
2557
2558 case NODE_BAG:
2559 {
2560 BagNode* en = BAG_(node);
2561
2562 switch (en->type) {
2563 case BAG_MEMORY:
2564 #ifdef USE_CALL
2565 if (NODE_IS_CLEN_FIXED(node))
2566 *len = en->char_len;
2567 else {
2568 r = get_char_len_node1(NODE_BODY(node), reg, len, level);
2569 if (r == 0) {
2570 en->char_len = *len;
2571 NODE_STATUS_ADD(node, CLEN_FIXED);
2572 }
2573 }
2574 break;
2575 #endif
2576 case BAG_OPTION:
2577 case BAG_STOP_BACKTRACK:
2578 r = get_char_len_node1(NODE_BODY(node), reg, len, level);
2579 break;
2580 case BAG_IF_ELSE:
2581 {
2582 int clen, elen;
2583
2584 r = get_char_len_node1(NODE_BODY(node), reg, &clen, level);
2585 if (r == 0) {
2586 if (IS_NOT_NULL(en->te.Then)) {
2587 r = get_char_len_node1(en->te.Then, reg, &tlen, level);
2588 if (r != 0) break;
2589 }
2590 else tlen = 0;
2591 if (IS_NOT_NULL(en->te.Else)) {
2592 r = get_char_len_node1(en->te.Else, reg, &elen, level);
2593 if (r != 0) break;
2594 }
2595 else elen = 0;
2596
2597 if (clen + tlen != elen) {
2598 r = GET_CHAR_LEN_VARLEN;
2599 }
2600 else {
2601 *len = elen;
2602 }
2603 }
2604 }
2605 break;
2606 }
2607 }
2608 break;
2609
2610 case NODE_ANCHOR:
2611 case NODE_GIMMICK:
2612 break;
2613
2614 case NODE_BACKREF:
2615 if (NODE_IS_CHECKER(node))
2616 break;
2617 /* fall */
2618 default:
2619 r = GET_CHAR_LEN_VARLEN;
2620 break;
2621 }
2622
2623 return r;
2624 }
2625
2626 static int
get_char_len_node(Node * node,regex_t * reg,int * len)2627 get_char_len_node(Node* node, regex_t* reg, int* len)
2628 {
2629 return get_char_len_node1(node, reg, len, 0);
2630 }
2631
2632 /* x is not included y ==> 1 : 0 */
2633 static int
is_exclusive(Node * x,Node * y,regex_t * reg)2634 is_exclusive(Node* x, Node* y, regex_t* reg)
2635 {
2636 int i, len;
2637 OnigCodePoint code;
2638 UChar *p;
2639 NodeType ytype;
2640
2641 retry:
2642 ytype = NODE_TYPE(y);
2643 switch (NODE_TYPE(x)) {
2644 case NODE_CTYPE:
2645 {
2646 if (CTYPE_(x)->ctype == CTYPE_ANYCHAR ||
2647 CTYPE_(y)->ctype == CTYPE_ANYCHAR)
2648 break;
2649
2650 switch (ytype) {
2651 case NODE_CTYPE:
2652 if (CTYPE_(y)->ctype == CTYPE_(x)->ctype &&
2653 CTYPE_(y)->not != CTYPE_(x)->not &&
2654 CTYPE_(y)->ascii_mode == CTYPE_(x)->ascii_mode)
2655 return 1;
2656 else
2657 return 0;
2658 break;
2659
2660 case NODE_CCLASS:
2661 swap:
2662 {
2663 Node* tmp;
2664 tmp = x; x = y; y = tmp;
2665 goto retry;
2666 }
2667 break;
2668
2669 case NODE_STRING:
2670 goto swap;
2671 break;
2672
2673 default:
2674 break;
2675 }
2676 }
2677 break;
2678
2679 case NODE_CCLASS:
2680 {
2681 int range;
2682 CClassNode* xc = CCLASS_(x);
2683
2684 switch (ytype) {
2685 case NODE_CTYPE:
2686 switch (CTYPE_(y)->ctype) {
2687 case CTYPE_ANYCHAR:
2688 return 0;
2689 break;
2690
2691 case ONIGENC_CTYPE_WORD:
2692 if (CTYPE_(y)->not == 0) {
2693 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2694 range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
2695 for (i = 0; i < range; i++) {
2696 if (BITSET_AT(xc->bs, i)) {
2697 if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
2698 }
2699 }
2700 return 1;
2701 }
2702 return 0;
2703 }
2704 else {
2705 if (IS_NOT_NULL(xc->mbuf)) return 0;
2706 if (IS_NCCLASS_NOT(xc)) return 0;
2707
2708 range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
2709 for (i = 0; i < range; i++) {
2710 if (! ONIGENC_IS_CODE_WORD(reg->enc, i)) {
2711 if (BITSET_AT(xc->bs, i))
2712 return 0;
2713 }
2714 }
2715 for (i = range; i < SINGLE_BYTE_SIZE; i++) {
2716 if (BITSET_AT(xc->bs, i)) return 0;
2717 }
2718 return 1;
2719 }
2720 break;
2721
2722 default:
2723 break;
2724 }
2725 break;
2726
2727 case NODE_CCLASS:
2728 {
2729 int v;
2730 CClassNode* yc = CCLASS_(y);
2731
2732 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2733 v = BITSET_AT(xc->bs, i);
2734 if ((v != 0 && !IS_NCCLASS_NOT(xc)) || (v == 0 && IS_NCCLASS_NOT(xc))) {
2735 v = BITSET_AT(yc->bs, i);
2736 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2737 (v == 0 && IS_NCCLASS_NOT(yc)))
2738 return 0;
2739 }
2740 }
2741 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2742 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2743 return 1;
2744 return 0;
2745 }
2746 break;
2747
2748 case NODE_STRING:
2749 goto swap;
2750 break;
2751
2752 default:
2753 break;
2754 }
2755 }
2756 break;
2757
2758 case NODE_STRING:
2759 {
2760 StrNode* xs = STR_(x);
2761
2762 if (NODE_STRING_LEN(x) == 0)
2763 break;
2764
2765 switch (ytype) {
2766 case NODE_CTYPE:
2767 switch (CTYPE_(y)->ctype) {
2768 case CTYPE_ANYCHAR:
2769 break;
2770
2771 case ONIGENC_CTYPE_WORD:
2772 if (CTYPE_(y)->ascii_mode == 0) {
2773 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2774 return CTYPE_(y)->not;
2775 else
2776 return !(CTYPE_(y)->not);
2777 }
2778 else {
2779 if (ONIGENC_IS_MBC_WORD_ASCII(reg->enc, xs->s, xs->end))
2780 return CTYPE_(y)->not;
2781 else
2782 return !(CTYPE_(y)->not);
2783 }
2784 break;
2785 default:
2786 break;
2787 }
2788 break;
2789
2790 case NODE_CCLASS:
2791 {
2792 CClassNode* cc = CCLASS_(y);
2793
2794 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2795 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2796 return onig_is_code_in_cc(reg->enc, code, cc) == 0;
2797 }
2798 break;
2799
2800 case NODE_STRING:
2801 {
2802 UChar *q;
2803 StrNode* ys = STR_(y);
2804
2805 len = NODE_STRING_LEN(x);
2806 if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y);
2807 if (NODE_STRING_IS_CASE_FOLD_MATCH(x) || NODE_STRING_IS_CASE_FOLD_MATCH(y)) {
2808 /* tiny version */
2809 return 0;
2810 }
2811 else {
2812 for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2813 if (*p != *q) return 1;
2814 }
2815 }
2816 }
2817 break;
2818
2819 default:
2820 break;
2821 }
2822 }
2823 break;
2824
2825 default:
2826 break;
2827 }
2828
2829 return 0;
2830 }
2831
2832 static Node*
get_head_value_node(Node * node,int exact,regex_t * reg)2833 get_head_value_node(Node* node, int exact, regex_t* reg)
2834 {
2835 Node* n = NULL_NODE;
2836
2837 switch (NODE_TYPE(node)) {
2838 case NODE_BACKREF:
2839 case NODE_ALT:
2840 #ifdef USE_CALL
2841 case NODE_CALL:
2842 #endif
2843 break;
2844
2845 case NODE_CTYPE:
2846 if (CTYPE_(node)->ctype == CTYPE_ANYCHAR)
2847 break;
2848 /* fall */
2849 case NODE_CCLASS:
2850 if (exact == 0) {
2851 n = node;
2852 }
2853 break;
2854
2855 case NODE_LIST:
2856 n = get_head_value_node(NODE_CAR(node), exact, reg);
2857 break;
2858
2859 case NODE_STRING:
2860 {
2861 StrNode* sn = STR_(node);
2862
2863 if (sn->end <= sn->s)
2864 break;
2865
2866 if (exact == 0 ||
2867 ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_CRUDE(node)) {
2868 n = node;
2869 }
2870 }
2871 break;
2872
2873 case NODE_QUANT:
2874 {
2875 QuantNode* qn = QUANT_(node);
2876 if (qn->lower > 0) {
2877 if (IS_NOT_NULL(qn->head_exact))
2878 n = qn->head_exact;
2879 else
2880 n = get_head_value_node(NODE_BODY(node), exact, reg);
2881 }
2882 }
2883 break;
2884
2885 case NODE_BAG:
2886 {
2887 BagNode* en = BAG_(node);
2888 switch (en->type) {
2889 case BAG_OPTION:
2890 {
2891 OnigOptionType options = reg->options;
2892
2893 reg->options = BAG_(node)->o.options;
2894 n = get_head_value_node(NODE_BODY(node), exact, reg);
2895 reg->options = options;
2896 }
2897 break;
2898
2899 case BAG_MEMORY:
2900 case BAG_STOP_BACKTRACK:
2901 case BAG_IF_ELSE:
2902 n = get_head_value_node(NODE_BODY(node), exact, reg);
2903 break;
2904 }
2905 }
2906 break;
2907
2908 case NODE_ANCHOR:
2909 if (ANCHOR_(node)->type == ANCR_PREC_READ)
2910 n = get_head_value_node(NODE_BODY(node), exact, reg);
2911 break;
2912
2913 case NODE_GIMMICK:
2914 default:
2915 break;
2916 }
2917
2918 return n;
2919 }
2920
2921 static int
check_type_tree(Node * node,int type_mask,int bag_mask,int anchor_mask)2922 check_type_tree(Node* node, int type_mask, int bag_mask, int anchor_mask)
2923 {
2924 NodeType type;
2925 int r = 0;
2926
2927 type = NODE_TYPE(node);
2928 if ((NODE_TYPE2BIT(type) & type_mask) == 0)
2929 return 1;
2930
2931 switch (type) {
2932 case NODE_LIST:
2933 case NODE_ALT:
2934 do {
2935 r = check_type_tree(NODE_CAR(node), type_mask, bag_mask, anchor_mask);
2936 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2937 break;
2938
2939 case NODE_QUANT:
2940 r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask);
2941 break;
2942
2943 case NODE_BAG:
2944 {
2945 BagNode* en = BAG_(node);
2946 if (((1<<en->type) & bag_mask) == 0)
2947 return 1;
2948
2949 r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask);
2950 if (r == 0 && en->type == BAG_IF_ELSE) {
2951 if (IS_NOT_NULL(en->te.Then)) {
2952 r = check_type_tree(en->te.Then, type_mask, bag_mask, anchor_mask);
2953 if (r != 0) break;
2954 }
2955 if (IS_NOT_NULL(en->te.Else)) {
2956 r = check_type_tree(en->te.Else, type_mask, bag_mask, anchor_mask);
2957 }
2958 }
2959 }
2960 break;
2961
2962 case NODE_ANCHOR:
2963 type = ANCHOR_(node)->type;
2964 if ((type & anchor_mask) == 0)
2965 return 1;
2966
2967 if (IS_NOT_NULL(NODE_BODY(node)))
2968 r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask);
2969 break;
2970
2971 case NODE_GIMMICK:
2972 default:
2973 break;
2974 }
2975 return r;
2976 }
2977
2978 static OnigLen
tree_min_len(Node * node,ScanEnv * env)2979 tree_min_len(Node* node, ScanEnv* env)
2980 {
2981 OnigLen len;
2982 OnigLen tmin;
2983
2984 len = 0;
2985 switch (NODE_TYPE(node)) {
2986 case NODE_BACKREF:
2987 if (! NODE_IS_CHECKER(node)) {
2988 int i;
2989 int* backs;
2990 MemEnv* mem_env = SCANENV_MEMENV(env);
2991 BackRefNode* br = BACKREF_(node);
2992 if (NODE_IS_RECURSION(node)) break;
2993
2994 backs = BACKREFS_P(br);
2995 len = tree_min_len(mem_env[backs[0]].mem_node, env);
2996 for (i = 1; i < br->back_num; i++) {
2997 tmin = tree_min_len(mem_env[backs[i]].mem_node, env);
2998 if (len > tmin) len = tmin;
2999 }
3000 }
3001 break;
3002
3003 #ifdef USE_CALL
3004 case NODE_CALL:
3005 {
3006 Node* t = NODE_BODY(node);
3007 if (NODE_IS_RECURSION(node)) {
3008 if (NODE_IS_MIN_FIXED(t))
3009 len = BAG_(t)->min_len;
3010 }
3011 else
3012 len = tree_min_len(t, env);
3013 }
3014 break;
3015 #endif
3016
3017 case NODE_LIST:
3018 do {
3019 tmin = tree_min_len(NODE_CAR(node), env);
3020 len = distance_add(len, tmin);
3021 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3022 break;
3023
3024 case NODE_ALT:
3025 {
3026 Node *x, *y;
3027 y = node;
3028 do {
3029 x = NODE_CAR(y);
3030 tmin = tree_min_len(x, env);
3031 if (y == node) len = tmin;
3032 else if (len > tmin) len = tmin;
3033 } while (IS_NOT_NULL(y = NODE_CDR(y)));
3034 }
3035 break;
3036
3037 case NODE_STRING:
3038 {
3039 StrNode* sn = STR_(node);
3040 len = (int )(sn->end - sn->s);
3041 }
3042 break;
3043
3044 case NODE_CTYPE:
3045 case NODE_CCLASS:
3046 len = ONIGENC_MBC_MINLEN(env->enc);
3047 break;
3048
3049 case NODE_QUANT:
3050 {
3051 QuantNode* qn = QUANT_(node);
3052
3053 if (qn->lower > 0) {
3054 len = tree_min_len(NODE_BODY(node), env);
3055 len = distance_multiply(len, qn->lower);
3056 }
3057 }
3058 break;
3059
3060 case NODE_BAG:
3061 {
3062 BagNode* en = BAG_(node);
3063 switch (en->type) {
3064 case BAG_MEMORY:
3065 if (NODE_IS_MIN_FIXED(node))
3066 len = en->min_len;
3067 else {
3068 if (NODE_IS_MARK1(node))
3069 len = 0; /* recursive */
3070 else {
3071 NODE_STATUS_ADD(node, MARK1);
3072 len = tree_min_len(NODE_BODY(node), env);
3073 NODE_STATUS_REMOVE(node, MARK1);
3074
3075 en->min_len = len;
3076 NODE_STATUS_ADD(node, MIN_FIXED);
3077 }
3078 }
3079 break;
3080
3081 case BAG_OPTION:
3082 case BAG_STOP_BACKTRACK:
3083 len = tree_min_len(NODE_BODY(node), env);
3084 break;
3085 case BAG_IF_ELSE:
3086 {
3087 OnigLen elen;
3088
3089 len = tree_min_len(NODE_BODY(node), env);
3090 if (IS_NOT_NULL(en->te.Then))
3091 len += tree_min_len(en->te.Then, env);
3092 if (IS_NOT_NULL(en->te.Else))
3093 elen = tree_min_len(en->te.Else, env);
3094 else elen = 0;
3095
3096 if (elen < len) len = elen;
3097 }
3098 break;
3099 }
3100 }
3101 break;
3102
3103 case NODE_GIMMICK:
3104 {
3105 GimmickNode* g = GIMMICK_(node);
3106 if (g->type == GIMMICK_FAIL) {
3107 len = INFINITE_LEN;
3108 break;
3109 }
3110 }
3111 /* fall */
3112 case NODE_ANCHOR:
3113 default:
3114 break;
3115 }
3116
3117 return len;
3118 }
3119
3120 static OnigLen
tree_max_len(Node * node,ScanEnv * env)3121 tree_max_len(Node* node, ScanEnv* env)
3122 {
3123 OnigLen len;
3124 OnigLen tmax;
3125
3126 len = 0;
3127 switch (NODE_TYPE(node)) {
3128 case NODE_LIST:
3129 do {
3130 tmax = tree_max_len(NODE_CAR(node), env);
3131 len = distance_add(len, tmax);
3132 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3133 break;
3134
3135 case NODE_ALT:
3136 do {
3137 tmax = tree_max_len(NODE_CAR(node), env);
3138 if (len < tmax) len = tmax;
3139 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3140 break;
3141
3142 case NODE_STRING:
3143 {
3144 StrNode* sn = STR_(node);
3145 len = (OnigLen )(sn->end - sn->s);
3146 }
3147 break;
3148
3149 case NODE_CTYPE:
3150 case NODE_CCLASS:
3151 len = ONIGENC_MBC_MAXLEN_DIST(env->enc);
3152 break;
3153
3154 case NODE_BACKREF:
3155 if (! NODE_IS_CHECKER(node)) {
3156 int i;
3157 int* backs;
3158 MemEnv* mem_env = SCANENV_MEMENV(env);
3159 BackRefNode* br = BACKREF_(node);
3160 if (NODE_IS_RECURSION(node)) {
3161 len = INFINITE_LEN;
3162 break;
3163 }
3164 backs = BACKREFS_P(br);
3165 for (i = 0; i < br->back_num; i++) {
3166 tmax = tree_max_len(mem_env[backs[i]].mem_node, env);
3167 if (len < tmax) len = tmax;
3168 }
3169 }
3170 break;
3171
3172 #ifdef USE_CALL
3173 case NODE_CALL:
3174 if (! NODE_IS_RECURSION(node))
3175 len = tree_max_len(NODE_BODY(node), env);
3176 else
3177 len = INFINITE_LEN;
3178 break;
3179 #endif
3180
3181 case NODE_QUANT:
3182 {
3183 QuantNode* qn = QUANT_(node);
3184
3185 if (qn->upper != 0) {
3186 len = tree_max_len(NODE_BODY(node), env);
3187 if (len != 0) {
3188 if (! IS_INFINITE_REPEAT(qn->upper))
3189 len = distance_multiply(len, qn->upper);
3190 else
3191 len = INFINITE_LEN;
3192 }
3193 }
3194 }
3195 break;
3196
3197 case NODE_BAG:
3198 {
3199 BagNode* en = BAG_(node);
3200 switch (en->type) {
3201 case BAG_MEMORY:
3202 if (NODE_IS_MAX_FIXED(node))
3203 len = en->max_len;
3204 else {
3205 if (NODE_IS_MARK1(node))
3206 len = INFINITE_LEN;
3207 else {
3208 NODE_STATUS_ADD(node, MARK1);
3209 len = tree_max_len(NODE_BODY(node), env);
3210 NODE_STATUS_REMOVE(node, MARK1);
3211
3212 en->max_len = len;
3213 NODE_STATUS_ADD(node, MAX_FIXED);
3214 }
3215 }
3216 break;
3217
3218 case BAG_OPTION:
3219 case BAG_STOP_BACKTRACK:
3220 len = tree_max_len(NODE_BODY(node), env);
3221 break;
3222 case BAG_IF_ELSE:
3223 {
3224 OnigLen tlen, elen;
3225
3226 len = tree_max_len(NODE_BODY(node), env);
3227 if (IS_NOT_NULL(en->te.Then)) {
3228 tlen = tree_max_len(en->te.Then, env);
3229 len = distance_add(len, tlen);
3230 }
3231 if (IS_NOT_NULL(en->te.Else))
3232 elen = tree_max_len(en->te.Else, env);
3233 else elen = 0;
3234
3235 if (elen > len) len = elen;
3236 }
3237 break;
3238 }
3239 }
3240 break;
3241
3242 case NODE_ANCHOR:
3243 case NODE_GIMMICK:
3244 default:
3245 break;
3246 }
3247
3248 return len;
3249 }
3250
3251 static int
check_backrefs(Node * node,ScanEnv * env)3252 check_backrefs(Node* node, ScanEnv* env)
3253 {
3254 int r;
3255
3256 switch (NODE_TYPE(node)) {
3257 case NODE_LIST:
3258 case NODE_ALT:
3259 do {
3260 r = check_backrefs(NODE_CAR(node), env);
3261 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3262 break;
3263
3264 case NODE_ANCHOR:
3265 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3266 r = 0;
3267 break;
3268 }
3269 /* fall */
3270 case NODE_QUANT:
3271 r = check_backrefs(NODE_BODY(node), env);
3272 break;
3273
3274 case NODE_BAG:
3275 r = check_backrefs(NODE_BODY(node), env);
3276 {
3277 BagNode* en = BAG_(node);
3278
3279 if (en->type == BAG_IF_ELSE) {
3280 if (r != 0) return r;
3281 if (IS_NOT_NULL(en->te.Then)) {
3282 r = check_backrefs(en->te.Then, env);
3283 if (r != 0) return r;
3284 }
3285 if (IS_NOT_NULL(en->te.Else)) {
3286 r = check_backrefs(en->te.Else, env);
3287 }
3288 }
3289 }
3290 break;
3291
3292 case NODE_BACKREF:
3293 {
3294 int i;
3295 BackRefNode* br = BACKREF_(node);
3296 int* backs = BACKREFS_P(br);
3297 MemEnv* mem_env = SCANENV_MEMENV(env);
3298
3299 for (i = 0; i < br->back_num; i++) {
3300 if (backs[i] > env->num_mem)
3301 return ONIGERR_INVALID_BACKREF;
3302
3303 NODE_STATUS_ADD(mem_env[backs[i]].mem_node, BACKREF);
3304 }
3305 r = 0;
3306 }
3307 break;
3308
3309 default:
3310 r = 0;
3311 break;
3312 }
3313
3314 return r;
3315 }
3316
3317 static int
set_empty_repeat_node_trav(Node * node,Node * empty,ScanEnv * env)3318 set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env)
3319 {
3320 int r;
3321
3322 switch (NODE_TYPE(node)) {
3323 case NODE_LIST:
3324 case NODE_ALT:
3325 do {
3326 r = set_empty_repeat_node_trav(NODE_CAR(node), empty, env);
3327 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3328 break;
3329
3330 case NODE_ANCHOR:
3331 {
3332 AnchorNode* an = ANCHOR_(node);
3333
3334 if (! ANCHOR_HAS_BODY(an)) {
3335 r = 0;
3336 break;
3337 }
3338
3339 switch (an->type) {
3340 case ANCR_PREC_READ:
3341 case ANCR_LOOK_BEHIND:
3342 empty = NULL_NODE;
3343 break;
3344 default:
3345 break;
3346 }
3347 r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3348 }
3349 break;
3350
3351 case NODE_QUANT:
3352 {
3353 QuantNode* qn = QUANT_(node);
3354
3355 if (qn->emptiness != BODY_IS_NOT_EMPTY) empty = node;
3356 r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3357 }
3358 break;
3359
3360 case NODE_BAG:
3361 if (IS_NOT_NULL(NODE_BODY(node))) {
3362 r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3363 if (r != 0) return r;
3364 }
3365 {
3366 BagNode* en = BAG_(node);
3367
3368 if (en->type == BAG_MEMORY) {
3369 if (NODE_IS_BACKREF(node)) {
3370 if (IS_NOT_NULL(empty))
3371 SCANENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty;
3372 }
3373 }
3374 else if (en->type == BAG_IF_ELSE) {
3375 if (IS_NOT_NULL(en->te.Then)) {
3376 r = set_empty_repeat_node_trav(en->te.Then, empty, env);
3377 if (r != 0) return r;
3378 }
3379 if (IS_NOT_NULL(en->te.Else)) {
3380 r = set_empty_repeat_node_trav(en->te.Else, empty, env);
3381 }
3382 }
3383 }
3384 break;
3385
3386 default:
3387 r = 0;
3388 break;
3389 }
3390
3391 return r;
3392 }
3393
3394 static int
is_ancestor_node(Node * node,Node * me)3395 is_ancestor_node(Node* node, Node* me)
3396 {
3397 Node* parent;
3398
3399 while ((parent = NODE_PARENT(me)) != NULL_NODE) {
3400 if (parent == node) return 1;
3401 me = parent;
3402 }
3403 return 0;
3404 }
3405
3406 static void
set_empty_status_check_trav(Node * node,ScanEnv * env)3407 set_empty_status_check_trav(Node* node, ScanEnv* env)
3408 {
3409 switch (NODE_TYPE(node)) {
3410 case NODE_LIST:
3411 case NODE_ALT:
3412 do {
3413 set_empty_status_check_trav(NODE_CAR(node), env);
3414 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3415 break;
3416
3417 case NODE_ANCHOR:
3418 {
3419 AnchorNode* an = ANCHOR_(node);
3420
3421 if (! ANCHOR_HAS_BODY(an)) break;
3422 set_empty_status_check_trav(NODE_BODY(node), env);
3423 }
3424 break;
3425
3426 case NODE_QUANT:
3427 set_empty_status_check_trav(NODE_BODY(node), env);
3428 break;
3429
3430 case NODE_BAG:
3431 if (IS_NOT_NULL(NODE_BODY(node)))
3432 set_empty_status_check_trav(NODE_BODY(node), env);
3433 {
3434 BagNode* en = BAG_(node);
3435
3436 if (en->type == BAG_IF_ELSE) {
3437 if (IS_NOT_NULL(en->te.Then)) {
3438 set_empty_status_check_trav(en->te.Then, env);
3439 }
3440 if (IS_NOT_NULL(en->te.Else)) {
3441 set_empty_status_check_trav(en->te.Else, env);
3442 }
3443 }
3444 }
3445 break;
3446
3447 case NODE_BACKREF:
3448 {
3449 int i;
3450 int* backs;
3451 MemEnv* mem_env = SCANENV_MEMENV(env);
3452 BackRefNode* br = BACKREF_(node);
3453 backs = BACKREFS_P(br);
3454 for (i = 0; i < br->back_num; i++) {
3455 Node* ernode = mem_env[backs[i]].empty_repeat_node;
3456 if (IS_NOT_NULL(ernode)) {
3457 if (! is_ancestor_node(ernode, node)) {
3458 MEM_STATUS_LIMIT_ON(env->reg->empty_status_mem, backs[i]);
3459 NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK);
3460 NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK);
3461 }
3462 }
3463 }
3464 }
3465 break;
3466
3467 default:
3468 break;
3469 }
3470 }
3471
3472 static void
set_parent_node_trav(Node * node,Node * parent)3473 set_parent_node_trav(Node* node, Node* parent)
3474 {
3475 NODE_PARENT(node) = parent;
3476
3477 switch (NODE_TYPE(node)) {
3478 case NODE_LIST:
3479 case NODE_ALT:
3480 do {
3481 set_parent_node_trav(NODE_CAR(node), node);
3482 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3483 break;
3484
3485 case NODE_ANCHOR:
3486 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) break;
3487 set_parent_node_trav(NODE_BODY(node), node);
3488 break;
3489
3490 case NODE_QUANT:
3491 set_parent_node_trav(NODE_BODY(node), node);
3492 break;
3493
3494 case NODE_BAG:
3495 if (IS_NOT_NULL(NODE_BODY(node)))
3496 set_parent_node_trav(NODE_BODY(node), node);
3497 {
3498 BagNode* en = BAG_(node);
3499
3500 if (en->type == BAG_IF_ELSE) {
3501 if (IS_NOT_NULL(en->te.Then))
3502 set_parent_node_trav(en->te.Then, node);
3503 if (IS_NOT_NULL(en->te.Else)) {
3504 set_parent_node_trav(en->te.Else, node);
3505 }
3506 }
3507 }
3508 break;
3509
3510 default:
3511 break;
3512 }
3513 }
3514
3515
3516 #ifdef USE_CALL
3517
3518 #define RECURSION_EXIST (1<<0)
3519 #define RECURSION_MUST (1<<1)
3520 #define RECURSION_INFINITE (1<<2)
3521
3522 static int
infinite_recursive_call_check(Node * node,ScanEnv * env,int head)3523 infinite_recursive_call_check(Node* node, ScanEnv* env, int head)
3524 {
3525 int ret;
3526 int r = 0;
3527
3528 switch (NODE_TYPE(node)) {
3529 case NODE_LIST:
3530 {
3531 Node *x;
3532 OnigLen min;
3533
3534 x = node;
3535 do {
3536 ret = infinite_recursive_call_check(NODE_CAR(x), env, head);
3537 if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
3538 r |= ret;
3539 if (head != 0) {
3540 min = tree_min_len(NODE_CAR(x), env);
3541 if (min != 0) head = 0;
3542 }
3543 } while (IS_NOT_NULL(x = NODE_CDR(x)));
3544 }
3545 break;
3546
3547 case NODE_ALT:
3548 {
3549 int must;
3550
3551 must = RECURSION_MUST;
3552 do {
3553 ret = infinite_recursive_call_check(NODE_CAR(node), env, head);
3554 if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
3555
3556 r |= (ret & RECURSION_EXIST);
3557 must &= ret;
3558 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3559 r |= must;
3560 }
3561 break;
3562
3563 case NODE_QUANT:
3564 r = infinite_recursive_call_check(NODE_BODY(node), env, head);
3565 if (r < 0) return r;
3566 if ((r & RECURSION_MUST) != 0) {
3567 if (QUANT_(node)->lower == 0)
3568 r &= ~RECURSION_MUST;
3569 }
3570 break;
3571
3572 case NODE_ANCHOR:
3573 if (! ANCHOR_HAS_BODY(ANCHOR_(node)))
3574 break;
3575 /* fall */
3576 case NODE_CALL:
3577 r = infinite_recursive_call_check(NODE_BODY(node), env, head);
3578 break;
3579
3580 case NODE_BAG:
3581 {
3582 BagNode* en = BAG_(node);
3583
3584 if (en->type == BAG_MEMORY) {
3585 if (NODE_IS_MARK2(node))
3586 return 0;
3587 else if (NODE_IS_MARK1(node))
3588 return (head == 0 ? RECURSION_EXIST | RECURSION_MUST
3589 : RECURSION_EXIST | RECURSION_MUST | RECURSION_INFINITE);
3590 else {
3591 NODE_STATUS_ADD(node, MARK2);
3592 r = infinite_recursive_call_check(NODE_BODY(node), env, head);
3593 NODE_STATUS_REMOVE(node, MARK2);
3594 }
3595 }
3596 else if (en->type == BAG_IF_ELSE) {
3597 int eret;
3598
3599 ret = infinite_recursive_call_check(NODE_BODY(node), env, head);
3600 if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
3601 r |= ret;
3602 if (IS_NOT_NULL(en->te.Then)) {
3603 OnigLen min;
3604 if (head != 0) {
3605 min = tree_min_len(NODE_BODY(node), env);
3606 }
3607 else min = 0;
3608
3609 ret = infinite_recursive_call_check(en->te.Then, env, min != 0 ? 0:head);
3610 if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
3611 r |= ret;
3612 }
3613 if (IS_NOT_NULL(en->te.Else)) {
3614 eret = infinite_recursive_call_check(en->te.Else, env, head);
3615 if (eret < 0 || (eret & RECURSION_INFINITE) != 0) return eret;
3616 r |= (eret & RECURSION_EXIST);
3617 if ((eret & RECURSION_MUST) == 0)
3618 r &= ~RECURSION_MUST;
3619 }
3620 else {
3621 r &= ~RECURSION_MUST;
3622 }
3623 }
3624 else {
3625 r = infinite_recursive_call_check(NODE_BODY(node), env, head);
3626 }
3627 }
3628 break;
3629
3630 default:
3631 break;
3632 }
3633
3634 return r;
3635 }
3636
3637 static int
infinite_recursive_call_check_trav(Node * node,ScanEnv * env)3638 infinite_recursive_call_check_trav(Node* node, ScanEnv* env)
3639 {
3640 int r;
3641
3642 switch (NODE_TYPE(node)) {
3643 case NODE_LIST:
3644 case NODE_ALT:
3645 do {
3646 r = infinite_recursive_call_check_trav(NODE_CAR(node), env);
3647 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3648 break;
3649
3650 case NODE_ANCHOR:
3651 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3652 r = 0;
3653 break;
3654 }
3655 /* fall */
3656 case NODE_QUANT:
3657 r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
3658 break;
3659
3660 case NODE_BAG:
3661 {
3662 BagNode* en = BAG_(node);
3663
3664 if (en->type == BAG_MEMORY) {
3665 if (NODE_IS_RECURSION(node) && NODE_IS_CALLED(node)) {
3666 int ret;
3667
3668 NODE_STATUS_ADD(node, MARK1);
3669
3670 ret = infinite_recursive_call_check(NODE_BODY(node), env, 1);
3671 if (ret < 0) return ret;
3672 else if ((ret & (RECURSION_MUST | RECURSION_INFINITE)) != 0)
3673 return ONIGERR_NEVER_ENDING_RECURSION;
3674
3675 NODE_STATUS_REMOVE(node, MARK1);
3676 }
3677 }
3678 else if (en->type == BAG_IF_ELSE) {
3679 if (IS_NOT_NULL(en->te.Then)) {
3680 r = infinite_recursive_call_check_trav(en->te.Then, env);
3681 if (r != 0) return r;
3682 }
3683 if (IS_NOT_NULL(en->te.Else)) {
3684 r = infinite_recursive_call_check_trav(en->te.Else, env);
3685 if (r != 0) return r;
3686 }
3687 }
3688 }
3689
3690 r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
3691 break;
3692
3693 default:
3694 r = 0;
3695 break;
3696 }
3697
3698 return r;
3699 }
3700
3701 static int
recursive_call_check(Node * node)3702 recursive_call_check(Node* node)
3703 {
3704 int r;
3705
3706 switch (NODE_TYPE(node)) {
3707 case NODE_LIST:
3708 case NODE_ALT:
3709 r = 0;
3710 do {
3711 r |= recursive_call_check(NODE_CAR(node));
3712 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3713 break;
3714
3715 case NODE_ANCHOR:
3716 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3717 r = 0;
3718 break;
3719 }
3720 /* fall */
3721 case NODE_QUANT:
3722 r = recursive_call_check(NODE_BODY(node));
3723 break;
3724
3725 case NODE_CALL:
3726 r = recursive_call_check(NODE_BODY(node));
3727 if (r != 0) {
3728 if (NODE_IS_MARK1(NODE_BODY(node)))
3729 NODE_STATUS_ADD(node, RECURSION);
3730 }
3731 break;
3732
3733 case NODE_BAG:
3734 {
3735 BagNode* en = BAG_(node);
3736
3737 if (en->type == BAG_MEMORY) {
3738 if (NODE_IS_MARK2(node))
3739 return 0;
3740 else if (NODE_IS_MARK1(node))
3741 return 1; /* recursion */
3742 else {
3743 NODE_STATUS_ADD(node, MARK2);
3744 r = recursive_call_check(NODE_BODY(node));
3745 NODE_STATUS_REMOVE(node, MARK2);
3746 }
3747 }
3748 else if (en->type == BAG_IF_ELSE) {
3749 r = 0;
3750 if (IS_NOT_NULL(en->te.Then)) {
3751 r |= recursive_call_check(en->te.Then);
3752 }
3753 if (IS_NOT_NULL(en->te.Else)) {
3754 r |= recursive_call_check(en->te.Else);
3755 }
3756 r |= recursive_call_check(NODE_BODY(node));
3757 }
3758 else {
3759 r = recursive_call_check(NODE_BODY(node));
3760 }
3761 }
3762 break;
3763
3764 default:
3765 r = 0;
3766 break;
3767 }
3768
3769 return r;
3770 }
3771
3772 #define IN_RECURSION (1<<0)
3773 #define FOUND_CALLED_NODE 1
3774
3775 static int
recursive_call_check_trav(Node * node,ScanEnv * env,int state)3776 recursive_call_check_trav(Node* node, ScanEnv* env, int state)
3777 {
3778 int r = 0;
3779
3780 switch (NODE_TYPE(node)) {
3781 case NODE_LIST:
3782 case NODE_ALT:
3783 {
3784 int ret;
3785 do {
3786 ret = recursive_call_check_trav(NODE_CAR(node), env, state);
3787 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
3788 else if (ret < 0) return ret;
3789 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3790 }
3791 break;
3792
3793 case NODE_QUANT:
3794 r = recursive_call_check_trav(NODE_BODY(node), env, state);
3795 if (QUANT_(node)->upper == 0) {
3796 if (r == FOUND_CALLED_NODE)
3797 QUANT_(node)->include_referred = 1;
3798 }
3799 break;
3800
3801 case NODE_ANCHOR:
3802 {
3803 AnchorNode* an = ANCHOR_(node);
3804 if (ANCHOR_HAS_BODY(an))
3805 r = recursive_call_check_trav(NODE_ANCHOR_BODY(an), env, state);
3806 }
3807 break;
3808
3809 case NODE_BAG:
3810 {
3811 int ret;
3812 int state1;
3813 BagNode* en = BAG_(node);
3814
3815 if (en->type == BAG_MEMORY) {
3816 if (NODE_IS_CALLED(node) || (state & IN_RECURSION) != 0) {
3817 if (! NODE_IS_RECURSION(node)) {
3818 NODE_STATUS_ADD(node, MARK1);
3819 r = recursive_call_check(NODE_BODY(node));
3820 if (r != 0) {
3821 NODE_STATUS_ADD(node, RECURSION);
3822 MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
3823 }
3824 NODE_STATUS_REMOVE(node, MARK1);
3825 }
3826
3827 if (NODE_IS_CALLED(node))
3828 r = FOUND_CALLED_NODE;
3829 }
3830 }
3831
3832 state1 = state;
3833 if (NODE_IS_RECURSION(node))
3834 state1 |= IN_RECURSION;
3835
3836 ret = recursive_call_check_trav(NODE_BODY(node), env, state1);
3837 if (ret == FOUND_CALLED_NODE)
3838 r = FOUND_CALLED_NODE;
3839
3840 if (en->type == BAG_IF_ELSE) {
3841 if (IS_NOT_NULL(en->te.Then)) {
3842 ret = recursive_call_check_trav(en->te.Then, env, state1);
3843 if (ret == FOUND_CALLED_NODE)
3844 r = FOUND_CALLED_NODE;
3845 }
3846 if (IS_NOT_NULL(en->te.Else)) {
3847 ret = recursive_call_check_trav(en->te.Else, env, state1);
3848 if (ret == FOUND_CALLED_NODE)
3849 r = FOUND_CALLED_NODE;
3850 }
3851 }
3852 }
3853 break;
3854
3855 default:
3856 break;
3857 }
3858
3859 return r;
3860 }
3861
3862 #endif
3863
3864 static void
remove_from_list(Node * prev,Node * a)3865 remove_from_list(Node* prev, Node* a)
3866 {
3867 if (NODE_CDR(prev) != a) return ;
3868
3869 NODE_CDR(prev) = NODE_CDR(a);
3870 NODE_CDR(a) = NULL_NODE;
3871 }
3872
3873 static int
reduce_string_list(Node * node)3874 reduce_string_list(Node* node)
3875 {
3876 int r = 0;
3877
3878 switch (NODE_TYPE(node)) {
3879 case NODE_LIST:
3880 {
3881 Node* prev;
3882 Node* curr;
3883 Node* prev_node;
3884 Node* next_node;
3885
3886 prev = NULL_NODE;
3887 do {
3888 next_node = NODE_CDR(node);
3889 curr = NODE_CAR(node);
3890 if (NODE_TYPE(curr) == NODE_STRING) {
3891 if (IS_NULL(prev) || STR_(curr)->flag != STR_(prev)->flag) {
3892 prev = curr;
3893 prev_node = node;
3894 }
3895 else {
3896 r = node_str_node_cat(prev, curr);
3897 if (r != 0) return r;
3898 remove_from_list(prev_node, node);
3899 onig_node_free(node);
3900 }
3901 }
3902 else {
3903 prev = NULL_NODE;
3904 prev_node = node;
3905 }
3906
3907 node = next_node;
3908 } while (r == 0 && IS_NOT_NULL(node));
3909 }
3910 break;
3911
3912 case NODE_ALT:
3913 do {
3914 r = reduce_string_list(NODE_CAR(node));
3915 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3916 break;
3917
3918 case NODE_ANCHOR:
3919 if (IS_NULL(NODE_BODY(node)))
3920 break;
3921 /* fall */
3922 case NODE_QUANT:
3923 r = reduce_string_list(NODE_BODY(node));
3924 break;
3925
3926 case NODE_BAG:
3927 {
3928 BagNode* en = BAG_(node);
3929
3930 r = reduce_string_list(NODE_BODY(node));
3931 if (r != 0) return r;
3932
3933 if (en->type == BAG_IF_ELSE) {
3934 if (IS_NOT_NULL(en->te.Then)) {
3935 r = reduce_string_list(en->te.Then);
3936 if (r != 0) return r;
3937 }
3938 if (IS_NOT_NULL(en->te.Else)) {
3939 r = reduce_string_list(en->te.Else);
3940 if (r != 0) return r;
3941 }
3942 }
3943 }
3944 break;
3945
3946 default:
3947 break;
3948 }
3949
3950 return r;
3951 }
3952
3953
3954 #define IN_ALT (1<<0)
3955 #define IN_NOT (1<<1)
3956 #define IN_REAL_REPEAT (1<<2)
3957 #define IN_VAR_REPEAT (1<<3)
3958 #define IN_ZERO_REPEAT (1<<4)
3959 #define IN_MULTI_ENTRY (1<<5)
3960 #define IN_LOOK_BEHIND (1<<6)
3961
3962
3963 /* divide different length alternatives in look-behind.
3964 (?<=A|B) ==> (?<=A)|(?<=B)
3965 (?<!A|B) ==> (?<!A)(?<!B)
3966 */
3967 static int
divide_look_behind_alternatives(Node * node)3968 divide_look_behind_alternatives(Node* node)
3969 {
3970 Node *head, *np, *insert_node;
3971 AnchorNode* an = ANCHOR_(node);
3972 int anc_type = an->type;
3973
3974 head = NODE_ANCHOR_BODY(an);
3975 np = NODE_CAR(head);
3976 node_swap(node, head);
3977 NODE_CAR(node) = head;
3978 NODE_BODY(head) = np;
3979
3980 np = node;
3981 while (IS_NOT_NULL(np = NODE_CDR(np))) {
3982 insert_node = onig_node_new_anchor(anc_type, an->ascii_mode);
3983 CHECK_NULL_RETURN_MEMERR(insert_node);
3984 NODE_BODY(insert_node) = NODE_CAR(np);
3985 NODE_CAR(np) = insert_node;
3986 }
3987
3988 if (anc_type == ANCR_LOOK_BEHIND_NOT) {
3989 np = node;
3990 do {
3991 NODE_SET_TYPE(np, NODE_LIST); /* alt -> list */
3992 } while (IS_NOT_NULL(np = NODE_CDR(np)));
3993 }
3994 return 0;
3995 }
3996
3997 static int
tune_look_behind(Node * node,regex_t * reg,ScanEnv * env)3998 tune_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3999 {
4000 int r, len;
4001 AnchorNode* an = ANCHOR_(node);
4002
4003 r = get_char_len_node(NODE_ANCHOR_BODY(an), reg, &len);
4004 if (r == 0)
4005 an->char_len = len;
4006 else if (r == GET_CHAR_LEN_VARLEN)
4007 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4008 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
4009 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
4010 r = divide_look_behind_alternatives(node);
4011 else
4012 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4013 }
4014
4015 return r;
4016 }
4017
4018 static int
tune_next(Node * node,Node * next_node,regex_t * reg)4019 tune_next(Node* node, Node* next_node, regex_t* reg)
4020 {
4021 NodeType type;
4022
4023 retry:
4024 type = NODE_TYPE(node);
4025 if (type == NODE_QUANT) {
4026 QuantNode* qn = QUANT_(node);
4027 if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) {
4028 #ifdef USE_QUANT_PEEK_NEXT
4029 Node* n = get_head_value_node(next_node, 1, reg);
4030 /* '\0': for UTF-16BE etc... */
4031 if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') {
4032 qn->next_head_exact = n;
4033 }
4034 #endif
4035 /* automatic posseivation a*b ==> (?>a*)b */
4036 if (qn->lower <= 1) {
4037 if (is_strict_real_node(NODE_BODY(node))) {
4038 Node *x, *y;
4039 x = get_head_value_node(NODE_BODY(node), 0, reg);
4040 if (IS_NOT_NULL(x)) {
4041 y = get_head_value_node(next_node, 0, reg);
4042 if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) {
4043 Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK);
4044 CHECK_NULL_RETURN_MEMERR(en);
4045 NODE_STATUS_ADD(en, STRICT_REAL_REPEAT);
4046 node_swap(node, en);
4047 NODE_BODY(node) = en;
4048 }
4049 }
4050 }
4051 }
4052 }
4053 }
4054 else if (type == NODE_BAG) {
4055 BagNode* en = BAG_(node);
4056 if (en->type == BAG_MEMORY) {
4057 node = NODE_BODY(node);
4058 goto retry;
4059 }
4060 }
4061 return 0;
4062 }
4063
4064
4065 static int
is_all_code_len_1_items(int n,OnigCaseFoldCodeItem items[])4066 is_all_code_len_1_items(int n, OnigCaseFoldCodeItem items[])
4067 {
4068 int i;
4069
4070 for (i = 0; i < n; i++) {
4071 OnigCaseFoldCodeItem* item = items + i;
4072 if (item->code_len != 1) return 0;
4073 }
4074
4075 return 1;
4076 }
4077
4078 static int
get_min_max_byte_len_case_fold_items(int n,OnigCaseFoldCodeItem items[],int * rmin,int * rmax)4079 get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[], int* rmin, int* rmax)
4080 {
4081 int i, len, minlen, maxlen;
4082
4083 minlen = INT_MAX;
4084 maxlen = 0;
4085 for (i = 0; i < n; i++) {
4086 OnigCaseFoldCodeItem* item = items + i;
4087
4088 len = item->byte_len;
4089 if (len < minlen) minlen = len;
4090 if (len > maxlen) maxlen = len;
4091 }
4092
4093 *rmin = minlen;
4094 *rmax = maxlen;
4095 return 0;
4096 }
4097
4098 static int
conv_string_case_fold(OnigEncoding enc,OnigCaseFoldType case_fold_flag,UChar * s,UChar * end,UChar ** rs,UChar ** rend,int * rcase_min_len)4099 conv_string_case_fold(OnigEncoding enc, OnigCaseFoldType case_fold_flag,
4100 UChar* s, UChar* end, UChar** rs, UChar** rend, int* rcase_min_len)
4101 {
4102 UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
4103 UChar *sbuf, *ebuf, *sp;
4104 int i, n, len, sbuf_size;
4105
4106 *rs = NULL;
4107 sbuf_size = (int )(end - s) * 2;
4108 sbuf = (UChar* )xmalloc(sbuf_size);
4109 CHECK_NULL_RETURN_MEMERR(sbuf);
4110 ebuf = sbuf + sbuf_size;
4111
4112 n = 0;
4113 sp = sbuf;
4114 p = s;
4115 while (p < end) {
4116 len = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, buf);
4117 for (i = 0; i < len; i++) {
4118 if (sp >= ebuf) {
4119 sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2);
4120 CHECK_NULL_RETURN_MEMERR(sbuf);
4121 sp = sbuf + sbuf_size;
4122 sbuf_size *= 2;
4123 ebuf = sbuf + sbuf_size;
4124 }
4125
4126 *sp++ = buf[i];
4127 }
4128 n++;
4129 }
4130
4131 *rs = sbuf;
4132 *rend = sp;
4133 *rcase_min_len = n;
4134 return 0;
4135 }
4136
4137 static int
make_code_list_to_string(Node ** rnode,OnigEncoding enc,int n,OnigCodePoint codes[])4138 make_code_list_to_string(Node** rnode, OnigEncoding enc,
4139 int n, OnigCodePoint codes[])
4140 {
4141 int r, i, len;
4142 Node* node;
4143 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4144
4145 *rnode = NULL_NODE;
4146 node = onig_node_new_str(NULL, NULL);
4147 CHECK_NULL_RETURN_MEMERR(node);
4148
4149 for (i = 0; i < n; i++) {
4150 len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf);
4151 if (len < 0) {
4152 r = len;
4153 goto err;
4154 }
4155
4156 r = onig_node_str_cat(node, buf, buf + len);
4157 if (r != 0) goto err;
4158 }
4159
4160 *rnode = node;
4161 return 0;
4162
4163 err:
4164 onig_node_free(node);
4165 return r;
4166 }
4167
4168 static int
unravel_cf_node_add(Node ** rlist,Node * add)4169 unravel_cf_node_add(Node** rlist, Node* add)
4170 {
4171 Node *list;
4172
4173 list = *rlist;
4174 if (IS_NULL(list)) {
4175 list = onig_node_new_list(add, NULL);
4176 CHECK_NULL_RETURN_MEMERR(list);
4177 *rlist = list;
4178 }
4179 else {
4180 Node* r = node_list_add(list, add);
4181 CHECK_NULL_RETURN_MEMERR(r);
4182 }
4183
4184 return 0;
4185 }
4186
4187 static int
unravel_cf_string_add(Node ** rlist,Node ** rsn,UChar * s,UChar * end,unsigned int flag,int case_min_len)4188 unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end,
4189 unsigned int flag, int case_min_len)
4190 {
4191 int r;
4192 Node *sn, *list;
4193
4194 list = *rlist;
4195 sn = *rsn;
4196
4197 if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) {
4198 if (NODE_STRING_IS_CASE_FOLD_MATCH(sn))
4199 r = node_str_cat_case_fold(sn, s, end, case_min_len);
4200 else
4201 r = onig_node_str_cat(sn, s, end);
4202 }
4203 else {
4204 sn = onig_node_new_str(s, end);
4205 CHECK_NULL_RETURN_MEMERR(sn);
4206
4207 STR_(sn)->flag = flag;
4208 STR_(sn)->case_min_len = case_min_len;
4209 r = unravel_cf_node_add(&list, sn);
4210 }
4211
4212 if (r == 0) {
4213 *rlist = list;
4214 *rsn = sn;
4215 }
4216 return r;
4217 }
4218
4219 static int
unravel_cf_string_fold_add(Node ** rlist,Node ** rsn,OnigEncoding enc,OnigCaseFoldType case_fold_flag,UChar * s,UChar * end)4220 unravel_cf_string_fold_add(Node** rlist, Node** rsn, OnigEncoding enc,
4221 OnigCaseFoldType case_fold_flag, UChar* s, UChar* end)
4222 {
4223 int r;
4224 int case_min_len;
4225 UChar *rs, *rend;
4226
4227 r = conv_string_case_fold(enc, case_fold_flag, s, end,
4228 &rs, &rend, &case_min_len);
4229 if (r != 0) return r;
4230
4231 r = unravel_cf_string_add(rlist, rsn, rs, rend,
4232 NODE_STRING_CASE_FOLD_MATCH, case_min_len);
4233 xfree(rs);
4234
4235 return r;
4236 }
4237
4238 static int
unravel_cf_string_alt_or_cc_add(Node ** rlist,int n,OnigCaseFoldCodeItem items[],int byte_len,OnigEncoding enc,OnigCaseFoldType case_fold_flag,UChar * s,UChar * end)4239 unravel_cf_string_alt_or_cc_add(Node** rlist, int n,
4240 OnigCaseFoldCodeItem items[], int byte_len, OnigEncoding enc,
4241 OnigCaseFoldType case_fold_flag, UChar* s, UChar* end)
4242 {
4243 int r, i;
4244 Node* node;
4245
4246 if (is_all_code_len_1_items(n, items)) {
4247 OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */
4248
4249 codes[0] = ONIGENC_MBC_TO_CODE(enc, s, end);
4250 for (i = 0; i < n; i++) {
4251 OnigCaseFoldCodeItem* item = items + i;
4252 codes[i+1] = item->code[0];
4253 }
4254 r = onig_new_cclass_with_code_list(&node, enc, n + 1, codes);
4255 if (r != 0) return r;
4256 }
4257 else {
4258 Node *snode, *alt, *curr;
4259
4260 snode = onig_node_new_str(s, end);
4261 CHECK_NULL_RETURN_MEMERR(snode);
4262 node = curr = onig_node_new_alt(snode, NULL_NODE);
4263 if (IS_NULL(curr)) {
4264 onig_node_free(snode);
4265 return ONIGERR_MEMORY;
4266 }
4267
4268 r = 0;
4269 for (i = 0; i < n; i++) {
4270 OnigCaseFoldCodeItem* item = items + i;
4271 r = make_code_list_to_string(&snode, enc, item->code_len, item->code);
4272 if (r != 0) {
4273 onig_node_free(node);
4274 return r;
4275 }
4276
4277 alt = onig_node_new_alt(snode, NULL_NODE);
4278 if (IS_NULL(alt)) {
4279 onig_node_free(snode);
4280 onig_node_free(node);
4281 return ONIGERR_MEMORY;
4282 }
4283
4284 NODE_CDR(curr) = alt;
4285 curr = alt;
4286 }
4287 }
4288
4289 r = unravel_cf_node_add(rlist, node);
4290 if (r != 0) onig_node_free(node);
4291 return r;
4292 }
4293
4294 static int
unravel_cf_look_behind_add(Node ** rlist,Node ** rsn,int n,OnigCaseFoldCodeItem items[],OnigEncoding enc,UChar * s,int one_len)4295 unravel_cf_look_behind_add(Node** rlist, Node** rsn,
4296 int n, OnigCaseFoldCodeItem items[], OnigEncoding enc,
4297 UChar* s, int one_len)
4298 {
4299 int r, i, found;
4300
4301 found = 0;
4302 for (i = 0; i < n; i++) {
4303 OnigCaseFoldCodeItem* item = items + i;
4304 if (item->byte_len == one_len) {
4305 if (item->code_len == 1) {
4306 found = 1;
4307 }
4308 }
4309 }
4310
4311 if (found == 0) {
4312 r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */, 0);
4313 }
4314 else {
4315 Node* node;
4316 OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */
4317
4318 found = 0;
4319 codes[found++] = ONIGENC_MBC_TO_CODE(enc, s, s + one_len);
4320 for (i = 0; i < n; i++) {
4321 OnigCaseFoldCodeItem* item = items + i;
4322 if (item->byte_len == one_len) {
4323 if (item->code_len == 1) {
4324 codes[found++] = item->code[0];
4325 }
4326 }
4327 }
4328 r = onig_new_cclass_with_code_list(&node, enc, found, codes);
4329 if (r != 0) return r;
4330
4331 r = unravel_cf_node_add(rlist, node);
4332 if (r != 0) onig_node_free(node);
4333
4334 *rsn = NULL_NODE;
4335 }
4336
4337 return r;
4338 }
4339
4340 static int
unravel_case_fold_string(Node * node,regex_t * reg,int state)4341 unravel_case_fold_string(Node* node, regex_t* reg, int state)
4342 {
4343 int r, n, one_len, min_len, max_len, in_look_behind;
4344 UChar *start, *end, *p, *q;
4345 StrNode* snode;
4346 Node *sn, *list;
4347 OnigEncoding enc;
4348 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4349
4350 if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0;
4351
4352 snode = STR_(node);
4353
4354 start = snode->s;
4355 end = snode->end;
4356 if (start >= end) return 0;
4357
4358 in_look_behind = (state & IN_LOOK_BEHIND) != 0;
4359 enc = reg->enc;
4360
4361 list = sn = NULL_NODE;
4362 p = start;
4363 while (p < end) {
4364 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end,
4365 items);
4366 if (n < 0) {
4367 r = n;
4368 goto err;
4369 }
4370
4371 one_len = enclen(enc, p);
4372 if (n == 0) {
4373 q = p + one_len;
4374 r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */, 0);
4375 if (r != 0) goto err;
4376 }
4377 else {
4378 if (in_look_behind != 0) {
4379 q = p + one_len;
4380 r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len);
4381 if (r != 0) goto err;
4382 }
4383 else {
4384 get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len);
4385 q = p + max_len;
4386 if (one_len == max_len && min_len == max_len) {
4387 r = unravel_cf_string_alt_or_cc_add(&list, n, items, max_len, enc,
4388 reg->case_fold_flag, p, q);
4389 if (r != 0) goto err;
4390 sn = NULL_NODE;
4391 }
4392 else {
4393 r = unravel_cf_string_fold_add(&list, &sn, enc, reg->case_fold_flag,
4394 p, q);
4395 if (r != 0) goto err;
4396 }
4397 }
4398 }
4399
4400 p = q;
4401 }
4402
4403 if (IS_NOT_NULL(list)) {
4404 if (node_list_len(list) == 1) {
4405 node_swap(node, NODE_CAR(list));
4406 }
4407 else {
4408 node_swap(node, list);
4409 }
4410 onig_node_free(list);
4411 }
4412 else {
4413 node_swap(node, sn);
4414 onig_node_free(sn);
4415 }
4416 return 0;
4417
4418 err:
4419 if (IS_NOT_NULL(list))
4420 onig_node_free(list);
4421 else if (IS_NOT_NULL(sn))
4422 onig_node_free(sn);
4423
4424 return r;
4425 }
4426
4427 #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
4428 static enum BodyEmptyType
quantifiers_memory_node_info(Node * node)4429 quantifiers_memory_node_info(Node* node)
4430 {
4431 int r = BODY_IS_EMPTY_POSSIBILITY;
4432
4433 switch (NODE_TYPE(node)) {
4434 case NODE_LIST:
4435 case NODE_ALT:
4436 {
4437 int v;
4438 do {
4439 v = quantifiers_memory_node_info(NODE_CAR(node));
4440 if (v > r) r = v;
4441 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4442 }
4443 break;
4444
4445 #ifdef USE_CALL
4446 case NODE_CALL:
4447 if (NODE_IS_RECURSION(node)) {
4448 return BODY_IS_EMPTY_POSSIBILITY_REC; /* tiny version */
4449 }
4450 else
4451 r = quantifiers_memory_node_info(NODE_BODY(node));
4452 break;
4453 #endif
4454
4455 case NODE_QUANT:
4456 {
4457 QuantNode* qn = QUANT_(node);
4458 if (qn->upper != 0) {
4459 r = quantifiers_memory_node_info(NODE_BODY(node));
4460 }
4461 }
4462 break;
4463
4464 case NODE_BAG:
4465 {
4466 BagNode* en = BAG_(node);
4467 switch (en->type) {
4468 case BAG_MEMORY:
4469 if (NODE_IS_RECURSION(node)) {
4470 return BODY_IS_EMPTY_POSSIBILITY_REC;
4471 }
4472 return BODY_IS_EMPTY_POSSIBILITY_MEM;
4473 break;
4474
4475 case BAG_OPTION:
4476 case BAG_STOP_BACKTRACK:
4477 r = quantifiers_memory_node_info(NODE_BODY(node));
4478 break;
4479 case BAG_IF_ELSE:
4480 {
4481 int v;
4482 r = quantifiers_memory_node_info(NODE_BODY(node));
4483 if (IS_NOT_NULL(en->te.Then)) {
4484 v = quantifiers_memory_node_info(en->te.Then);
4485 if (v > r) r = v;
4486 }
4487 if (IS_NOT_NULL(en->te.Else)) {
4488 v = quantifiers_memory_node_info(en->te.Else);
4489 if (v > r) r = v;
4490 }
4491 }
4492 break;
4493 }
4494 }
4495 break;
4496
4497 case NODE_BACKREF:
4498 case NODE_STRING:
4499 case NODE_CTYPE:
4500 case NODE_CCLASS:
4501 case NODE_ANCHOR:
4502 case NODE_GIMMICK:
4503 default:
4504 break;
4505 }
4506
4507 return r;
4508 }
4509 #endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */
4510
4511
4512 #ifdef USE_CALL
4513
4514 #ifdef __GNUC__
4515 __inline
4516 #endif
4517 static int
tune_call_node_call(CallNode * cn,ScanEnv * env,int state)4518 tune_call_node_call(CallNode* cn, ScanEnv* env, int state)
4519 {
4520 MemEnv* mem_env = SCANENV_MEMENV(env);
4521
4522 if (cn->by_number != 0) {
4523 int gnum = cn->group_num;
4524
4525 if (env->num_named > 0 &&
4526 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
4527 ! ONIG_IS_OPTION_ON(env->options, ONIG_OPTION_CAPTURE_GROUP)) {
4528 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
4529 }
4530
4531 if (gnum > env->num_mem) {
4532 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_GROUP_REFERENCE,
4533 cn->name, cn->name_end);
4534 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
4535 }
4536
4537 set_call_attr:
4538 NODE_CALL_BODY(cn) = mem_env[cn->group_num].mem_node;
4539 if (IS_NULL(NODE_CALL_BODY(cn))) {
4540 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
4541 cn->name, cn->name_end);
4542 return ONIGERR_UNDEFINED_NAME_REFERENCE;
4543 }
4544 }
4545 else {
4546 int *refs;
4547
4548 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs);
4549 if (n <= 0) {
4550 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
4551 cn->name, cn->name_end);
4552 return ONIGERR_UNDEFINED_NAME_REFERENCE;
4553 }
4554 else if (n > 1) {
4555 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL,
4556 cn->name, cn->name_end);
4557 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
4558 }
4559 else {
4560 cn->group_num = refs[0];
4561 goto set_call_attr;
4562 }
4563 }
4564
4565 return 0;
4566 }
4567
4568 static void
tune_call2_call(Node * node)4569 tune_call2_call(Node* node)
4570 {
4571 switch (NODE_TYPE(node)) {
4572 case NODE_LIST:
4573 case NODE_ALT:
4574 do {
4575 tune_call2_call(NODE_CAR(node));
4576 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4577 break;
4578
4579 case NODE_QUANT:
4580 tune_call2_call(NODE_BODY(node));
4581 break;
4582
4583 case NODE_ANCHOR:
4584 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
4585 tune_call2_call(NODE_BODY(node));
4586 break;
4587
4588 case NODE_BAG:
4589 {
4590 BagNode* en = BAG_(node);
4591
4592 if (en->type == BAG_MEMORY) {
4593 if (! NODE_IS_MARK1(node)) {
4594 NODE_STATUS_ADD(node, MARK1);
4595 tune_call2_call(NODE_BODY(node));
4596 NODE_STATUS_REMOVE(node, MARK1);
4597 }
4598 }
4599 else if (en->type == BAG_IF_ELSE) {
4600 tune_call2_call(NODE_BODY(node));
4601 if (IS_NOT_NULL(en->te.Then))
4602 tune_call2_call(en->te.Then);
4603 if (IS_NOT_NULL(en->te.Else))
4604 tune_call2_call(en->te.Else);
4605 }
4606 else {
4607 tune_call2_call(NODE_BODY(node));
4608 }
4609 }
4610 break;
4611
4612 case NODE_CALL:
4613 if (! NODE_IS_MARK1(node)) {
4614 NODE_STATUS_ADD(node, MARK1);
4615 {
4616 CallNode* cn = CALL_(node);
4617 Node* called = NODE_CALL_BODY(cn);
4618
4619 cn->entry_count++;
4620
4621 NODE_STATUS_ADD(called, CALLED);
4622 BAG_(called)->m.entry_count++;
4623 tune_call2_call(called);
4624 }
4625 NODE_STATUS_REMOVE(node, MARK1);
4626 }
4627 break;
4628
4629 default:
4630 break;
4631 }
4632 }
4633
4634 static int
tune_call(Node * node,ScanEnv * env,int state)4635 tune_call(Node* node, ScanEnv* env, int state)
4636 {
4637 int r;
4638
4639 switch (NODE_TYPE(node)) {
4640 case NODE_LIST:
4641 case NODE_ALT:
4642 do {
4643 r = tune_call(NODE_CAR(node), env, state);
4644 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4645 break;
4646
4647 case NODE_QUANT:
4648 if (QUANT_(node)->upper == 0)
4649 state |= IN_ZERO_REPEAT;
4650
4651 r = tune_call(NODE_BODY(node), env, state);
4652 break;
4653
4654 case NODE_ANCHOR:
4655 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
4656 r = tune_call(NODE_BODY(node), env, state);
4657 else
4658 r = 0;
4659 break;
4660
4661 case NODE_BAG:
4662 {
4663 BagNode* en = BAG_(node);
4664
4665 if (en->type == BAG_MEMORY) {
4666 if ((state & IN_ZERO_REPEAT) != 0) {
4667 NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
4668 BAG_(node)->m.entry_count--;
4669 }
4670 r = tune_call(NODE_BODY(node), env, state);
4671 }
4672 else if (en->type == BAG_IF_ELSE) {
4673 r = tune_call(NODE_BODY(node), env, state);
4674 if (r != 0) return r;
4675 if (IS_NOT_NULL(en->te.Then)) {
4676 r = tune_call(en->te.Then, env, state);
4677 if (r != 0) return r;
4678 }
4679 if (IS_NOT_NULL(en->te.Else))
4680 r = tune_call(en->te.Else, env, state);
4681 }
4682 else
4683 r = tune_call(NODE_BODY(node), env, state);
4684 }
4685 break;
4686
4687 case NODE_CALL:
4688 if ((state & IN_ZERO_REPEAT) != 0) {
4689 NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
4690 CALL_(node)->entry_count--;
4691 }
4692
4693 r = tune_call_node_call(CALL_(node), env, state);
4694 break;
4695
4696 default:
4697 r = 0;
4698 break;
4699 }
4700
4701 return r;
4702 }
4703
4704 static int
tune_call2(Node * node)4705 tune_call2(Node* node)
4706 {
4707 int r = 0;
4708
4709 switch (NODE_TYPE(node)) {
4710 case NODE_LIST:
4711 case NODE_ALT:
4712 do {
4713 r = tune_call2(NODE_CAR(node));
4714 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4715 break;
4716
4717 case NODE_QUANT:
4718 if (QUANT_(node)->upper != 0)
4719 r = tune_call2(NODE_BODY(node));
4720 break;
4721
4722 case NODE_ANCHOR:
4723 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
4724 r = tune_call2(NODE_BODY(node));
4725 break;
4726
4727 case NODE_BAG:
4728 if (! NODE_IS_IN_ZERO_REPEAT(node))
4729 r = tune_call2(NODE_BODY(node));
4730
4731 {
4732 BagNode* en = BAG_(node);
4733
4734 if (r != 0) return r;
4735 if (en->type == BAG_IF_ELSE) {
4736 if (IS_NOT_NULL(en->te.Then)) {
4737 r = tune_call2(en->te.Then);
4738 if (r != 0) return r;
4739 }
4740 if (IS_NOT_NULL(en->te.Else))
4741 r = tune_call2(en->te.Else);
4742 }
4743 }
4744 break;
4745
4746 case NODE_CALL:
4747 if (! NODE_IS_IN_ZERO_REPEAT(node)) {
4748 tune_call2_call(node);
4749 }
4750 break;
4751
4752 default:
4753 break;
4754 }
4755
4756 return r;
4757 }
4758
4759
4760 static void
tune_called_state_call(Node * node,int state)4761 tune_called_state_call(Node* node, int state)
4762 {
4763 switch (NODE_TYPE(node)) {
4764 case NODE_ALT:
4765 state |= IN_ALT;
4766 /* fall */
4767 case NODE_LIST:
4768 do {
4769 tune_called_state_call(NODE_CAR(node), state);
4770 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4771 break;
4772
4773 case NODE_QUANT:
4774 {
4775 QuantNode* qn = QUANT_(node);
4776
4777 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
4778 state |= IN_REAL_REPEAT;
4779 if (qn->lower != qn->upper)
4780 state |= IN_VAR_REPEAT;
4781
4782 tune_called_state_call(NODE_QUANT_BODY(qn), state);
4783 }
4784 break;
4785
4786 case NODE_ANCHOR:
4787 {
4788 AnchorNode* an = ANCHOR_(node);
4789
4790 switch (an->type) {
4791 case ANCR_PREC_READ_NOT:
4792 case ANCR_LOOK_BEHIND_NOT:
4793 state |= IN_NOT;
4794 /* fall */
4795 case ANCR_PREC_READ:
4796 case ANCR_LOOK_BEHIND:
4797 tune_called_state_call(NODE_ANCHOR_BODY(an), state);
4798 break;
4799 default:
4800 break;
4801 }
4802 }
4803 break;
4804
4805 case NODE_BAG:
4806 {
4807 BagNode* en = BAG_(node);
4808
4809 if (en->type == BAG_MEMORY) {
4810 if (NODE_IS_MARK1(node)) {
4811 if ((~en->m.called_state & state) != 0) {
4812 en->m.called_state |= state;
4813 tune_called_state_call(NODE_BODY(node), state);
4814 }
4815 }
4816 else {
4817 NODE_STATUS_ADD(node, MARK1);
4818 en->m.called_state |= state;
4819 tune_called_state_call(NODE_BODY(node), state);
4820 NODE_STATUS_REMOVE(node, MARK1);
4821 }
4822 }
4823 else if (en->type == BAG_IF_ELSE) {
4824 state |= IN_ALT;
4825 tune_called_state_call(NODE_BODY(node), state);
4826 if (IS_NOT_NULL(en->te.Then)) {
4827 tune_called_state_call(en->te.Then, state);
4828 }
4829 if (IS_NOT_NULL(en->te.Else))
4830 tune_called_state_call(en->te.Else, state);
4831 }
4832 else {
4833 tune_called_state_call(NODE_BODY(node), state);
4834 }
4835 }
4836 break;
4837
4838 case NODE_CALL:
4839 tune_called_state_call(NODE_BODY(node), state);
4840 break;
4841
4842 default:
4843 break;
4844 }
4845 }
4846
4847 static void
tune_called_state(Node * node,int state)4848 tune_called_state(Node* node, int state)
4849 {
4850 switch (NODE_TYPE(node)) {
4851 case NODE_ALT:
4852 state |= IN_ALT;
4853 /* fall */
4854 case NODE_LIST:
4855 do {
4856 tune_called_state(NODE_CAR(node), state);
4857 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4858 break;
4859
4860 #ifdef USE_CALL
4861 case NODE_CALL:
4862 tune_called_state_call(node, state);
4863 break;
4864 #endif
4865
4866 case NODE_BAG:
4867 {
4868 BagNode* en = BAG_(node);
4869
4870 switch (en->type) {
4871 case BAG_MEMORY:
4872 if (en->m.entry_count > 1)
4873 state |= IN_MULTI_ENTRY;
4874
4875 en->m.called_state |= state;
4876 /* fall */
4877 case BAG_OPTION:
4878 case BAG_STOP_BACKTRACK:
4879 tune_called_state(NODE_BODY(node), state);
4880 break;
4881 case BAG_IF_ELSE:
4882 state |= IN_ALT;
4883 tune_called_state(NODE_BODY(node), state);
4884 if (IS_NOT_NULL(en->te.Then))
4885 tune_called_state(en->te.Then, state);
4886 if (IS_NOT_NULL(en->te.Else))
4887 tune_called_state(en->te.Else, state);
4888 break;
4889 }
4890 }
4891 break;
4892
4893 case NODE_QUANT:
4894 {
4895 QuantNode* qn = QUANT_(node);
4896
4897 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
4898 state |= IN_REAL_REPEAT;
4899 if (qn->lower != qn->upper)
4900 state |= IN_VAR_REPEAT;
4901
4902 tune_called_state(NODE_QUANT_BODY(qn), state);
4903 }
4904 break;
4905
4906 case NODE_ANCHOR:
4907 {
4908 AnchorNode* an = ANCHOR_(node);
4909
4910 switch (an->type) {
4911 case ANCR_PREC_READ_NOT:
4912 case ANCR_LOOK_BEHIND_NOT:
4913 state |= IN_NOT;
4914 /* fall */
4915 case ANCR_PREC_READ:
4916 case ANCR_LOOK_BEHIND:
4917 tune_called_state(NODE_ANCHOR_BODY(an), state);
4918 break;
4919 default:
4920 break;
4921 }
4922 }
4923 break;
4924
4925 case NODE_BACKREF:
4926 case NODE_STRING:
4927 case NODE_CTYPE:
4928 case NODE_CCLASS:
4929 case NODE_GIMMICK:
4930 default:
4931 break;
4932 }
4933 }
4934
4935 #endif /* USE_CALL */
4936
4937
4938 static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env);
4939
4940 #ifdef __GNUC__
4941 __inline
4942 #endif
4943 static int
tune_anchor(Node * node,regex_t * reg,int state,ScanEnv * env)4944 tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env)
4945 {
4946 /* allowed node types in look-behind */
4947 #define ALLOWED_TYPE_IN_LB \
4948 ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \
4949 | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \
4950 | NODE_BIT_CALL | NODE_BIT_GIMMICK)
4951
4952 #define ALLOWED_BAG_IN_LB ( 1<<BAG_MEMORY | 1<<BAG_OPTION | 1<<BAG_IF_ELSE )
4953 #define ALLOWED_BAG_IN_LB_NOT ( 1<<BAG_OPTION | 1<<BAG_IF_ELSE )
4954
4955 #define ALLOWED_ANCHOR_IN_LB \
4956 ( ANCR_LOOK_BEHIND | ANCR_BEGIN_LINE | ANCR_END_LINE | ANCR_BEGIN_BUF \
4957 | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY | ANCR_NO_WORD_BOUNDARY \
4958 | ANCR_WORD_BEGIN | ANCR_WORD_END \
4959 | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
4960
4961 #define ALLOWED_ANCHOR_IN_LB_NOT \
4962 ( ANCR_LOOK_BEHIND | ANCR_LOOK_BEHIND_NOT | ANCR_BEGIN_LINE \
4963 | ANCR_END_LINE | ANCR_BEGIN_BUF | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY \
4964 | ANCR_NO_WORD_BOUNDARY | ANCR_WORD_BEGIN | ANCR_WORD_END \
4965 | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
4966
4967 int r;
4968 AnchorNode* an = ANCHOR_(node);
4969
4970 switch (an->type) {
4971 case ANCR_PREC_READ:
4972 r = tune_tree(NODE_ANCHOR_BODY(an), reg, state, env);
4973 break;
4974 case ANCR_PREC_READ_NOT:
4975 r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env);
4976 break;
4977
4978 case ANCR_LOOK_BEHIND:
4979 {
4980 r = check_type_tree(NODE_ANCHOR_BODY(an), ALLOWED_TYPE_IN_LB,
4981 ALLOWED_BAG_IN_LB, ALLOWED_ANCHOR_IN_LB);
4982 if (r < 0) return r;
4983 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4984 r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env);
4985 if (r != 0) return r;
4986 r = tune_look_behind(node, reg, env);
4987 }
4988 break;
4989
4990 case ANCR_LOOK_BEHIND_NOT:
4991 {
4992 r = check_type_tree(NODE_ANCHOR_BODY(an), ALLOWED_TYPE_IN_LB,
4993 ALLOWED_BAG_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
4994 if (r < 0) return r;
4995 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4996 r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND),
4997 env);
4998 if (r != 0) return r;
4999 r = tune_look_behind(node, reg, env);
5000 }
5001 break;
5002
5003 default:
5004 r = 0;
5005 break;
5006 }
5007
5008 return r;
5009 }
5010
5011 #ifdef __GNUC__
5012 __inline
5013 #endif
5014 static int
tune_quant(Node * node,regex_t * reg,int state,ScanEnv * env)5015 tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env)
5016 {
5017 int r;
5018 OnigLen d;
5019 QuantNode* qn = QUANT_(node);
5020 Node* body = NODE_BODY(node);
5021
5022 if ((state & IN_REAL_REPEAT) != 0) {
5023 NODE_STATUS_ADD(node, IN_REAL_REPEAT);
5024 }
5025 if ((state & IN_MULTI_ENTRY) != 0) {
5026 NODE_STATUS_ADD(node, IN_MULTI_ENTRY);
5027 }
5028
5029 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) {
5030 d = tree_min_len(body, env);
5031 if (d == 0) {
5032 #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
5033 qn->emptiness = quantifiers_memory_node_info(body);
5034 #else
5035 qn->emptiness = BODY_IS_EMPTY_POSSIBILITY;
5036 #endif
5037 }
5038 }
5039
5040 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5041 state |= IN_REAL_REPEAT;
5042 if (qn->lower != qn->upper)
5043 state |= IN_VAR_REPEAT;
5044
5045 r = tune_tree(body, reg, state, env);
5046 if (r != 0) return r;
5047
5048 /* expand string */
5049 #define EXPAND_STRING_MAX_LENGTH 100
5050 if (NODE_TYPE(body) == NODE_STRING) {
5051 if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper &&
5052 qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
5053 int len = NODE_STRING_LEN(body);
5054
5055 if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
5056 int i, n = qn->lower;
5057 node_conv_to_str_node(node, STR_(body)->flag);
5058 for (i = 0; i < n; i++) {
5059 r = node_str_node_cat(node, body);
5060 if (r != 0) return r;
5061 }
5062 onig_node_free(body);
5063 return r;
5064 }
5065 }
5066 }
5067
5068 if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) {
5069 if (NODE_TYPE(body) == NODE_QUANT) {
5070 QuantNode* tqn = QUANT_(body);
5071 if (IS_NOT_NULL(tqn->head_exact)) {
5072 qn->head_exact = tqn->head_exact;
5073 tqn->head_exact = NULL;
5074 }
5075 }
5076 else {
5077 qn->head_exact = get_head_value_node(NODE_BODY(node), 1, reg);
5078 }
5079 }
5080
5081 return r;
5082 }
5083
5084 /* tune_tree does the following work.
5085 1. check empty loop. (set qn->emptiness)
5086 2. expand ignore-case in char class.
5087 3. set memory status bit flags. (reg->mem_stats)
5088 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
5089 5. find invalid patterns in look-behind.
5090 6. expand repeated string.
5091 */
5092 static int
tune_tree(Node * node,regex_t * reg,int state,ScanEnv * env)5093 tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
5094 {
5095 int r = 0;
5096
5097 switch (NODE_TYPE(node)) {
5098 case NODE_LIST:
5099 {
5100 Node* prev = NULL_NODE;
5101 do {
5102 r = tune_tree(NODE_CAR(node), reg, state, env);
5103 if (IS_NOT_NULL(prev) && r == 0) {
5104 r = tune_next(prev, NODE_CAR(node), reg);
5105 }
5106 prev = NODE_CAR(node);
5107 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5108 }
5109 break;
5110
5111 case NODE_ALT:
5112 do {
5113 r = tune_tree(NODE_CAR(node), reg, (state | IN_ALT), env);
5114 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5115 break;
5116
5117 case NODE_STRING:
5118 if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_CRUDE(node)) {
5119 r = unravel_case_fold_string(node, reg, state);
5120 }
5121 break;
5122
5123 case NODE_BACKREF:
5124 {
5125 int i;
5126 int* p;
5127 BackRefNode* br = BACKREF_(node);
5128 p = BACKREFS_P(br);
5129 for (i = 0; i < br->back_num; i++) {
5130 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
5131 MEM_STATUS_ON(env->backrefed_mem, p[i]);
5132 #if 0
5133 #ifdef USE_BACKREF_WITH_LEVEL
5134 if (NODE_IS_NEST_LEVEL(node)) {
5135 MEM_STATUS_ON(env->backtrack_mem, p[i]);
5136 }
5137 #endif
5138 #else
5139 /* More precisely, it should be checked whether alt/repeat exists before
5140 the subject capture node, and then this backreference position
5141 exists before (or in) the capture node. */
5142 MEM_STATUS_ON(env->backtrack_mem, p[i]);
5143 #endif
5144 }
5145 }
5146 break;
5147
5148 case NODE_BAG:
5149 {
5150 BagNode* en = BAG_(node);
5151
5152 switch (en->type) {
5153 case BAG_OPTION:
5154 {
5155 OnigOptionType options = reg->options;
5156 reg->options = BAG_(node)->o.options;
5157 r = tune_tree(NODE_BODY(node), reg, state, env);
5158 reg->options = options;
5159 }
5160 break;
5161
5162 case BAG_MEMORY:
5163 #ifdef USE_CALL
5164 state |= en->m.called_state;
5165 #endif
5166
5167 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_MULTI_ENTRY)) != 0
5168 || NODE_IS_RECURSION(node)) {
5169 MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
5170 }
5171 r = tune_tree(NODE_BODY(node), reg, state, env);
5172 break;
5173
5174 case BAG_STOP_BACKTRACK:
5175 {
5176 Node* target = NODE_BODY(node);
5177 r = tune_tree(target, reg, state, env);
5178 if (NODE_TYPE(target) == NODE_QUANT) {
5179 QuantNode* tqn = QUANT_(target);
5180 if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 &&
5181 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
5182 if (is_strict_real_node(NODE_BODY(target)))
5183 NODE_STATUS_ADD(node, STRICT_REAL_REPEAT);
5184 }
5185 }
5186 }
5187 break;
5188
5189 case BAG_IF_ELSE:
5190 r = tune_tree(NODE_BODY(node), reg, (state | IN_ALT), env);
5191 if (r != 0) return r;
5192 if (IS_NOT_NULL(en->te.Then)) {
5193 r = tune_tree(en->te.Then, reg, (state | IN_ALT), env);
5194 if (r != 0) return r;
5195 }
5196 if (IS_NOT_NULL(en->te.Else))
5197 r = tune_tree(en->te.Else, reg, (state | IN_ALT), env);
5198 break;
5199 }
5200 }
5201 break;
5202
5203 case NODE_QUANT:
5204 r = tune_quant(node, reg, state, env);
5205 break;
5206
5207 case NODE_ANCHOR:
5208 r = tune_anchor(node, reg, state, env);
5209 break;
5210
5211 #ifdef USE_CALL
5212 case NODE_CALL:
5213 #endif
5214 case NODE_CTYPE:
5215 case NODE_CCLASS:
5216 case NODE_GIMMICK:
5217 default:
5218 break;
5219 }
5220
5221 return r;
5222 }
5223
5224 static int
set_sunday_quick_search_or_bmh_skip_table(regex_t * reg,int case_expand,UChar * s,UChar * end,UChar skip[],int * roffset)5225 set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand,
5226 UChar* s, UChar* end,
5227 UChar skip[], int* roffset)
5228 {
5229 int i, j, k, len, offset;
5230 int n, clen;
5231 UChar* p;
5232 OnigEncoding enc;
5233 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
5234 UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
5235
5236 enc = reg->enc;
5237 offset = ENC_GET_SKIP_OFFSET(enc);
5238 if (offset == ENC_SKIP_OFFSET_1_OR_0) {
5239 UChar* p = s;
5240 while (1) {
5241 len = enclen(enc, p);
5242 if (p + len >= end) {
5243 if (len == 1) offset = 1;
5244 else offset = 0;
5245 break;
5246 }
5247 p += len;
5248 }
5249 }
5250
5251 len = (int )(end - s);
5252 if (len + offset >= UCHAR_MAX)
5253 return ONIGERR_PARSER_BUG;
5254
5255 *roffset = offset;
5256
5257 for (i = 0; i < CHAR_MAP_SIZE; i++) {
5258 skip[i] = (UChar )(len + offset);
5259 }
5260
5261 for (p = s; p < end; ) {
5262 int z;
5263
5264 clen = enclen(enc, p);
5265 if (p + clen > end) clen = (int )(end - p);
5266
5267 len = (int )(end - p);
5268 for (j = 0; j < clen; j++) {
5269 z = len - j + (offset - 1);
5270 if (z <= 0) break;
5271 skip[p[j]] = z;
5272 }
5273
5274 if (case_expand != 0) {
5275 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
5276 p, end, items);
5277 for (k = 0; k < n; k++) {
5278 ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf);
5279 for (j = 0; j < clen; j++) {
5280 z = len - j + (offset - 1);
5281 if (z <= 0) break;
5282 if (skip[buf[j]] > z)
5283 skip[buf[j]] = z;
5284 }
5285 }
5286 }
5287
5288 p += clen;
5289 }
5290
5291 return 0;
5292 }
5293
5294
5295 #define OPT_EXACT_MAXLEN 24
5296
5297 #if OPT_EXACT_MAXLEN >= UCHAR_MAX
5298 #error Too big OPT_EXACT_MAXLEN
5299 #endif
5300
5301 typedef struct {
5302 OnigLen min; /* min byte length */
5303 OnigLen max; /* max byte length */
5304 } MinMax;
5305
5306 typedef struct {
5307 MinMax mm;
5308 OnigEncoding enc;
5309 OnigOptionType options;
5310 OnigCaseFoldType case_fold_flag;
5311 ScanEnv* scan_env;
5312 } OptEnv;
5313
5314 typedef struct {
5315 int left;
5316 int right;
5317 } OptAnc;
5318
5319 typedef struct {
5320 MinMax mm; /* position */
5321 OptAnc anc;
5322 int reach_end;
5323 int case_fold;
5324 int len;
5325 UChar s[OPT_EXACT_MAXLEN];
5326 } OptStr;
5327
5328 typedef struct {
5329 MinMax mm; /* position */
5330 OptAnc anc;
5331 int value; /* weighted value */
5332 UChar map[CHAR_MAP_SIZE];
5333 } OptMap;
5334
5335 typedef struct {
5336 MinMax len;
5337 OptAnc anc;
5338 OptStr sb; /* boundary */
5339 OptStr sm; /* middle */
5340 OptStr spr; /* prec read (?=...) */
5341 OptMap map; /* boundary */
5342 } OptNode;
5343
5344
5345 static int
map_position_value(OnigEncoding enc,int i)5346 map_position_value(OnigEncoding enc, int i)
5347 {
5348 static const short int Vals[] = {
5349 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
5350 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5351 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
5352 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
5353 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
5354 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
5355 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
5356 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
5357 };
5358
5359 if (i < (int )(sizeof(Vals)/sizeof(Vals[0]))) {
5360 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
5361 return 20;
5362 else
5363 return (int )Vals[i];
5364 }
5365 else
5366 return 4; /* Take it easy. */
5367 }
5368
5369 static int
distance_value(MinMax * mm)5370 distance_value(MinMax* mm)
5371 {
5372 /* 1000 / (min-max-dist + 1) */
5373 static const short int dist_vals[] = {
5374 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
5375 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
5376 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
5377 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
5378 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
5379 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
5380 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
5381 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
5382 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
5383 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
5384 };
5385
5386 OnigLen d;
5387
5388 if (mm->max == INFINITE_LEN) return 0;
5389
5390 d = mm->max - mm->min;
5391 if (d < (OnigLen )(sizeof(dist_vals)/sizeof(dist_vals[0])))
5392 /* return dist_vals[d] * 16 / (mm->min + 12); */
5393 return (int )dist_vals[d];
5394 else
5395 return 1;
5396 }
5397
5398 static int
comp_distance_value(MinMax * d1,MinMax * d2,int v1,int v2)5399 comp_distance_value(MinMax* d1, MinMax* d2, int v1, int v2)
5400 {
5401 if (v2 <= 0) return -1;
5402 if (v1 <= 0) return 1;
5403
5404 v1 *= distance_value(d1);
5405 v2 *= distance_value(d2);
5406
5407 if (v2 > v1) return 1;
5408 if (v2 < v1) return -1;
5409
5410 if (d2->min < d1->min) return 1;
5411 if (d2->min > d1->min) return -1;
5412 return 0;
5413 }
5414
5415 static int
is_equal_mml(MinMax * a,MinMax * b)5416 is_equal_mml(MinMax* a, MinMax* b)
5417 {
5418 return a->min == b->min && a->max == b->max;
5419 }
5420
5421 static void
set_mml(MinMax * l,OnigLen min,OnigLen max)5422 set_mml(MinMax* l, OnigLen min, OnigLen max)
5423 {
5424 l->min = min;
5425 l->max = max;
5426 }
5427
5428 static void
clear_mml(MinMax * l)5429 clear_mml(MinMax* l)
5430 {
5431 l->min = l->max = 0;
5432 }
5433
5434 static void
copy_mml(MinMax * to,MinMax * from)5435 copy_mml(MinMax* to, MinMax* from)
5436 {
5437 to->min = from->min;
5438 to->max = from->max;
5439 }
5440
5441 static void
add_mml(MinMax * to,MinMax * from)5442 add_mml(MinMax* to, MinMax* from)
5443 {
5444 to->min = distance_add(to->min, from->min);
5445 to->max = distance_add(to->max, from->max);
5446 }
5447
5448 static void
alt_merge_mml(MinMax * to,MinMax * from)5449 alt_merge_mml(MinMax* to, MinMax* from)
5450 {
5451 if (to->min > from->min) to->min = from->min;
5452 if (to->max < from->max) to->max = from->max;
5453 }
5454
5455 static void
copy_opt_env(OptEnv * to,OptEnv * from)5456 copy_opt_env(OptEnv* to, OptEnv* from)
5457 {
5458 *to = *from;
5459 }
5460
5461 static void
clear_opt_anc_info(OptAnc * a)5462 clear_opt_anc_info(OptAnc* a)
5463 {
5464 a->left = 0;
5465 a->right = 0;
5466 }
5467
5468 static void
copy_opt_anc_info(OptAnc * to,OptAnc * from)5469 copy_opt_anc_info(OptAnc* to, OptAnc* from)
5470 {
5471 *to = *from;
5472 }
5473
5474 static void
concat_opt_anc_info(OptAnc * to,OptAnc * left,OptAnc * right,OnigLen left_len,OnigLen right_len)5475 concat_opt_anc_info(OptAnc* to, OptAnc* left, OptAnc* right,
5476 OnigLen left_len, OnigLen right_len)
5477 {
5478 clear_opt_anc_info(to);
5479
5480 to->left = left->left;
5481 if (left_len == 0) {
5482 to->left |= right->left;
5483 }
5484
5485 to->right = right->right;
5486 if (right_len == 0) {
5487 to->right |= left->right;
5488 }
5489 else {
5490 to->right |= (left->right & ANCR_PREC_READ_NOT);
5491 }
5492 }
5493
5494 static int
is_left(int a)5495 is_left(int a)
5496 {
5497 if (a == ANCR_END_BUF || a == ANCR_SEMI_END_BUF ||
5498 a == ANCR_END_LINE || a == ANCR_PREC_READ || a == ANCR_PREC_READ_NOT)
5499 return 0;
5500
5501 return 1;
5502 }
5503
5504 static int
is_set_opt_anc_info(OptAnc * to,int anc)5505 is_set_opt_anc_info(OptAnc* to, int anc)
5506 {
5507 if ((to->left & anc) != 0) return 1;
5508
5509 return ((to->right & anc) != 0 ? 1 : 0);
5510 }
5511
5512 static void
add_opt_anc_info(OptAnc * to,int anc)5513 add_opt_anc_info(OptAnc* to, int anc)
5514 {
5515 if (is_left(anc))
5516 to->left |= anc;
5517 else
5518 to->right |= anc;
5519 }
5520
5521 static void
remove_opt_anc_info(OptAnc * to,int anc)5522 remove_opt_anc_info(OptAnc* to, int anc)
5523 {
5524 if (is_left(anc))
5525 to->left &= ~anc;
5526 else
5527 to->right &= ~anc;
5528 }
5529
5530 static void
alt_merge_opt_anc_info(OptAnc * to,OptAnc * add)5531 alt_merge_opt_anc_info(OptAnc* to, OptAnc* add)
5532 {
5533 to->left &= add->left;
5534 to->right &= add->right;
5535 }
5536
5537 static int
is_full_opt_exact(OptStr * e)5538 is_full_opt_exact(OptStr* e)
5539 {
5540 return e->len >= OPT_EXACT_MAXLEN;
5541 }
5542
5543 static void
clear_opt_exact(OptStr * e)5544 clear_opt_exact(OptStr* e)
5545 {
5546 clear_mml(&e->mm);
5547 clear_opt_anc_info(&e->anc);
5548 e->reach_end = 0;
5549 e->case_fold = 0;
5550 e->len = 0;
5551 e->s[0] = '\0';
5552 }
5553
5554 static void
copy_opt_exact(OptStr * to,OptStr * from)5555 copy_opt_exact(OptStr* to, OptStr* from)
5556 {
5557 *to = *from;
5558 }
5559
5560 static int
concat_opt_exact(OptStr * to,OptStr * add,OnigEncoding enc)5561 concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc)
5562 {
5563 int i, j, len, r;
5564 UChar *p, *end;
5565 OptAnc tanc;
5566
5567 if (add->case_fold != 0) {
5568 if (! to->case_fold) {
5569 if (to->len > 1 || to->len >= add->len) return 0; /* avoid */
5570
5571 to->case_fold = 1;
5572 }
5573 }
5574
5575 r = 0;
5576 p = add->s;
5577 end = p + add->len;
5578 for (i = to->len; p < end; ) {
5579 len = enclen(enc, p);
5580 if (i + len > OPT_EXACT_MAXLEN) {
5581 r = 1; /* 1:full */
5582 break;
5583 }
5584 for (j = 0; j < len && p < end; j++)
5585 to->s[i++] = *p++;
5586 }
5587
5588 to->len = i;
5589 to->reach_end = (p == end ? add->reach_end : 0);
5590
5591 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
5592 if (! to->reach_end) tanc.right = 0;
5593 copy_opt_anc_info(&to->anc, &tanc);
5594
5595 return r;
5596 }
5597
5598 static void
concat_opt_exact_str(OptStr * to,UChar * s,UChar * end,OnigEncoding enc)5599 concat_opt_exact_str(OptStr* to, UChar* s, UChar* end, OnigEncoding enc)
5600 {
5601 int i, j, len;
5602 UChar *p;
5603
5604 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
5605 len = enclen(enc, p);
5606 if (i + len > OPT_EXACT_MAXLEN) break;
5607 for (j = 0; j < len && p < end; j++)
5608 to->s[i++] = *p++;
5609 }
5610
5611 to->len = i;
5612
5613 if (p >= end && to->len == (int )(end - s))
5614 to->reach_end = 1;
5615 }
5616
5617 static void
alt_merge_opt_exact(OptStr * to,OptStr * add,OptEnv * env)5618 alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env)
5619 {
5620 int i, j, len;
5621
5622 if (add->len == 0 || to->len == 0) {
5623 clear_opt_exact(to);
5624 return ;
5625 }
5626
5627 if (! is_equal_mml(&to->mm, &add->mm)) {
5628 clear_opt_exact(to);
5629 return ;
5630 }
5631
5632 for (i = 0; i < to->len && i < add->len; ) {
5633 if (to->s[i] != add->s[i]) break;
5634 len = enclen(env->enc, to->s + i);
5635
5636 for (j = 1; j < len; j++) {
5637 if (to->s[i+j] != add->s[i+j]) break;
5638 }
5639 if (j < len) break;
5640 i += len;
5641 }
5642
5643 if (! add->reach_end || i < add->len || i < to->len) {
5644 to->reach_end = 0;
5645 }
5646 to->len = i;
5647 if (add->case_fold != 0)
5648 to->case_fold = 1;
5649
5650 alt_merge_opt_anc_info(&to->anc, &add->anc);
5651 if (! to->reach_end) to->anc.right = 0;
5652 }
5653
5654 static void
select_opt_exact(OnigEncoding enc,OptStr * now,OptStr * alt)5655 select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt)
5656 {
5657 int vn, va;
5658
5659 vn = now->len;
5660 va = alt->len;
5661
5662 if (va == 0) {
5663 return ;
5664 }
5665 else if (vn == 0) {
5666 copy_opt_exact(now, alt);
5667 return ;
5668 }
5669 else if (vn <= 2 && va <= 2) {
5670 /* ByteValTable[x] is big value --> low price */
5671 va = map_position_value(enc, now->s[0]);
5672 vn = map_position_value(enc, alt->s[0]);
5673
5674 if (now->len > 1) vn += 5;
5675 if (alt->len > 1) va += 5;
5676 }
5677
5678 if (now->case_fold == 0) vn *= 2;
5679 if (alt->case_fold == 0) va *= 2;
5680
5681 if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0)
5682 copy_opt_exact(now, alt);
5683 }
5684
5685 static void
clear_opt_map(OptMap * map)5686 clear_opt_map(OptMap* map)
5687 {
5688 static const OptMap clean_info = {
5689 {0, 0}, {0, 0}, 0,
5690 {
5691 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5692 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5693 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5694 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5695 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5696 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5697 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5698 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5699 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5700 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5701 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5702 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5703 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5704 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5705 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5706 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
5707 }
5708 };
5709
5710 xmemcpy(map, &clean_info, sizeof(OptMap));
5711 }
5712
5713 static void
copy_opt_map(OptMap * to,OptMap * from)5714 copy_opt_map(OptMap* to, OptMap* from)
5715 {
5716 *to = *from;
5717 }
5718
5719 static void
add_char_opt_map(OptMap * m,UChar c,OnigEncoding enc)5720 add_char_opt_map(OptMap* m, UChar c, OnigEncoding enc)
5721 {
5722 if (m->map[c] == 0) {
5723 m->map[c] = 1;
5724 m->value += map_position_value(enc, c);
5725 }
5726 }
5727
5728 static int
add_char_amb_opt_map(OptMap * map,UChar * p,UChar * end,OnigEncoding enc,OnigCaseFoldType fold_flag)5729 add_char_amb_opt_map(OptMap* map, UChar* p, UChar* end,
5730 OnigEncoding enc, OnigCaseFoldType fold_flag)
5731 {
5732 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
5733 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5734 int i, n;
5735
5736 add_char_opt_map(map, p[0], enc);
5737
5738 fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(fold_flag);
5739 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, fold_flag, p, end, items);
5740 if (n < 0) return n;
5741
5742 for (i = 0; i < n; i++) {
5743 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
5744 add_char_opt_map(map, buf[0], enc);
5745 }
5746
5747 return 0;
5748 }
5749
5750 static void
select_opt_map(OptMap * now,OptMap * alt)5751 select_opt_map(OptMap* now, OptMap* alt)
5752 {
5753 static int z = 1<<15; /* 32768: something big value */
5754
5755 int vn, va;
5756
5757 if (alt->value == 0) return ;
5758 if (now->value == 0) {
5759 copy_opt_map(now, alt);
5760 return ;
5761 }
5762
5763 vn = z / now->value;
5764 va = z / alt->value;
5765 if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0)
5766 copy_opt_map(now, alt);
5767 }
5768
5769 static int
comp_opt_exact_or_map(OptStr * e,OptMap * m)5770 comp_opt_exact_or_map(OptStr* e, OptMap* m)
5771 {
5772 #define COMP_EM_BASE 20
5773 int ae, am;
5774 int case_value;
5775
5776 if (m->value <= 0) return -1;
5777
5778 if (e->case_fold != 0) {
5779 case_value = 1;
5780 }
5781 else
5782 case_value = 3;
5783
5784 ae = COMP_EM_BASE * e->len * case_value;
5785 am = COMP_EM_BASE * 5 * 2 / m->value;
5786 return comp_distance_value(&e->mm, &m->mm, ae, am);
5787 }
5788
5789 static void
alt_merge_opt_map(OnigEncoding enc,OptMap * to,OptMap * add)5790 alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add)
5791 {
5792 int i, val;
5793
5794 /* if (! is_equal_mml(&to->mm, &add->mm)) return ; */
5795 if (to->value == 0) return ;
5796 if (add->value == 0 || to->mm.max < add->mm.min) {
5797 clear_opt_map(to);
5798 return ;
5799 }
5800
5801 alt_merge_mml(&to->mm, &add->mm);
5802
5803 val = 0;
5804 for (i = 0; i < CHAR_MAP_SIZE; i++) {
5805 if (add->map[i])
5806 to->map[i] = 1;
5807
5808 if (to->map[i])
5809 val += map_position_value(enc, i);
5810 }
5811 to->value = val;
5812
5813 alt_merge_opt_anc_info(&to->anc, &add->anc);
5814 }
5815
5816 static void
set_bound_node_opt_info(OptNode * opt,MinMax * plen)5817 set_bound_node_opt_info(OptNode* opt, MinMax* plen)
5818 {
5819 copy_mml(&(opt->sb.mm), plen);
5820 copy_mml(&(opt->spr.mm), plen);
5821 copy_mml(&(opt->map.mm), plen);
5822 }
5823
5824 static void
clear_node_opt_info(OptNode * opt)5825 clear_node_opt_info(OptNode* opt)
5826 {
5827 clear_mml(&opt->len);
5828 clear_opt_anc_info(&opt->anc);
5829 clear_opt_exact(&opt->sb);
5830 clear_opt_exact(&opt->sm);
5831 clear_opt_exact(&opt->spr);
5832 clear_opt_map(&opt->map);
5833 }
5834
5835 static void
copy_node_opt_info(OptNode * to,OptNode * from)5836 copy_node_opt_info(OptNode* to, OptNode* from)
5837 {
5838 *to = *from;
5839 }
5840
5841 static void
concat_left_node_opt_info(OnigEncoding enc,OptNode * to,OptNode * add)5842 concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add)
5843 {
5844 int sb_reach, sm_reach;
5845 OptAnc tanc;
5846
5847 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
5848 copy_opt_anc_info(&to->anc, &tanc);
5849
5850 if (add->sb.len > 0 && to->len.max == 0) {
5851 concat_opt_anc_info(&tanc, &to->anc, &add->sb.anc, to->len.max, add->len.max);
5852 copy_opt_anc_info(&add->sb.anc, &tanc);
5853 }
5854
5855 if (add->map.value > 0 && to->len.max == 0) {
5856 if (add->map.mm.max == 0)
5857 add->map.anc.left |= to->anc.left;
5858 }
5859
5860 sb_reach = to->sb.reach_end;
5861 sm_reach = to->sm.reach_end;
5862
5863 if (add->len.max != 0)
5864 to->sb.reach_end = to->sm.reach_end = 0;
5865
5866 if (add->sb.len > 0) {
5867 if (sb_reach) {
5868 concat_opt_exact(&to->sb, &add->sb, enc);
5869 clear_opt_exact(&add->sb);
5870 }
5871 else if (sm_reach) {
5872 concat_opt_exact(&to->sm, &add->sb, enc);
5873 clear_opt_exact(&add->sb);
5874 }
5875 }
5876 select_opt_exact(enc, &to->sm, &add->sb);
5877 select_opt_exact(enc, &to->sm, &add->sm);
5878
5879 if (to->spr.len > 0) {
5880 if (add->len.max > 0) {
5881 if (to->spr.mm.max == 0)
5882 select_opt_exact(enc, &to->sb, &to->spr);
5883 else
5884 select_opt_exact(enc, &to->sm, &to->spr);
5885 }
5886 }
5887 else if (add->spr.len > 0) {
5888 copy_opt_exact(&to->spr, &add->spr);
5889 }
5890
5891 select_opt_map(&to->map, &add->map);
5892 add_mml(&to->len, &add->len);
5893 }
5894
5895 static void
alt_merge_node_opt_info(OptNode * to,OptNode * add,OptEnv * env)5896 alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env)
5897 {
5898 alt_merge_opt_anc_info(&to->anc, &add->anc);
5899 alt_merge_opt_exact(&to->sb, &add->sb, env);
5900 alt_merge_opt_exact(&to->sm, &add->sm, env);
5901 alt_merge_opt_exact(&to->spr, &add->spr, env);
5902 alt_merge_opt_map(env->enc, &to->map, &add->map);
5903
5904 alt_merge_mml(&to->len, &add->len);
5905 }
5906
5907
5908 #define MAX_NODE_OPT_INFO_REF_COUNT 5
5909
5910 static int
optimize_nodes(Node * node,OptNode * opt,OptEnv * env)5911 optimize_nodes(Node* node, OptNode* opt, OptEnv* env)
5912 {
5913 int i;
5914 int r;
5915 OptNode xo;
5916 OnigEncoding enc;
5917
5918 r = 0;
5919 enc = env->enc;
5920 clear_node_opt_info(opt);
5921 set_bound_node_opt_info(opt, &env->mm);
5922
5923 switch (NODE_TYPE(node)) {
5924 case NODE_LIST:
5925 {
5926 OptEnv nenv;
5927 Node* nd = node;
5928
5929 copy_opt_env(&nenv, env);
5930 do {
5931 r = optimize_nodes(NODE_CAR(nd), &xo, &nenv);
5932 if (r == 0) {
5933 add_mml(&nenv.mm, &xo.len);
5934 concat_left_node_opt_info(enc, opt, &xo);
5935 }
5936 } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd)));
5937 }
5938 break;
5939
5940 case NODE_ALT:
5941 {
5942 Node* nd = node;
5943
5944 do {
5945 r = optimize_nodes(NODE_CAR(nd), &xo, env);
5946 if (r == 0) {
5947 if (nd == node) copy_node_opt_info(opt, &xo);
5948 else alt_merge_node_opt_info(opt, &xo, env);
5949 }
5950 } while ((r == 0) && IS_NOT_NULL(nd = NODE_CDR(nd)));
5951 }
5952 break;
5953
5954 case NODE_STRING:
5955 {
5956 StrNode* sn = STR_(node);
5957 int slen = (int )(sn->end - sn->s);
5958
5959 if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) {
5960 concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc);
5961 if (slen > 0) {
5962 add_char_opt_map(&opt->map, *(sn->s), enc);
5963 }
5964 set_mml(&opt->len, slen, slen);
5965 }
5966 else {
5967 int max, min;
5968
5969 concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc);
5970 opt->sb.case_fold = 1;
5971
5972 if (slen > 0) {
5973 r = add_char_amb_opt_map(&opt->map, sn->s, sn->end,
5974 enc, env->case_fold_flag);
5975 if (r != 0) break;
5976 }
5977
5978 max = slen;
5979 min = sn->case_min_len * ONIGENC_MBC_MINLEN(enc);
5980 set_mml(&opt->len, min, max);
5981 }
5982 }
5983 break;
5984
5985 case NODE_CCLASS:
5986 {
5987 int z;
5988 CClassNode* cc = CCLASS_(node);
5989
5990 /* no need to check ignore case. (set in tune_tree()) */
5991
5992 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
5993 OnigLen min = ONIGENC_MBC_MINLEN(enc);
5994 OnigLen max = ONIGENC_MBC_MAXLEN_DIST(enc);
5995
5996 set_mml(&opt->len, min, max);
5997 }
5998 else {
5999 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
6000 z = BITSET_AT(cc->bs, i);
6001 if ((z && ! IS_NCCLASS_NOT(cc)) || (! z && IS_NCCLASS_NOT(cc))) {
6002 add_char_opt_map(&opt->map, (UChar )i, enc);
6003 }
6004 }
6005 set_mml(&opt->len, 1, 1);
6006 }
6007 }
6008 break;
6009
6010 case NODE_CTYPE:
6011 {
6012 int min, max;
6013 int range;
6014
6015 max = ONIGENC_MBC_MAXLEN_DIST(enc);
6016
6017 if (max == 1) {
6018 min = 1;
6019
6020 switch (CTYPE_(node)->ctype) {
6021 case CTYPE_ANYCHAR:
6022 break;
6023
6024 case ONIGENC_CTYPE_WORD:
6025 range = CTYPE_(node)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
6026 if (CTYPE_(node)->not != 0) {
6027 for (i = 0; i < range; i++) {
6028 if (! ONIGENC_IS_CODE_WORD(enc, i)) {
6029 add_char_opt_map(&opt->map, (UChar )i, enc);
6030 }
6031 }
6032 for (i = range; i < SINGLE_BYTE_SIZE; i++) {
6033 add_char_opt_map(&opt->map, (UChar )i, enc);
6034 }
6035 }
6036 else {
6037 for (i = 0; i < range; i++) {
6038 if (ONIGENC_IS_CODE_WORD(enc, i)) {
6039 add_char_opt_map(&opt->map, (UChar )i, enc);
6040 }
6041 }
6042 }
6043 break;
6044 }
6045 }
6046 else {
6047 min = ONIGENC_MBC_MINLEN(enc);
6048 }
6049 set_mml(&opt->len, min, max);
6050 }
6051 break;
6052
6053 case NODE_ANCHOR:
6054 switch (ANCHOR_(node)->type) {
6055 case ANCR_BEGIN_BUF:
6056 case ANCR_BEGIN_POSITION:
6057 case ANCR_BEGIN_LINE:
6058 case ANCR_END_BUF:
6059 case ANCR_SEMI_END_BUF:
6060 case ANCR_END_LINE:
6061 case ANCR_PREC_READ_NOT:
6062 case ANCR_LOOK_BEHIND:
6063 add_opt_anc_info(&opt->anc, ANCHOR_(node)->type);
6064 break;
6065
6066 case ANCR_PREC_READ:
6067 {
6068 r = optimize_nodes(NODE_BODY(node), &xo, env);
6069 if (r == 0) {
6070 if (xo.sb.len > 0)
6071 copy_opt_exact(&opt->spr, &xo.sb);
6072 else if (xo.sm.len > 0)
6073 copy_opt_exact(&opt->spr, &xo.sm);
6074
6075 opt->spr.reach_end = 0;
6076
6077 if (xo.map.value > 0)
6078 copy_opt_map(&opt->map, &xo.map);
6079 }
6080 }
6081 break;
6082
6083 case ANCR_LOOK_BEHIND_NOT:
6084 break;
6085 }
6086 break;
6087
6088 case NODE_BACKREF:
6089 if (! NODE_IS_CHECKER(node)) {
6090 int* backs;
6091 OnigLen min, max, tmin, tmax;
6092 MemEnv* mem_env = SCANENV_MEMENV(env->scan_env);
6093 BackRefNode* br = BACKREF_(node);
6094
6095 if (NODE_IS_RECURSION(node)) {
6096 set_mml(&opt->len, 0, INFINITE_LEN);
6097 break;
6098 }
6099 backs = BACKREFS_P(br);
6100 min = tree_min_len(mem_env[backs[0]].mem_node, env->scan_env);
6101 max = tree_max_len(mem_env[backs[0]].mem_node, env->scan_env);
6102 for (i = 1; i < br->back_num; i++) {
6103 tmin = tree_min_len(mem_env[backs[i]].mem_node, env->scan_env);
6104 tmax = tree_max_len(mem_env[backs[i]].mem_node, env->scan_env);
6105 if (min > tmin) min = tmin;
6106 if (max < tmax) max = tmax;
6107 }
6108 set_mml(&opt->len, min, max);
6109 }
6110 break;
6111
6112 #ifdef USE_CALL
6113 case NODE_CALL:
6114 if (NODE_IS_RECURSION(node))
6115 set_mml(&opt->len, 0, INFINITE_LEN);
6116 else {
6117 OnigOptionType save = env->options;
6118 env->options = BAG_(NODE_BODY(node))->o.options;
6119 r = optimize_nodes(NODE_BODY(node), opt, env);
6120 env->options = save;
6121 }
6122 break;
6123 #endif
6124
6125 case NODE_QUANT:
6126 {
6127 OnigLen min, max;
6128 QuantNode* qn = QUANT_(node);
6129
6130 r = optimize_nodes(NODE_BODY(node), &xo, env);
6131 if (r != 0) break;
6132
6133 if (qn->lower > 0) {
6134 copy_node_opt_info(opt, &xo);
6135 if (xo.sb.len > 0) {
6136 if (xo.sb.reach_end) {
6137 for (i = 2; i <= qn->lower && ! is_full_opt_exact(&opt->sb); i++) {
6138 int rc = concat_opt_exact(&opt->sb, &xo.sb, enc);
6139 if (rc > 0) break;
6140 }
6141 if (i < qn->lower) opt->sb.reach_end = 0;
6142 }
6143 }
6144
6145 if (qn->lower != qn->upper) {
6146 opt->sb.reach_end = 0;
6147 opt->sm.reach_end = 0;
6148 }
6149 if (qn->lower > 1)
6150 opt->sm.reach_end = 0;
6151 }
6152
6153 if (IS_INFINITE_REPEAT(qn->upper)) {
6154 if (env->mm.max == 0 &&
6155 NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) {
6156 if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env)))
6157 add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML);
6158 else
6159 add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF);
6160 }
6161
6162 max = (xo.len.max > 0 ? INFINITE_LEN : 0);
6163 }
6164 else {
6165 max = distance_multiply(xo.len.max, qn->upper);
6166 }
6167
6168 min = distance_multiply(xo.len.min, qn->lower);
6169 set_mml(&opt->len, min, max);
6170 }
6171 break;
6172
6173 case NODE_BAG:
6174 {
6175 BagNode* en = BAG_(node);
6176
6177 switch (en->type) {
6178 case BAG_OPTION:
6179 {
6180 OnigOptionType save = env->options;
6181
6182 env->options = en->o.options;
6183 r = optimize_nodes(NODE_BODY(node), opt, env);
6184 env->options = save;
6185 }
6186 break;
6187
6188 case BAG_MEMORY:
6189 #ifdef USE_CALL
6190 en->opt_count++;
6191 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
6192 OnigLen min, max;
6193
6194 min = 0;
6195 max = INFINITE_LEN;
6196 if (NODE_IS_MIN_FIXED(node)) min = en->min_len;
6197 if (NODE_IS_MAX_FIXED(node)) max = en->max_len;
6198 set_mml(&opt->len, min, max);
6199 }
6200 else
6201 #endif
6202 {
6203 r = optimize_nodes(NODE_BODY(node), opt, env);
6204 if (is_set_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK)) {
6205 if (MEM_STATUS_AT0(env->scan_env->backrefed_mem, en->m.regnum))
6206 remove_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK);
6207 }
6208 }
6209 break;
6210
6211 case BAG_STOP_BACKTRACK:
6212 r = optimize_nodes(NODE_BODY(node), opt, env);
6213 break;
6214
6215 case BAG_IF_ELSE:
6216 {
6217 OptEnv nenv;
6218
6219 copy_opt_env(&nenv, env);
6220 r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv);
6221 if (r == 0) {
6222 add_mml(&nenv.mm, &xo.len);
6223 concat_left_node_opt_info(enc, opt, &xo);
6224 if (IS_NOT_NULL(en->te.Then)) {
6225 r = optimize_nodes(en->te.Then, &xo, &nenv);
6226 if (r == 0) {
6227 concat_left_node_opt_info(enc, opt, &xo);
6228 }
6229 }
6230
6231 if (IS_NOT_NULL(en->te.Else)) {
6232 r = optimize_nodes(en->te.Else, &xo, env);
6233 if (r == 0)
6234 alt_merge_node_opt_info(opt, &xo, env);
6235 }
6236 }
6237 }
6238 break;
6239 }
6240 }
6241 break;
6242
6243 case NODE_GIMMICK:
6244 break;
6245
6246 default:
6247 #ifdef ONIG_DEBUG
6248 fprintf(stderr, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node));
6249 #endif
6250 r = ONIGERR_TYPE_BUG;
6251 break;
6252 }
6253
6254 return r;
6255 }
6256
6257 static int
set_optimize_exact(regex_t * reg,OptStr * e)6258 set_optimize_exact(regex_t* reg, OptStr* e)
6259 {
6260 int r;
6261
6262 if (e->len == 0) return 0;
6263
6264 reg->exact = (UChar* )xmalloc(e->len);
6265 CHECK_NULL_RETURN_MEMERR(reg->exact);
6266 xmemcpy(reg->exact, e->s, e->len);
6267 reg->exact_end = reg->exact + e->len;
6268
6269 if (e->case_fold) {
6270 reg->optimize = OPTIMIZE_STR_CASE_FOLD;
6271 }
6272 else {
6273 int allow_reverse;
6274
6275 allow_reverse =
6276 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
6277
6278 if (e->len >= 2 || (e->len >= 1 && allow_reverse)) {
6279 r = set_sunday_quick_search_or_bmh_skip_table(reg, 0,
6280 reg->exact, reg->exact_end,
6281 reg->map, &(reg->map_offset));
6282 if (r != 0) return r;
6283
6284 reg->optimize = (allow_reverse != 0
6285 ? OPTIMIZE_STR_FAST
6286 : OPTIMIZE_STR_FAST_STEP_FORWARD);
6287 }
6288 else {
6289 reg->optimize = OPTIMIZE_STR;
6290 }
6291 }
6292
6293 reg->dist_min = e->mm.min;
6294 reg->dist_max = e->mm.max;
6295
6296 if (reg->dist_min != INFINITE_LEN) {
6297 int n;
6298 if (e->case_fold != 0)
6299 n = 1;
6300 else
6301 n = (int )(reg->exact_end - reg->exact);
6302
6303 reg->threshold_len = reg->dist_min + n;
6304 }
6305
6306 return 0;
6307 }
6308
6309 static void
set_optimize_map(regex_t * reg,OptMap * m)6310 set_optimize_map(regex_t* reg, OptMap* m)
6311 {
6312 int i;
6313
6314 for (i = 0; i < CHAR_MAP_SIZE; i++)
6315 reg->map[i] = m->map[i];
6316
6317 reg->optimize = OPTIMIZE_MAP;
6318 reg->dist_min = m->mm.min;
6319 reg->dist_max = m->mm.max;
6320
6321 if (reg->dist_min != INFINITE_LEN) {
6322 reg->threshold_len = reg->dist_min + 1;
6323 }
6324 }
6325
6326 static void
set_sub_anchor(regex_t * reg,OptAnc * anc)6327 set_sub_anchor(regex_t* reg, OptAnc* anc)
6328 {
6329 reg->sub_anchor |= anc->left & ANCR_BEGIN_LINE;
6330 reg->sub_anchor |= anc->right & ANCR_END_LINE;
6331 }
6332
6333 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6334 static void print_optimize_info(FILE* f, regex_t* reg);
6335 #endif
6336
6337 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)6338 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
6339 {
6340 int r;
6341 OptNode opt;
6342 OptEnv env;
6343
6344 env.enc = reg->enc;
6345 env.options = reg->options;
6346 env.case_fold_flag = reg->case_fold_flag;
6347 env.scan_env = scan_env;
6348 clear_mml(&env.mm);
6349
6350 r = optimize_nodes(node, &opt, &env);
6351 if (r != 0) return r;
6352
6353 reg->anchor = opt.anc.left & (ANCR_BEGIN_BUF |
6354 ANCR_BEGIN_POSITION | ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML |
6355 ANCR_LOOK_BEHIND);
6356
6357 if ((opt.anc.left & (ANCR_LOOK_BEHIND | ANCR_PREC_READ_NOT)) != 0)
6358 reg->anchor &= ~ANCR_ANYCHAR_INF_ML;
6359
6360 reg->anchor |= opt.anc.right & (ANCR_END_BUF | ANCR_SEMI_END_BUF |
6361 ANCR_PREC_READ_NOT);
6362
6363 if (reg->anchor & (ANCR_END_BUF | ANCR_SEMI_END_BUF)) {
6364 reg->anc_dist_min = opt.len.min;
6365 reg->anc_dist_max = opt.len.max;
6366 }
6367
6368 if (opt.sb.len > 0 || opt.sm.len > 0) {
6369 select_opt_exact(reg->enc, &opt.sb, &opt.sm);
6370 if (opt.map.value > 0 && comp_opt_exact_or_map(&opt.sb, &opt.map) > 0) {
6371 goto set_map;
6372 }
6373 else {
6374 r = set_optimize_exact(reg, &opt.sb);
6375 set_sub_anchor(reg, &opt.sb.anc);
6376 }
6377 }
6378 else if (opt.map.value > 0) {
6379 set_map:
6380 set_optimize_map(reg, &opt.map);
6381 set_sub_anchor(reg, &opt.map.anc);
6382 }
6383 else {
6384 reg->sub_anchor |= opt.anc.left & ANCR_BEGIN_LINE;
6385 if (opt.len.max == 0)
6386 reg->sub_anchor |= opt.anc.right & ANCR_END_LINE;
6387 }
6388
6389 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6390 print_optimize_info(stderr, reg);
6391 #endif
6392 return r;
6393 }
6394
6395 static void
clear_optimize_info(regex_t * reg)6396 clear_optimize_info(regex_t* reg)
6397 {
6398 reg->optimize = OPTIMIZE_NONE;
6399 reg->anchor = 0;
6400 reg->anc_dist_min = 0;
6401 reg->anc_dist_max = 0;
6402 reg->sub_anchor = 0;
6403 reg->exact_end = (UChar* )NULL;
6404 reg->map_offset = 0;
6405 reg->threshold_len = 0;
6406 if (IS_NOT_NULL(reg->exact)) {
6407 xfree(reg->exact);
6408 reg->exact = (UChar* )NULL;
6409 }
6410 }
6411
6412 #ifdef ONIG_DEBUG
6413
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)6414 static void print_enc_string(FILE* fp, OnigEncoding enc,
6415 const UChar *s, const UChar *end)
6416 {
6417 fprintf(fp, "\nPATTERN: /");
6418
6419 if (ONIGENC_MBC_MINLEN(enc) > 1) {
6420 const UChar *p;
6421 OnigCodePoint code;
6422
6423 p = s;
6424 while (p < end) {
6425 code = ONIGENC_MBC_TO_CODE(enc, p, end);
6426 if (code >= 0x80) {
6427 fprintf(fp, " 0x%04x ", (int )code);
6428 }
6429 else {
6430 fputc((int )code, fp);
6431 }
6432
6433 p += enclen(enc, p);
6434 }
6435 }
6436 else {
6437 while (s < end) {
6438 fputc((int )*s, fp);
6439 s++;
6440 }
6441 }
6442
6443 fprintf(fp, "/\n");
6444 }
6445
6446 #endif /* ONIG_DEBUG */
6447
6448 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6449
6450 static void
print_distance_range(FILE * f,OnigLen a,OnigLen b)6451 print_distance_range(FILE* f, OnigLen a, OnigLen b)
6452 {
6453 if (a == INFINITE_LEN)
6454 fputs("inf", f);
6455 else
6456 fprintf(f, "(%u)", a);
6457
6458 fputs("-", f);
6459
6460 if (b == INFINITE_LEN)
6461 fputs("inf", f);
6462 else
6463 fprintf(f, "(%u)", b);
6464 }
6465
6466 static void
print_anchor(FILE * f,int anchor)6467 print_anchor(FILE* f, int anchor)
6468 {
6469 int q = 0;
6470
6471 fprintf(f, "[");
6472
6473 if (anchor & ANCR_BEGIN_BUF) {
6474 fprintf(f, "begin-buf");
6475 q = 1;
6476 }
6477 if (anchor & ANCR_BEGIN_LINE) {
6478 if (q) fprintf(f, ", ");
6479 q = 1;
6480 fprintf(f, "begin-line");
6481 }
6482 if (anchor & ANCR_BEGIN_POSITION) {
6483 if (q) fprintf(f, ", ");
6484 q = 1;
6485 fprintf(f, "begin-pos");
6486 }
6487 if (anchor & ANCR_END_BUF) {
6488 if (q) fprintf(f, ", ");
6489 q = 1;
6490 fprintf(f, "end-buf");
6491 }
6492 if (anchor & ANCR_SEMI_END_BUF) {
6493 if (q) fprintf(f, ", ");
6494 q = 1;
6495 fprintf(f, "semi-end-buf");
6496 }
6497 if (anchor & ANCR_END_LINE) {
6498 if (q) fprintf(f, ", ");
6499 q = 1;
6500 fprintf(f, "end-line");
6501 }
6502 if (anchor & ANCR_ANYCHAR_INF) {
6503 if (q) fprintf(f, ", ");
6504 q = 1;
6505 fprintf(f, "anychar-inf");
6506 }
6507 if (anchor & ANCR_ANYCHAR_INF_ML) {
6508 if (q) fprintf(f, ", ");
6509 fprintf(f, "anychar-inf-ml");
6510 }
6511
6512 fprintf(f, "]");
6513 }
6514
6515 static void
print_optimize_info(FILE * f,regex_t * reg)6516 print_optimize_info(FILE* f, regex_t* reg)
6517 {
6518 static const char* on[] = { "NONE", "STR",
6519 "STR_FAST", "STR_FAST_STEP_FORWARD",
6520 "STR_CASE_FOLD", "MAP" };
6521
6522 fprintf(f, "optimize: %s\n", on[reg->optimize]);
6523 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
6524 if ((reg->anchor & ANCR_END_BUF_MASK) != 0)
6525 print_distance_range(f, reg->anc_dist_min, reg->anc_dist_max);
6526 fprintf(f, "\n");
6527
6528 if (reg->optimize) {
6529 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
6530 fprintf(f, "\n");
6531 }
6532 fprintf(f, "\n");
6533
6534 if (reg->exact) {
6535 UChar *p;
6536 fprintf(f, "exact: [");
6537 for (p = reg->exact; p < reg->exact_end; p++) {
6538 fputc(*p, f);
6539 }
6540 fprintf(f, "]: length: %ld\n", (reg->exact_end - reg->exact));
6541 }
6542 else if (reg->optimize & OPTIMIZE_MAP) {
6543 int c, i, n = 0;
6544
6545 for (i = 0; i < CHAR_MAP_SIZE; i++)
6546 if (reg->map[i]) n++;
6547
6548 fprintf(f, "map: n=%d\n", n);
6549 if (n > 0) {
6550 c = 0;
6551 fputc('[', f);
6552 for (i = 0; i < CHAR_MAP_SIZE; i++) {
6553 if (reg->map[i] != 0) {
6554 if (c > 0) fputs(", ", f);
6555 c++;
6556 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
6557 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
6558 fputc(i, f);
6559 else
6560 fprintf(f, "%d", i);
6561 }
6562 }
6563 fprintf(f, "]\n");
6564 }
6565 }
6566 }
6567 #endif
6568
6569
6570 extern RegexExt*
onig_get_regex_ext(regex_t * reg)6571 onig_get_regex_ext(regex_t* reg)
6572 {
6573 if (IS_NULL(reg->extp)) {
6574 RegexExt* ext = (RegexExt* )xmalloc(sizeof(*ext));
6575 if (IS_NULL(ext)) return 0;
6576
6577 ext->pattern = 0;
6578 ext->pattern_end = 0;
6579 #ifdef USE_CALLOUT
6580 ext->tag_table = 0;
6581 ext->callout_num = 0;
6582 ext->callout_list_alloc = 0;
6583 ext->callout_list = 0;
6584 #endif
6585
6586 reg->extp = ext;
6587 }
6588
6589 return reg->extp;
6590 }
6591
6592 static void
free_regex_ext(RegexExt * ext)6593 free_regex_ext(RegexExt* ext)
6594 {
6595 if (IS_NOT_NULL(ext)) {
6596 if (IS_NOT_NULL(ext->pattern))
6597 xfree((void* )ext->pattern);
6598
6599 #ifdef USE_CALLOUT
6600 if (IS_NOT_NULL(ext->tag_table))
6601 onig_callout_tag_table_free(ext->tag_table);
6602
6603 if (IS_NOT_NULL(ext->callout_list))
6604 onig_free_reg_callout_list(ext->callout_num, ext->callout_list);
6605 #endif
6606
6607 xfree(ext);
6608 }
6609 }
6610
6611 extern int
onig_ext_set_pattern(regex_t * reg,const UChar * pattern,const UChar * pattern_end)6612 onig_ext_set_pattern(regex_t* reg, const UChar* pattern, const UChar* pattern_end)
6613 {
6614 RegexExt* ext;
6615 UChar* s;
6616
6617 ext = onig_get_regex_ext(reg);
6618 CHECK_NULL_RETURN_MEMERR(ext);
6619
6620 s = onigenc_strdup(reg->enc, pattern, pattern_end);
6621 CHECK_NULL_RETURN_MEMERR(s);
6622
6623 ext->pattern = s;
6624 ext->pattern_end = s + (pattern_end - pattern);
6625
6626 return ONIG_NORMAL;
6627 }
6628
6629 extern void
onig_free_body(regex_t * reg)6630 onig_free_body(regex_t* reg)
6631 {
6632 if (IS_NOT_NULL(reg)) {
6633 ops_free(reg);
6634 if (IS_NOT_NULL(reg->string_pool)) {
6635 xfree(reg->string_pool);
6636 reg->string_pool_end = reg->string_pool = 0;
6637 }
6638 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
6639 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
6640 if (IS_NOT_NULL(reg->extp)) {
6641 free_regex_ext(reg->extp);
6642 reg->extp = 0;
6643 }
6644
6645 onig_names_free(reg);
6646 }
6647 }
6648
6649 extern void
onig_free(regex_t * reg)6650 onig_free(regex_t* reg)
6651 {
6652 if (IS_NOT_NULL(reg)) {
6653 onig_free_body(reg);
6654 xfree(reg);
6655 }
6656 }
6657
6658
6659 #ifdef ONIG_DEBUG_PARSE
6660 static void print_tree P_((FILE* f, Node* node));
6661 #endif
6662
6663 extern int onig_init_for_match_at(regex_t* reg);
6664
6665 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)6666 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
6667 OnigErrorInfo* einfo)
6668 {
6669 int r;
6670 Node* root;
6671 ScanEnv scan_env;
6672 #ifdef USE_CALL
6673 UnsetAddrList uslist = {0};
6674 #endif
6675
6676 root = 0;
6677 if (IS_NOT_NULL(einfo)) {
6678 einfo->enc = reg->enc;
6679 einfo->par = (UChar* )NULL;
6680 }
6681
6682 #ifdef ONIG_DEBUG
6683 print_enc_string(stderr, reg->enc, pattern, pattern_end);
6684 #endif
6685
6686 if (reg->ops_alloc == 0) {
6687 r = ops_init(reg, OPS_INIT_SIZE);
6688 if (r != 0) goto end;
6689 }
6690 else
6691 reg->ops_used = 0;
6692
6693 reg->string_pool = 0;
6694 reg->string_pool_end = 0;
6695 reg->num_mem = 0;
6696 reg->num_repeat = 0;
6697 reg->num_empty_check = 0;
6698 reg->repeat_range_alloc = 0;
6699 reg->repeat_range = (RepeatRange* )NULL;
6700 reg->empty_status_mem = 0;
6701
6702 r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env);
6703 if (r != 0) goto err;
6704
6705 r = reduce_string_list(root);
6706 if (r != 0) goto err;
6707
6708 /* mixed use named group and no-named group */
6709 if (scan_env.num_named > 0 &&
6710 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
6711 ! ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
6712 if (scan_env.num_named != scan_env.num_mem)
6713 r = disable_noname_group_capture(&root, reg, &scan_env);
6714 else
6715 r = numbered_ref_check(root);
6716
6717 if (r != 0) goto err;
6718 }
6719
6720 r = check_backrefs(root, &scan_env);
6721 if (r != 0) goto err;
6722
6723 #ifdef USE_CALL
6724 if (scan_env.num_call > 0) {
6725 r = unset_addr_list_init(&uslist, scan_env.num_call);
6726 if (r != 0) goto err;
6727 scan_env.unset_addr_list = &uslist;
6728 r = tune_call(root, &scan_env, 0);
6729 if (r != 0) goto err_unset;
6730 r = tune_call2(root);
6731 if (r != 0) goto err_unset;
6732 r = recursive_call_check_trav(root, &scan_env, 0);
6733 if (r < 0) goto err_unset;
6734 r = infinite_recursive_call_check_trav(root, &scan_env);
6735 if (r != 0) goto err_unset;
6736
6737 tune_called_state(root, 0);
6738 }
6739
6740 reg->num_call = scan_env.num_call;
6741 #endif
6742
6743 #ifdef ONIG_DEBUG_PARSE
6744 fprintf(stderr, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth);
6745 fprintf(stderr, "TREE (parsed)\n");
6746 print_tree(stderr, root);
6747 fprintf(stderr, "\n");
6748 #endif
6749
6750 r = tune_tree(root, reg, 0, &scan_env);
6751 if (r != 0) goto err_unset;
6752
6753 if (scan_env.backref_num != 0) {
6754 set_parent_node_trav(root, NULL_NODE);
6755 r = set_empty_repeat_node_trav(root, NULL_NODE, &scan_env);
6756 if (r != 0) goto err_unset;
6757 set_empty_status_check_trav(root, &scan_env);
6758 }
6759
6760 #ifdef ONIG_DEBUG_PARSE
6761 fprintf(stderr, "TREE (after tune)\n");
6762 print_tree(stderr, root);
6763 fprintf(stderr, "\n");
6764 #endif
6765
6766 reg->capture_history = scan_env.cap_history;
6767 reg->push_mem_start = scan_env.backtrack_mem | scan_env.cap_history;
6768
6769 #ifdef USE_CALLOUT
6770 if (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) {
6771 reg->push_mem_end = reg->push_mem_start;
6772 }
6773 else {
6774 if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start))
6775 reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history;
6776 else
6777 reg->push_mem_end = reg->push_mem_start &
6778 (scan_env.backrefed_mem | scan_env.cap_history);
6779 }
6780 #else
6781 if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start))
6782 reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history;
6783 else
6784 reg->push_mem_end = reg->push_mem_start &
6785 (scan_env.backrefed_mem | scan_env.cap_history);
6786 #endif
6787
6788 clear_optimize_info(reg);
6789 #ifndef ONIG_DONT_OPTIMIZE
6790 r = set_optimize_info_from_tree(root, reg, &scan_env);
6791 if (r != 0) goto err_unset;
6792 #endif
6793
6794 if (IS_NOT_NULL(scan_env.mem_env_dynamic)) {
6795 xfree(scan_env.mem_env_dynamic);
6796 scan_env.mem_env_dynamic = (MemEnv* )NULL;
6797 }
6798
6799 r = compile_tree(root, reg, &scan_env);
6800 if (r == 0) {
6801 if (scan_env.keep_num > 0) {
6802 r = add_op(reg, OP_UPDATE_VAR);
6803 if (r != 0) goto err;
6804
6805 COP(reg)->update_var.type = UPDATE_VAR_KEEP_FROM_STACK_LAST;
6806 COP(reg)->update_var.id = 0; /* not used */
6807 }
6808
6809 r = add_op(reg, OP_END);
6810 if (r != 0) goto err;
6811
6812 #ifdef USE_CALL
6813 if (scan_env.num_call > 0) {
6814 r = fix_unset_addr_list(&uslist, reg);
6815 unset_addr_list_end(&uslist);
6816 if (r != 0) goto err;
6817 }
6818 #endif
6819
6820 set_addr_in_repeat_range(reg);
6821
6822 if ((reg->push_mem_end != 0)
6823 #ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR
6824 || (reg->num_repeat != 0)
6825 || (reg->num_empty_check != 0)
6826 #endif
6827 #ifdef USE_CALLOUT
6828 || (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0)
6829 #endif
6830 )
6831 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
6832 else {
6833 if (reg->push_mem_start != 0)
6834 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
6835 else
6836 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
6837 }
6838
6839 r = ops_make_string_pool(reg);
6840 if (r != 0) goto err;
6841 }
6842 #ifdef USE_CALL
6843 else if (scan_env.num_call > 0) {
6844 unset_addr_list_end(&uslist);
6845 }
6846 #endif
6847 onig_node_free(root);
6848
6849 #ifdef ONIG_DEBUG_COMPILE
6850 onig_print_names(stderr, reg);
6851 onig_print_compiled_byte_code_list(stderr, reg);
6852 #endif
6853
6854 #ifdef USE_DIRECT_THREADED_CODE
6855 /* opcode -> opaddr */
6856 onig_init_for_match_at(reg);
6857 #endif
6858
6859 end:
6860 return r;
6861
6862 err_unset:
6863 #ifdef USE_CALL
6864 if (scan_env.num_call > 0) {
6865 unset_addr_list_end(&uslist);
6866 }
6867 #endif
6868 err:
6869 if (IS_NOT_NULL(scan_env.error)) {
6870 if (IS_NOT_NULL(einfo)) {
6871 einfo->par = scan_env.error;
6872 einfo->par_end = scan_env.error_end;
6873 }
6874 }
6875
6876 onig_node_free(root);
6877 if (IS_NOT_NULL(scan_env.mem_env_dynamic))
6878 xfree(scan_env.mem_env_dynamic);
6879 return r;
6880 }
6881
6882
6883 static int onig_inited = 0;
6884
6885 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)6886 onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_flag,
6887 OnigEncoding enc, OnigSyntaxType* syntax)
6888 {
6889 int r;
6890
6891 xmemset(reg, 0, sizeof(*reg));
6892
6893 if (onig_inited == 0) {
6894 #if 0
6895 return ONIGERR_LIBRARY_IS_NOT_INITIALIZED;
6896 #else
6897 r = onig_initialize(&enc, 1);
6898 if (r != 0)
6899 return ONIGERR_FAIL_TO_INITIALIZE;
6900
6901 onig_warning("You didn't call onig_initialize() explicitly");
6902 #endif
6903 }
6904
6905 if (IS_NULL(reg))
6906 return ONIGERR_INVALID_ARGUMENT;
6907
6908 if (ONIGENC_IS_UNDEF(enc))
6909 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
6910
6911 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
6912 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
6913 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
6914 }
6915
6916 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
6917 option |= syntax->options;
6918 option &= ~ONIG_OPTION_SINGLELINE;
6919 }
6920 else
6921 option |= syntax->options;
6922
6923 (reg)->enc = enc;
6924 (reg)->options = option;
6925 (reg)->syntax = syntax;
6926 (reg)->optimize = 0;
6927 (reg)->exact = (UChar* )NULL;
6928 (reg)->extp = (RegexExt* )NULL;
6929
6930 (reg)->ops = (Operation* )NULL;
6931 (reg)->ops_curr = (Operation* )NULL;
6932 (reg)->ops_used = 0;
6933 (reg)->ops_alloc = 0;
6934 (reg)->name_table = (void* )NULL;
6935
6936 (reg)->case_fold_flag = case_fold_flag;
6937 return 0;
6938 }
6939
6940 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)6941 onig_new_without_alloc(regex_t* reg,
6942 const UChar* pattern, const UChar* pattern_end,
6943 OnigOptionType option, OnigEncoding enc,
6944 OnigSyntaxType* syntax, OnigErrorInfo* einfo)
6945 {
6946 int r;
6947
6948 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
6949 if (r != 0) return r;
6950
6951 r = onig_compile(reg, pattern, pattern_end, einfo);
6952 return r;
6953 }
6954
6955 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)6956 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
6957 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
6958 OnigErrorInfo* einfo)
6959 {
6960 int r;
6961
6962 *reg = (regex_t* )xmalloc(sizeof(regex_t));
6963 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
6964
6965 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
6966 if (r != 0) {
6967 xfree(*reg);
6968 *reg = NULL;
6969 return r;
6970 }
6971
6972 r = onig_compile(*reg, pattern, pattern_end, einfo);
6973 if (r != 0) {
6974 onig_free(*reg);
6975 *reg = NULL;
6976 }
6977 return r;
6978 }
6979
6980 extern int
onig_initialize(OnigEncoding encodings[],int n)6981 onig_initialize(OnigEncoding encodings[], int n)
6982 {
6983 int i;
6984 int r;
6985
6986 if (onig_inited != 0)
6987 return 0;
6988
6989 onigenc_init();
6990
6991 onig_inited = 1;
6992
6993 for (i = 0; i < n; i++) {
6994 OnigEncoding enc = encodings[i];
6995 r = onig_initialize_encoding(enc);
6996 if (r != 0)
6997 return r;
6998 }
6999
7000 return ONIG_NORMAL;
7001 }
7002
7003 typedef struct EndCallListItem {
7004 struct EndCallListItem* next;
7005 void (*func)(void);
7006 } EndCallListItemType;
7007
7008 static EndCallListItemType* EndCallTop;
7009
onig_add_end_call(void (* func)(void))7010 extern void onig_add_end_call(void (*func)(void))
7011 {
7012 EndCallListItemType* item;
7013
7014 item = (EndCallListItemType* )xmalloc(sizeof(*item));
7015 if (item == 0) return ;
7016
7017 item->next = EndCallTop;
7018 item->func = func;
7019
7020 EndCallTop = item;
7021 }
7022
7023 static void
exec_end_call_list(void)7024 exec_end_call_list(void)
7025 {
7026 EndCallListItemType* prev;
7027 void (*func)(void);
7028
7029 while (EndCallTop != 0) {
7030 func = EndCallTop->func;
7031 (*func)();
7032
7033 prev = EndCallTop;
7034 EndCallTop = EndCallTop->next;
7035 xfree(prev);
7036 }
7037 }
7038
7039 extern int
onig_end(void)7040 onig_end(void)
7041 {
7042 exec_end_call_list();
7043
7044 #ifdef USE_CALLOUT
7045 onig_global_callout_names_free();
7046 #endif
7047
7048 onigenc_end();
7049
7050 onig_inited = 0;
7051
7052 return 0;
7053 }
7054
7055 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)7056 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
7057 {
7058 OnigCodePoint n, *data;
7059 OnigCodePoint low, high, x;
7060
7061 GET_CODE_POINT(n, p);
7062 data = (OnigCodePoint* )p;
7063 data++;
7064
7065 for (low = 0, high = n; low < high; ) {
7066 x = (low + high) >> 1;
7067 if (code > data[x * 2 + 1])
7068 low = x + 1;
7069 else
7070 high = x;
7071 }
7072
7073 return ((low < n && code >= data[low * 2]) ? 1 : 0);
7074 }
7075
7076 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,void * cc_arg)7077 onig_is_code_in_cc_len(int elen, OnigCodePoint code, /* CClassNode* */ void* cc_arg)
7078 {
7079 int found;
7080 CClassNode* cc = (CClassNode* )cc_arg;
7081
7082 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
7083 if (IS_NULL(cc->mbuf)) {
7084 found = 0;
7085 }
7086 else {
7087 found = onig_is_in_code_range(cc->mbuf->p, code) != 0;
7088 }
7089 }
7090 else {
7091 found = BITSET_AT(cc->bs, code) != 0;
7092 }
7093
7094 if (IS_NCCLASS_NOT(cc))
7095 return !found;
7096 else
7097 return found;
7098 }
7099
7100 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)7101 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
7102 {
7103 int len;
7104
7105 if (ONIGENC_MBC_MINLEN(enc) > 1) {
7106 len = 2;
7107 }
7108 else {
7109 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
7110 if (len < 0) return 0;
7111 }
7112 return onig_is_code_in_cc_len(len, code, cc);
7113 }
7114
7115
7116 #ifdef ONIG_DEBUG_PARSE
7117
7118 #ifdef USE_CALL
7119 static void
p_string(FILE * f,int len,UChar * s)7120 p_string(FILE* f, int len, UChar* s)
7121 {
7122 fputs(":", f);
7123 while (len-- > 0) { fputc(*s++, f); }
7124 }
7125 #endif
7126
7127 static void
Indent(FILE * f,int indent)7128 Indent(FILE* f, int indent)
7129 {
7130 int i;
7131 for (i = 0; i < indent; i++) putc(' ', f);
7132 }
7133
7134 static void
print_indent_tree(FILE * f,Node * node,int indent)7135 print_indent_tree(FILE* f, Node* node, int indent)
7136 {
7137 int i;
7138 NodeType type;
7139 UChar* p;
7140 int add = 3;
7141
7142 Indent(f, indent);
7143 if (IS_NULL(node)) {
7144 fprintf(f, "ERROR: null node!!!\n");
7145 exit(0);
7146 }
7147
7148 type = NODE_TYPE(node);
7149 switch (type) {
7150 case NODE_LIST:
7151 case NODE_ALT:
7152 if (type == NODE_LIST)
7153 fprintf(f, "<list:%p>\n", node);
7154 else
7155 fprintf(f, "<alt:%p>\n", node);
7156
7157 print_indent_tree(f, NODE_CAR(node), indent + add);
7158 while (IS_NOT_NULL(node = NODE_CDR(node))) {
7159 if (NODE_TYPE(node) != type) {
7160 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NODE_TYPE(node));
7161 exit(0);
7162 }
7163 print_indent_tree(f, NODE_CAR(node), indent + add);
7164 }
7165 break;
7166
7167 case NODE_STRING:
7168 {
7169 char* str;
7170 char* mode;
7171
7172 if (NODE_STRING_IS_CRUDE(node))
7173 mode = "-crude";
7174 else if (NODE_STRING_IS_CASE_FOLD_MATCH(node))
7175 mode = "-case_fold_match";
7176 else
7177 mode = "";
7178
7179 if (STR_(node)->s == STR_(node)->end)
7180 str = "empty-string";
7181 else
7182 str = "string";
7183
7184 fprintf(f, "<%s%s:%p>", str, mode, node);
7185 for (p = STR_(node)->s; p < STR_(node)->end; p++) {
7186 if (*p >= 0x20 && *p < 0x7f)
7187 fputc(*p, f);
7188 else {
7189 fprintf(f, " 0x%02x", *p);
7190 }
7191 }
7192 }
7193 break;
7194
7195 case NODE_CCLASS:
7196 fprintf(f, "<cclass:%p>", node);
7197 if (IS_NCCLASS_NOT(CCLASS_(node))) fputs(" not", f);
7198 if (CCLASS_(node)->mbuf) {
7199 BBuf* bbuf = CCLASS_(node)->mbuf;
7200 for (i = 0; i < bbuf->used; i++) {
7201 if (i > 0) fprintf(f, ",");
7202 fprintf(f, "%0x", bbuf->p[i]);
7203 }
7204 }
7205 break;
7206
7207 case NODE_CTYPE:
7208 fprintf(f, "<ctype:%p> ", node);
7209 switch (CTYPE_(node)->ctype) {
7210 case CTYPE_ANYCHAR:
7211 fprintf(f, "<anychar:%p>", node);
7212 break;
7213
7214 case ONIGENC_CTYPE_WORD:
7215 if (CTYPE_(node)->not != 0)
7216 fputs("not word", f);
7217 else
7218 fputs("word", f);
7219
7220 if (CTYPE_(node)->ascii_mode != 0)
7221 fputs(" (ascii)", f);
7222
7223 break;
7224
7225 default:
7226 fprintf(f, "ERROR: undefined ctype.\n");
7227 exit(0);
7228 }
7229 break;
7230
7231 case NODE_ANCHOR:
7232 fprintf(f, "<anchor:%p> ", node);
7233 switch (ANCHOR_(node)->type) {
7234 case ANCR_BEGIN_BUF: fputs("begin buf", f); break;
7235 case ANCR_END_BUF: fputs("end buf", f); break;
7236 case ANCR_BEGIN_LINE: fputs("begin line", f); break;
7237 case ANCR_END_LINE: fputs("end line", f); break;
7238 case ANCR_SEMI_END_BUF: fputs("semi end buf", f); break;
7239 case ANCR_BEGIN_POSITION: fputs("begin position", f); break;
7240
7241 case ANCR_WORD_BOUNDARY: fputs("word boundary", f); break;
7242 case ANCR_NO_WORD_BOUNDARY: fputs("not word boundary", f); break;
7243 #ifdef USE_WORD_BEGIN_END
7244 case ANCR_WORD_BEGIN: fputs("word begin", f); break;
7245 case ANCR_WORD_END: fputs("word end", f); break;
7246 #endif
7247 case ANCR_TEXT_SEGMENT_BOUNDARY:
7248 fputs("text-segment boundary", f); break;
7249 case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
7250 fputs("no text-segment boundary", f); break;
7251 case ANCR_PREC_READ:
7252 fprintf(f, "prec read\n");
7253 print_indent_tree(f, NODE_BODY(node), indent + add);
7254 break;
7255 case ANCR_PREC_READ_NOT:
7256 fprintf(f, "prec read not\n");
7257 print_indent_tree(f, NODE_BODY(node), indent + add);
7258 break;
7259 case ANCR_LOOK_BEHIND:
7260 fprintf(f, "look behind\n");
7261 print_indent_tree(f, NODE_BODY(node), indent + add);
7262 break;
7263 case ANCR_LOOK_BEHIND_NOT:
7264 fprintf(f, "look behind not\n");
7265 print_indent_tree(f, NODE_BODY(node), indent + add);
7266 break;
7267
7268 default:
7269 fprintf(f, "ERROR: undefined anchor type.\n");
7270 break;
7271 }
7272 break;
7273
7274 case NODE_BACKREF:
7275 {
7276 int* p;
7277 BackRefNode* br = BACKREF_(node);
7278 p = BACKREFS_P(br);
7279 fprintf(f, "<backref%s:%p>", NODE_IS_CHECKER(node) ? "-checker" : "", node);
7280 for (i = 0; i < br->back_num; i++) {
7281 if (i > 0) fputs(", ", f);
7282 fprintf(f, "%d", p[i]);
7283 }
7284 }
7285 break;
7286
7287 #ifdef USE_CALL
7288 case NODE_CALL:
7289 {
7290 CallNode* cn = CALL_(node);
7291 fprintf(f, "<call:%p>", node);
7292 p_string(f, cn->name_end - cn->name, cn->name);
7293 }
7294 break;
7295 #endif
7296
7297 case NODE_QUANT:
7298 fprintf(f, "<quantifier:%p>{%d,%d}%s\n", node,
7299 QUANT_(node)->lower, QUANT_(node)->upper,
7300 (QUANT_(node)->greedy ? "" : "?"));
7301 print_indent_tree(f, NODE_BODY(node), indent + add);
7302 break;
7303
7304 case NODE_BAG:
7305 fprintf(f, "<bag:%p> ", node);
7306 if (BAG_(node)->type == BAG_IF_ELSE) {
7307 Node* Then;
7308 Node* Else;
7309 BagNode* bn;
7310
7311 bn = BAG_(node);
7312 fprintf(f, "if-else\n");
7313 print_indent_tree(f, NODE_BODY(node), indent + add);
7314
7315 Then = bn->te.Then;
7316 Else = bn->te.Else;
7317 if (IS_NULL(Then)) {
7318 Indent(f, indent + add);
7319 fprintf(f, "THEN empty\n");
7320 }
7321 else
7322 print_indent_tree(f, Then, indent + add);
7323
7324 if (IS_NULL(Else)) {
7325 Indent(f, indent + add);
7326 fprintf(f, "ELSE empty\n");
7327 }
7328 else
7329 print_indent_tree(f, Else, indent + add);
7330
7331 break;
7332 }
7333
7334 switch (BAG_(node)->type) {
7335 case BAG_OPTION:
7336 fprintf(f, "option:%d", BAG_(node)->o.options);
7337 break;
7338 case BAG_MEMORY:
7339 fprintf(f, "memory:%d", BAG_(node)->m.regnum);
7340 break;
7341 case BAG_STOP_BACKTRACK:
7342 fprintf(f, "stop-bt");
7343 break;
7344 default:
7345 break;
7346 }
7347 fprintf(f, "\n");
7348 print_indent_tree(f, NODE_BODY(node), indent + add);
7349 break;
7350
7351 case NODE_GIMMICK:
7352 fprintf(f, "<gimmick:%p> ", node);
7353 switch (GIMMICK_(node)->type) {
7354 case GIMMICK_FAIL:
7355 fprintf(f, "fail");
7356 break;
7357 case GIMMICK_SAVE:
7358 fprintf(f, "save:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
7359 break;
7360 case GIMMICK_UPDATE_VAR:
7361 fprintf(f, "update_var:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
7362 break;
7363 #ifdef USE_CALLOUT
7364 case GIMMICK_CALLOUT:
7365 switch (GIMMICK_(node)->detail_type) {
7366 case ONIG_CALLOUT_OF_CONTENTS:
7367 fprintf(f, "callout:contents:%d", GIMMICK_(node)->num);
7368 break;
7369 case ONIG_CALLOUT_OF_NAME:
7370 fprintf(f, "callout:name:%d:%d", GIMMICK_(node)->id, GIMMICK_(node)->num);
7371 break;
7372 }
7373 #endif
7374 }
7375 break;
7376
7377 default:
7378 fprintf(f, "print_indent_tree: undefined node type %d\n", NODE_TYPE(node));
7379 break;
7380 }
7381
7382 if (type != NODE_LIST && type != NODE_ALT && type != NODE_QUANT &&
7383 type != NODE_BAG)
7384 fprintf(f, "\n");
7385 fflush(f);
7386 }
7387
7388 static void
print_tree(FILE * f,Node * node)7389 print_tree(FILE* f, Node* node)
7390 {
7391 print_indent_tree(f, node, 0);
7392 }
7393 #endif
7394