1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2020 K.Kosako
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31
32 #define OPS_INIT_SIZE 8
33
34 typedef struct {
35 OnigLen min;
36 OnigLen max;
37 } MinMaxLen;
38
39 typedef struct {
40 OnigLen min;
41 OnigLen max;
42 int min_is_sure;
43 } MinMaxCharLen;
44
45 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
46
47 static OnigLen node_min_byte_len(Node* node, ScanEnv* env);
48
49 #if 0
50 typedef struct {
51 int n;
52 int alloc;
53 int* v;
54 } int_stack;
55
56 static int
57 make_int_stack(int_stack** rs, int init_size)
58 {
59 int_stack* s;
60 int* v;
61
62 *rs = 0;
63
64 s = xmalloc(sizeof(*s));
65 if (IS_NULL(s)) return ONIGERR_MEMORY;
66
67 v = (int* )xmalloc(sizeof(int) * init_size);
68 if (IS_NULL(v)) {
69 xfree(s);
70 return ONIGERR_MEMORY;
71 }
72
73 s->n = 0;
74 s->alloc = init_size;
75 s->v = v;
76
77 *rs = s;
78 return ONIG_NORMAL;
79 }
80
81 static void
82 free_int_stack(int_stack* s)
83 {
84 if (IS_NOT_NULL(s)) {
85 if (IS_NOT_NULL(s->v))
86 xfree(s->v);
87 xfree(s);
88 }
89 }
90
91 static int
92 int_stack_push(int_stack* s, int v)
93 {
94 if (s->n >= s->alloc) {
95 int new_size = s->alloc * 2;
96 int* nv = (int* )xrealloc(s->v, sizeof(int) * new_size);
97 if (IS_NULL(nv)) return ONIGERR_MEMORY;
98
99 s->alloc = new_size;
100 s->v = nv;
101 }
102
103 s->v[s->n] = v;
104 s->n++;
105 return ONIG_NORMAL;
106 }
107
108 static int
109 int_stack_pop(int_stack* s)
110 {
111 int v;
112
113 #ifdef ONIG_DEBUG
114 if (s->n <= 0) {
115 fprintf(DBGFP, "int_stack_pop: fail empty. %p\n", s);
116 return 0;
117 }
118 #endif
119
120 v = s->v[s->n];
121 s->n--;
122 return v;
123 }
124 #endif
125
126 static int
ops_init(regex_t * reg,int init_alloc_size)127 ops_init(regex_t* reg, int init_alloc_size)
128 {
129 Operation* p;
130 size_t size;
131
132 if (init_alloc_size > 0) {
133 size = sizeof(Operation) * init_alloc_size;
134 p = (Operation* )xrealloc(reg->ops, size);
135 CHECK_NULL_RETURN_MEMERR(p);
136 #ifdef USE_DIRECT_THREADED_CODE
137 {
138 enum OpCode* cp;
139 size = sizeof(enum OpCode) * init_alloc_size;
140 cp = (enum OpCode* )xrealloc(reg->ocs, size);
141 CHECK_NULL_RETURN_MEMERR(cp);
142 reg->ocs = cp;
143 }
144 #endif
145 }
146 else {
147 p = (Operation* )0;
148 #ifdef USE_DIRECT_THREADED_CODE
149 reg->ocs = (enum OpCode* )0;
150 #endif
151 }
152
153 reg->ops = p;
154 reg->ops_curr = 0; /* !!! not yet done ops_new() */
155 reg->ops_alloc = init_alloc_size;
156 reg->ops_used = 0;
157
158 return ONIG_NORMAL;
159 }
160
161 static int
ops_expand(regex_t * reg,int n)162 ops_expand(regex_t* reg, int n)
163 {
164 #define MIN_OPS_EXPAND_SIZE 4
165
166 #ifdef USE_DIRECT_THREADED_CODE
167 enum OpCode* cp;
168 #endif
169 Operation* p;
170 size_t size;
171
172 if (n <= 0) n = MIN_OPS_EXPAND_SIZE;
173
174 n += reg->ops_alloc;
175
176 size = sizeof(Operation) * n;
177 p = (Operation* )xrealloc(reg->ops, size);
178 CHECK_NULL_RETURN_MEMERR(p);
179
180 #ifdef USE_DIRECT_THREADED_CODE
181 size = sizeof(enum OpCode) * n;
182 cp = (enum OpCode* )xrealloc(reg->ocs, size);
183 CHECK_NULL_RETURN_MEMERR(cp);
184 reg->ocs = cp;
185 #endif
186
187 reg->ops = p;
188 reg->ops_alloc = n;
189 if (reg->ops_used == 0)
190 reg->ops_curr = 0;
191 else
192 reg->ops_curr = reg->ops + (reg->ops_used - 1);
193
194 return ONIG_NORMAL;
195 }
196
197 static int
ops_new(regex_t * reg)198 ops_new(regex_t* reg)
199 {
200 int r;
201
202 if (reg->ops_used >= reg->ops_alloc) {
203 r = ops_expand(reg, reg->ops_alloc);
204 if (r != ONIG_NORMAL) return r;
205 }
206
207 reg->ops_curr = reg->ops + reg->ops_used;
208 reg->ops_used++;
209
210 xmemset(reg->ops_curr, 0, sizeof(Operation));
211 return ONIG_NORMAL;
212 }
213
214 static int
is_in_string_pool(regex_t * reg,UChar * s)215 is_in_string_pool(regex_t* reg, UChar* s)
216 {
217 return (s >= reg->string_pool && s < reg->string_pool_end);
218 }
219
220 static void
ops_free(regex_t * reg)221 ops_free(regex_t* reg)
222 {
223 int i;
224
225 if (IS_NULL(reg->ops)) return ;
226
227 for (i = 0; i < (int )reg->ops_used; i++) {
228 enum OpCode opcode;
229 Operation* op;
230
231 op = reg->ops + i;
232
233 #ifdef USE_DIRECT_THREADED_CODE
234 opcode = *(reg->ocs + i);
235 #else
236 opcode = op->opcode;
237 #endif
238
239 switch (opcode) {
240 case OP_STR_MBN:
241 if (! is_in_string_pool(reg, op->exact_len_n.s))
242 xfree(op->exact_len_n.s);
243 break;
244 case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N:
245 if (! is_in_string_pool(reg, op->exact_n.s))
246 xfree(op->exact_n.s);
247 break;
248 case OP_STR_1: case OP_STR_2: case OP_STR_3: case OP_STR_4:
249 case OP_STR_5: case OP_STR_MB2N1: case OP_STR_MB2N2:
250 case OP_STR_MB2N3:
251 break;
252
253 case OP_CCLASS_NOT: case OP_CCLASS:
254 xfree(op->cclass.bsp);
255 break;
256
257 case OP_CCLASS_MB_NOT: case OP_CCLASS_MB:
258 xfree(op->cclass_mb.mb);
259 break;
260 case OP_CCLASS_MIX_NOT: case OP_CCLASS_MIX:
261 xfree(op->cclass_mix.mb);
262 xfree(op->cclass_mix.bsp);
263 break;
264
265 case OP_BACKREF1: case OP_BACKREF2: case OP_BACKREF_N: case OP_BACKREF_N_IC:
266 break;
267 case OP_BACKREF_MULTI: case OP_BACKREF_MULTI_IC:
268 case OP_BACKREF_WITH_LEVEL:
269 case OP_BACKREF_WITH_LEVEL_IC:
270 case OP_BACKREF_CHECK:
271 case OP_BACKREF_CHECK_WITH_LEVEL:
272 if (op->backref_general.num != 1)
273 xfree(op->backref_general.ns);
274 break;
275
276 default:
277 break;
278 }
279 }
280
281 xfree(reg->ops);
282 #ifdef USE_DIRECT_THREADED_CODE
283 xfree(reg->ocs);
284 reg->ocs = 0;
285 #endif
286
287 reg->ops = 0;
288 reg->ops_curr = 0;
289 reg->ops_alloc = 0;
290 reg->ops_used = 0;
291 }
292
293 static int
ops_calc_size_of_string_pool(regex_t * reg)294 ops_calc_size_of_string_pool(regex_t* reg)
295 {
296 int i;
297 int total;
298
299 if (IS_NULL(reg->ops)) return 0;
300
301 total = 0;
302 for (i = 0; i < (int )reg->ops_used; i++) {
303 enum OpCode opcode;
304 Operation* op;
305
306 op = reg->ops + i;
307 #ifdef USE_DIRECT_THREADED_CODE
308 opcode = *(reg->ocs + i);
309 #else
310 opcode = op->opcode;
311 #endif
312
313 switch (opcode) {
314 case OP_STR_MBN:
315 total += op->exact_len_n.len * op->exact_len_n.n;
316 break;
317 case OP_STR_N:
318 case OP_STR_MB2N:
319 total += op->exact_n.n * 2;
320 break;
321 case OP_STR_MB3N:
322 total += op->exact_n.n * 3;
323 break;
324
325 default:
326 break;
327 }
328 }
329
330 return total;
331 }
332
333 static int
ops_make_string_pool(regex_t * reg)334 ops_make_string_pool(regex_t* reg)
335 {
336 int i;
337 int len;
338 int size;
339 UChar* pool;
340 UChar* curr;
341
342 size = ops_calc_size_of_string_pool(reg);
343 if (size <= 0) {
344 return 0;
345 }
346
347 curr = pool = (UChar* )xmalloc((size_t )size);
348 CHECK_NULL_RETURN_MEMERR(pool);
349
350 for (i = 0; i < (int )reg->ops_used; i++) {
351 enum OpCode opcode;
352 Operation* op;
353
354 op = reg->ops + i;
355 #ifdef USE_DIRECT_THREADED_CODE
356 opcode = *(reg->ocs + i);
357 #else
358 opcode = op->opcode;
359 #endif
360
361 switch (opcode) {
362 case OP_STR_MBN:
363 len = op->exact_len_n.len * op->exact_len_n.n;
364 xmemcpy(curr, op->exact_len_n.s, len);
365 xfree(op->exact_len_n.s);
366 op->exact_len_n.s = curr;
367 curr += len;
368 break;
369 case OP_STR_N:
370 len = op->exact_n.n;
371 copy:
372 xmemcpy(curr, op->exact_n.s, len);
373 xfree(op->exact_n.s);
374 op->exact_n.s = curr;
375 curr += len;
376 break;
377 case OP_STR_MB2N:
378 len = op->exact_n.n * 2;
379 goto copy;
380 break;
381 case OP_STR_MB3N:
382 len = op->exact_n.n * 3;
383 goto copy;
384 break;
385
386 default:
387 break;
388 }
389 }
390
391 reg->string_pool = pool;
392 reg->string_pool_end = pool + size;
393 return 0;
394 }
395
396 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)397 onig_get_default_case_fold_flag(void)
398 {
399 return OnigDefaultCaseFoldFlag;
400 }
401
402 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)403 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
404 {
405 OnigDefaultCaseFoldFlag = case_fold_flag;
406 return 0;
407 }
408
409 static int
len_multiply_cmp(OnigLen x,int y,OnigLen v)410 len_multiply_cmp(OnigLen x, int y, OnigLen v)
411 {
412 if (x == 0 || y == 0) return -1;
413
414 if (x < INFINITE_LEN / y) {
415 OnigLen xy = x * (OnigLen )y;
416 if (xy > v) return 1;
417 else {
418 if (xy == v) return 0;
419 else return -1;
420 }
421 }
422 else
423 return v == INFINITE_LEN ? 0 : 1;
424 }
425
426 extern int
onig_positive_int_multiply(int x,int y)427 onig_positive_int_multiply(int x, int y)
428 {
429 if (x == 0 || y == 0) return 0;
430
431 if (x < ONIG_INT_MAX / y)
432 return x * y;
433 else
434 return -1;
435 }
436
437
438 static void
node_swap(Node * a,Node * b)439 node_swap(Node* a, Node* b)
440 {
441 Node c;
442
443 c = *a; *a = *b; *b = c;
444
445 if (NODE_TYPE(a) == NODE_STRING) {
446 StrNode* sn = STR_(a);
447 if (sn->capacity == 0) {
448 int len = (int )(sn->end - sn->s);
449 sn->s = sn->buf;
450 sn->end = sn->s + len;
451 }
452 }
453
454 if (NODE_TYPE(b) == NODE_STRING) {
455 StrNode* sn = STR_(b);
456 if (sn->capacity == 0) {
457 int len = (int )(sn->end - sn->s);
458 sn->s = sn->buf;
459 sn->end = sn->s + len;
460 }
461 }
462 }
463
464 static int
node_list_len(Node * list)465 node_list_len(Node* list)
466 {
467 int len;
468
469 len = 1;
470 while (IS_NOT_NULL(NODE_CDR(list))) {
471 list = NODE_CDR(list);
472 len++;
473 }
474
475 return len;
476 }
477
478 static Node*
node_list_add(Node * list,Node * x)479 node_list_add(Node* list, Node* x)
480 {
481 Node *n;
482
483 n = onig_node_new_list(x, NULL);
484 if (IS_NULL(n)) return NULL_NODE;
485
486 if (IS_NOT_NULL(list)) {
487 while (IS_NOT_NULL(NODE_CDR(list)))
488 list = NODE_CDR(list);
489
490 NODE_CDR(list) = n;
491 }
492
493 return n;
494 }
495
496 static int
node_str_node_cat(Node * node,Node * add)497 node_str_node_cat(Node* node, Node* add)
498 {
499 int r;
500
501 if (NODE_STATUS(node) != NODE_STATUS(add))
502 return ONIGERR_TYPE_BUG;
503
504 if (STR_(node)->flag != STR_(add)->flag)
505 return ONIGERR_TYPE_BUG;
506
507 r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end);
508 if (r != 0) return r;
509
510 return 0;
511 }
512
513 static void
node_conv_to_str_node(Node * node,Node * ref_node)514 node_conv_to_str_node(Node* node, Node* ref_node)
515 {
516 xmemset(node, 0, sizeof(*node));
517 NODE_SET_TYPE(node, NODE_STRING);
518 NODE_STATUS(node) = NODE_STATUS(ref_node);
519
520 STR_(node)->flag = STR_(ref_node)->flag;
521 STR_(node)->s = STR_(node)->buf;
522 STR_(node)->end = STR_(node)->buf;
523 STR_(node)->capacity = 0;
524 }
525
526 static OnigLen
distance_add(OnigLen d1,OnigLen d2)527 distance_add(OnigLen d1, OnigLen d2)
528 {
529 if (d1 == INFINITE_LEN || d2 == INFINITE_LEN)
530 return INFINITE_LEN;
531 else {
532 if (d1 <= INFINITE_LEN - d2) return d1 + d2;
533 else return INFINITE_LEN;
534 }
535 }
536
537 static OnigLen
distance_multiply(OnigLen d,int m)538 distance_multiply(OnigLen d, int m)
539 {
540 if (m == 0) return 0;
541
542 if (d < INFINITE_LEN / m)
543 return d * m;
544 else
545 return INFINITE_LEN;
546 }
547
548 static int
bitset_is_empty(BitSetRef bs)549 bitset_is_empty(BitSetRef bs)
550 {
551 int i;
552
553 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) {
554 if (bs[i] != 0) return 0;
555 }
556 return 1;
557 }
558
559 #ifdef USE_CALL
560
561 static int
unset_addr_list_init(UnsetAddrList * list,int size)562 unset_addr_list_init(UnsetAddrList* list, int size)
563 {
564 UnsetAddr* p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
565 CHECK_NULL_RETURN_MEMERR(p);
566
567 list->num = 0;
568 list->alloc = size;
569 list->us = p;
570 return 0;
571 }
572
573 static void
unset_addr_list_end(UnsetAddrList * list)574 unset_addr_list_end(UnsetAddrList* list)
575 {
576 if (IS_NOT_NULL(list->us))
577 xfree(list->us);
578 }
579
580 static int
unset_addr_list_add(UnsetAddrList * list,int offset,struct _Node * node)581 unset_addr_list_add(UnsetAddrList* list, int offset, struct _Node* node)
582 {
583 UnsetAddr* p;
584 int size;
585
586 if (list->num >= list->alloc) {
587 size = list->alloc * 2;
588 p = (UnsetAddr* )xrealloc(list->us, sizeof(UnsetAddr) * size);
589 CHECK_NULL_RETURN_MEMERR(p);
590 list->alloc = size;
591 list->us = p;
592 }
593
594 list->us[list->num].offset = offset;
595 list->us[list->num].target = node;
596 list->num++;
597 return 0;
598 }
599 #endif /* USE_CALL */
600
601 enum CharLenReturnType {
602 CHAR_LEN_NORMAL = 0, /* fixed or variable */
603 CHAR_LEN_TOP_ALT_FIXED = 1
604 };
605
606 static int
mmcl_fixed(MinMaxCharLen * c)607 mmcl_fixed(MinMaxCharLen* c)
608 {
609 return (c->min == c->max && c->min != INFINITE_LEN);
610 }
611
612 static void
mmcl_set(MinMaxCharLen * l,OnigLen len)613 mmcl_set(MinMaxCharLen* l, OnigLen len)
614 {
615 l->min = len;
616 l->max = len;
617 l->min_is_sure = TRUE;
618 }
619
620 static void
mmcl_set_min_max(MinMaxCharLen * l,OnigLen min,OnigLen max,int min_is_sure)621 mmcl_set_min_max(MinMaxCharLen* l, OnigLen min, OnigLen max, int min_is_sure)
622 {
623 l->min = min;
624 l->max = max;
625 l->min_is_sure = min_is_sure;
626 }
627
628 static void
mmcl_add(MinMaxCharLen * to,MinMaxCharLen * add)629 mmcl_add(MinMaxCharLen* to, MinMaxCharLen* add)
630 {
631 to->min = distance_add(to->min, add->min);
632 to->max = distance_add(to->max, add->max);
633
634 to->min_is_sure = add->min_is_sure != 0 && to->min_is_sure != 0;
635 }
636
637 static void
mmcl_multiply(MinMaxCharLen * to,int m)638 mmcl_multiply(MinMaxCharLen* to, int m)
639 {
640 to->min = distance_multiply(to->min, m);
641 to->max = distance_multiply(to->max, m);
642 }
643
644 static void
mmcl_repeat_range_multiply(MinMaxCharLen * to,int mlow,int mhigh)645 mmcl_repeat_range_multiply(MinMaxCharLen* to, int mlow, int mhigh)
646 {
647 to->min = distance_multiply(to->min, mlow);
648
649 if (IS_INFINITE_REPEAT(mhigh))
650 to->max = INFINITE_LEN;
651 else
652 to->max = distance_multiply(to->max, mhigh);
653 }
654
655 static void
mmcl_alt_merge(MinMaxCharLen * to,MinMaxCharLen * alt)656 mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt)
657 {
658 if (to->min > alt->min) {
659 to->min = alt->min;
660 if (alt->min_is_sure != 0)
661 to->min_is_sure = TRUE;
662 }
663
664 if (to->max < alt->max) to->max = alt->max;
665 }
666
667 static int
mml_is_equal(MinMaxLen * a,MinMaxLen * b)668 mml_is_equal(MinMaxLen* a, MinMaxLen* b)
669 {
670 return a->min == b->min && a->max == b->max;
671 }
672
673 static void
mml_set_min_max(MinMaxLen * l,OnigLen min,OnigLen max)674 mml_set_min_max(MinMaxLen* l, OnigLen min, OnigLen max)
675 {
676 l->min = min;
677 l->max = max;
678 }
679
680 static void
mml_clear(MinMaxLen * l)681 mml_clear(MinMaxLen* l)
682 {
683 l->min = l->max = 0;
684 }
685
686 static void
mml_copy(MinMaxLen * to,MinMaxLen * from)687 mml_copy(MinMaxLen* to, MinMaxLen* from)
688 {
689 to->min = from->min;
690 to->max = from->max;
691 }
692
693 static void
mml_add(MinMaxLen * to,MinMaxLen * add)694 mml_add(MinMaxLen* to, MinMaxLen* add)
695 {
696 to->min = distance_add(to->min, add->min);
697 to->max = distance_add(to->max, add->max);
698 }
699
700 static void
mml_alt_merge(MinMaxLen * to,MinMaxLen * alt)701 mml_alt_merge(MinMaxLen* to, MinMaxLen* alt)
702 {
703 if (to->min > alt->min) to->min = alt->min;
704 if (to->max < alt->max) to->max = alt->max;
705 }
706
707 /* fixed size pattern node only */
708 static int
node_char_len1(Node * node,regex_t * reg,MinMaxCharLen * ci,ScanEnv * env,int level)709 node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env,
710 int level)
711 {
712 MinMaxCharLen tci;
713 int r = CHAR_LEN_NORMAL;
714
715 level++;
716
717 switch (NODE_TYPE(node)) {
718 case NODE_LIST:
719 {
720 int first = TRUE;
721 do {
722 r = node_char_len1(NODE_CAR(node), reg, &tci, env, level);
723 if (r < 0) break;
724 if (first == TRUE) {
725 *ci = tci;
726 first = FALSE;
727 }
728 else
729 mmcl_add(ci, &tci);
730 } while (IS_NOT_NULL(node = NODE_CDR(node)));
731 }
732 break;
733
734 case NODE_ALT:
735 {
736 int fixed;
737
738 r = node_char_len1(NODE_CAR(node), reg, ci, env, level);
739 if (r < 0) break;
740
741 fixed = TRUE;
742 while (IS_NOT_NULL(node = NODE_CDR(node))) {
743 r = node_char_len1(NODE_CAR(node), reg, &tci, env, level);
744 if (r < 0) break;
745 if (! mmcl_fixed(&tci))
746 fixed = FALSE;
747 mmcl_alt_merge(ci, &tci);
748 }
749 if (r < 0) break;
750
751 r = CHAR_LEN_NORMAL;
752 if (mmcl_fixed(ci)) break;
753
754 if (fixed == TRUE && level == 1) {
755 r = CHAR_LEN_TOP_ALT_FIXED;
756 }
757 }
758 break;
759
760 case NODE_STRING:
761 {
762 OnigLen clen;
763 StrNode* sn = STR_(node);
764 UChar *s = sn->s;
765
766 if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) {
767 /* Such a case is possible.
768 ex. /(?i)(?<=\1)(a)/
769 Backref node refer to capture group, but it doesn't tune yet.
770 */
771 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
772 break;
773 }
774
775 clen = 0;
776 while (s < sn->end) {
777 s += enclen(reg->enc, s);
778 clen = distance_add(clen, 1);
779 }
780 mmcl_set(ci, clen);
781 }
782 break;
783
784 case NODE_QUANT:
785 {
786 QuantNode* qn = QUANT_(node);
787
788 if (qn->lower == qn->upper) {
789 if (qn->upper == 0) {
790 mmcl_set(ci, 0);
791 }
792 else {
793 r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
794 if (r < 0) break;
795 mmcl_multiply(ci, qn->lower);
796 }
797 }
798 else {
799 r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
800 if (r < 0) break;
801 mmcl_repeat_range_multiply(ci, qn->lower, qn->upper);
802 }
803 }
804 break;
805
806 #ifdef USE_CALL
807 case NODE_CALL:
808 if (NODE_IS_RECURSION(node))
809 mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE);
810 else
811 r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
812 break;
813 #endif
814
815 case NODE_CTYPE:
816 case NODE_CCLASS:
817 mmcl_set(ci, 1);
818 break;
819
820 case NODE_BAG:
821 {
822 BagNode* en = BAG_(node);
823
824 switch (en->type) {
825 case BAG_MEMORY:
826 if (NODE_IS_FIXED_CLEN(node)) {
827 mmcl_set_min_max(ci, en->min_char_len, en->max_char_len,
828 NODE_IS_FIXED_CLEN_MIN_SURE(node));
829 }
830 else {
831 r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
832 if (r < 0) break;
833
834 en->min_char_len = ci->min;
835 en->max_char_len = ci->max;
836 NODE_STATUS_ADD(node, FIXED_CLEN);
837 if (ci->min_is_sure != 0)
838 NODE_STATUS_ADD(node, FIXED_CLEN_MIN_SURE);
839 }
840 /* can't optimize look-behind if capture exists. */
841 ci->min_is_sure = FALSE;
842 break;
843 case BAG_OPTION:
844 case BAG_STOP_BACKTRACK:
845 r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
846 break;
847 case BAG_IF_ELSE:
848 {
849 MinMaxCharLen eci;
850
851 r = node_char_len1(NODE_BODY(node), reg, ci, env, level);
852 if (r < 0) break;
853
854 if (IS_NOT_NULL(en->te.Then)) {
855 r = node_char_len1(en->te.Then, reg, &tci, env, level);
856 if (r < 0) break;
857 mmcl_add(ci, &tci);
858 }
859
860 if (IS_NOT_NULL(en->te.Else)) {
861 r = node_char_len1(en->te.Else, reg, &eci, env, level);
862 if (r < 0) break;
863 }
864 else {
865 mmcl_set(&eci, 0);
866 }
867
868 mmcl_alt_merge(ci, &eci);
869 }
870 break;
871 default: /* never come here */
872 r = ONIGERR_PARSER_BUG;
873 break;
874 }
875 }
876 break;
877
878 case NODE_ANCHOR:
879 mmcl_set(ci, 0);
880 /* can't optimize look-behind if anchor exists. */
881 ci->min_is_sure = FALSE;
882 break;
883
884 case NODE_GIMMICK:
885 zero:
886 mmcl_set(ci, 0);
887 break;
888
889 case NODE_BACKREF:
890 if (NODE_IS_CHECKER(node))
891 goto zero;
892
893 if (NODE_IS_RECURSION(node)) {
894 #ifdef USE_BACKREF_WITH_LEVEL
895 if (NODE_IS_NEST_LEVEL(node)) {
896 mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE);
897 break;
898 }
899 #endif
900
901 mmcl_set_min_max(ci, 0, 0, FALSE);
902 break;
903 }
904
905 {
906 int i;
907 int* backs;
908 MemEnv* mem_env = SCANENV_MEMENV(env);
909 BackRefNode* br = BACKREF_(node);
910
911 backs = BACKREFS_P(br);
912 r = node_char_len1(mem_env[backs[0]].mem_node, reg, ci, env, level);
913 if (r < 0) break;
914 if (! mmcl_fixed(ci)) ci->min_is_sure = FALSE;
915
916 for (i = 1; i < br->back_num; i++) {
917 r = node_char_len1(mem_env[backs[i]].mem_node, reg, &tci, env, level);
918 if (r < 0) break;
919 if (! mmcl_fixed(&tci)) tci.min_is_sure = FALSE;
920 mmcl_alt_merge(ci, &tci);
921 }
922 }
923 break;
924
925 default: /* never come here */
926 r = ONIGERR_PARSER_BUG;
927 break;
928 }
929
930 return r;
931 }
932
933 static int
node_char_len(Node * node,regex_t * reg,MinMaxCharLen * ci,ScanEnv * env)934 node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env)
935 {
936 return node_char_len1(node, reg, ci, env, 0);
937 }
938
939
940 static int
add_op(regex_t * reg,int opcode)941 add_op(regex_t* reg, int opcode)
942 {
943 int r;
944
945 r = ops_new(reg);
946 if (r != ONIG_NORMAL) return r;
947
948 #ifdef USE_DIRECT_THREADED_CODE
949 *(reg->ocs + (reg->ops_curr - reg->ops)) = opcode;
950 #else
951 reg->ops_curr->opcode = opcode;
952 #endif
953
954 return 0;
955 }
956
957 static int compile_length_tree(Node* node, regex_t* reg);
958 static int compile_tree(Node* node, regex_t* reg, ScanEnv* env);
959
960
961 #define IS_NEED_STR_LEN_OP(op) \
962 ((op) == OP_STR_N || (op) == OP_STR_MB2N ||\
963 (op) == OP_STR_MB3N || (op) == OP_STR_MBN)
964
965 static int
select_str_opcode(int mb_len,int str_len)966 select_str_opcode(int mb_len, int str_len)
967 {
968 int op;
969
970 switch (mb_len) {
971 case 1:
972 switch (str_len) {
973 case 1: op = OP_STR_1; break;
974 case 2: op = OP_STR_2; break;
975 case 3: op = OP_STR_3; break;
976 case 4: op = OP_STR_4; break;
977 case 5: op = OP_STR_5; break;
978 default: op = OP_STR_N; break;
979 }
980 break;
981
982 case 2:
983 switch (str_len) {
984 case 1: op = OP_STR_MB2N1; break;
985 case 2: op = OP_STR_MB2N2; break;
986 case 3: op = OP_STR_MB2N3; break;
987 default: op = OP_STR_MB2N; break;
988 }
989 break;
990
991 case 3:
992 op = OP_STR_MB3N;
993 break;
994
995 default:
996 op = OP_STR_MBN;
997 break;
998 }
999
1000 return op;
1001 }
1002
1003 static int
is_strict_real_node(Node * node)1004 is_strict_real_node(Node* node)
1005 {
1006 switch (NODE_TYPE(node)) {
1007 case NODE_STRING:
1008 {
1009 StrNode* sn = STR_(node);
1010 return (sn->end != sn->s);
1011 }
1012 break;
1013
1014 case NODE_CCLASS:
1015 case NODE_CTYPE:
1016 return 1;
1017 break;
1018
1019 default:
1020 return 0;
1021 break;
1022 }
1023 }
1024
1025 static int
compile_quant_body_with_empty_check(QuantNode * qn,regex_t * reg,ScanEnv * env)1026 compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env)
1027 {
1028 int r;
1029 int saved_num_empty_check;
1030 int emptiness;
1031 Node* body;
1032
1033 body = NODE_BODY((Node* )qn);
1034 emptiness = qn->emptiness;
1035 saved_num_empty_check = reg->num_empty_check;
1036
1037 if (emptiness != BODY_IS_NOT_EMPTY) {
1038 r = add_op(reg, OP_EMPTY_CHECK_START);
1039 if (r != 0) return r;
1040 COP(reg)->empty_check_start.mem = reg->num_empty_check; /* NULL CHECK ID */
1041 reg->num_empty_check++;
1042 }
1043
1044 r = compile_tree(body, reg, env);
1045 if (r != 0) return r;
1046
1047 if (emptiness != BODY_IS_NOT_EMPTY) {
1048 if (emptiness == BODY_MAY_BE_EMPTY)
1049 r = add_op(reg, OP_EMPTY_CHECK_END);
1050 else if (emptiness == BODY_MAY_BE_EMPTY_MEM) {
1051 if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0)
1052 r = add_op(reg, OP_EMPTY_CHECK_END_MEMST);
1053 else
1054 r = add_op(reg, OP_EMPTY_CHECK_END);
1055 }
1056 #ifdef USE_CALL
1057 else if (emptiness == BODY_MAY_BE_EMPTY_REC)
1058 r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH);
1059 #endif
1060
1061 if (r != 0) return r;
1062 COP(reg)->empty_check_end.mem = saved_num_empty_check; /* NULL CHECK ID */
1063 }
1064 return r;
1065 }
1066
1067 #ifdef USE_CALL
1068 static int
compile_call(CallNode * node,regex_t * reg,ScanEnv * env)1069 compile_call(CallNode* node, regex_t* reg, ScanEnv* env)
1070 {
1071 int r;
1072 int offset;
1073
1074 r = add_op(reg, OP_CALL);
1075 if (r != 0) return r;
1076
1077 COP(reg)->call.addr = 0; /* dummy addr. */
1078
1079 offset = COP_CURR_OFFSET_BYTES(reg, call.addr);
1080 r = unset_addr_list_add(env->unset_addr_list, offset, NODE_CALL_BODY(node));
1081 return r;
1082 }
1083 #endif
1084
1085 static int
compile_tree_n_times(Node * node,int n,regex_t * reg,ScanEnv * env)1086 compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env)
1087 {
1088 int i, r;
1089
1090 for (i = 0; i < n; i++) {
1091 r = compile_tree(node, reg, env);
1092 if (r != 0) return r;
1093 }
1094 return 0;
1095 }
1096
1097 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED)1098 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
1099 regex_t* reg ARG_UNUSED)
1100 {
1101 return 1;
1102 }
1103
1104 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg)1105 add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg)
1106 {
1107 int op;
1108 int r;
1109 int byte_len;
1110 UChar* p;
1111 UChar* end;
1112
1113 op = select_str_opcode(mb_len, str_len);
1114 r = add_op(reg, op);
1115 if (r != 0) return r;
1116
1117 byte_len = mb_len * str_len;
1118 end = s + byte_len;
1119
1120 if (op == OP_STR_MBN) {
1121 p = onigenc_strdup(reg->enc, s, end);
1122 CHECK_NULL_RETURN_MEMERR(p);
1123
1124 COP(reg)->exact_len_n.len = mb_len;
1125 COP(reg)->exact_len_n.n = str_len;
1126 COP(reg)->exact_len_n.s = p;
1127 }
1128 else if (IS_NEED_STR_LEN_OP(op)) {
1129 p = onigenc_strdup(reg->enc, s, end);
1130 CHECK_NULL_RETURN_MEMERR(p);
1131 COP(reg)->exact_n.n = str_len;
1132 COP(reg)->exact_n.s = p;
1133 }
1134 else {
1135 xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s));
1136 xmemcpy(COP(reg)->exact.s, s, (size_t )byte_len);
1137 }
1138
1139 return 0;
1140 }
1141
1142 static int
compile_length_string_node(Node * node,regex_t * reg)1143 compile_length_string_node(Node* node, regex_t* reg)
1144 {
1145 int rlen, r, len, prev_len, slen;
1146 UChar *p, *prev;
1147 StrNode* sn;
1148 OnigEncoding enc = reg->enc;
1149
1150 sn = STR_(node);
1151 if (sn->end <= sn->s)
1152 return 0;
1153
1154 p = prev = sn->s;
1155 prev_len = enclen(enc, p);
1156 p += prev_len;
1157 slen = 1;
1158 rlen = 0;
1159
1160 for (; p < sn->end; ) {
1161 len = enclen(enc, p);
1162 if (len == prev_len) {
1163 slen++;
1164 }
1165 else {
1166 r = add_compile_string_length(prev, prev_len, slen, reg);
1167 rlen += r;
1168 prev = p;
1169 slen = 1;
1170 prev_len = len;
1171 }
1172 p += len;
1173 }
1174
1175 r = add_compile_string_length(prev, prev_len, slen, reg);
1176 rlen += r;
1177 return rlen;
1178 }
1179
1180 static int
compile_length_string_crude_node(StrNode * sn,regex_t * reg)1181 compile_length_string_crude_node(StrNode* sn, regex_t* reg)
1182 {
1183 if (sn->end <= sn->s)
1184 return 0;
1185
1186 return add_compile_string_length(sn->s, 1 /* sb */, (int )(sn->end - sn->s),
1187 reg);
1188 }
1189
1190 static int
compile_string_node(Node * node,regex_t * reg)1191 compile_string_node(Node* node, regex_t* reg)
1192 {
1193 int r, len, prev_len, slen;
1194 UChar *p, *prev, *end;
1195 StrNode* sn;
1196 OnigEncoding enc = reg->enc;
1197
1198 sn = STR_(node);
1199 if (sn->end <= sn->s)
1200 return 0;
1201
1202 end = sn->end;
1203
1204 p = prev = sn->s;
1205 prev_len = enclen(enc, p);
1206 p += prev_len;
1207 slen = 1;
1208
1209 for (; p < end; ) {
1210 len = enclen(enc, p);
1211 if (len == prev_len) {
1212 slen++;
1213 }
1214 else {
1215 r = add_compile_string(prev, prev_len, slen, reg);
1216 if (r != 0) return r;
1217
1218 prev = p;
1219 slen = 1;
1220 prev_len = len;
1221 }
1222
1223 p += len;
1224 }
1225
1226 return add_compile_string(prev, prev_len, slen, reg);
1227 }
1228
1229 static int
compile_string_crude_node(StrNode * sn,regex_t * reg)1230 compile_string_crude_node(StrNode* sn, regex_t* reg)
1231 {
1232 if (sn->end <= sn->s)
1233 return 0;
1234
1235 return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg);
1236 }
1237
1238 static void*
set_multi_byte_cclass(BBuf * mbuf,regex_t * reg)1239 set_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
1240 {
1241 size_t len;
1242 void* p;
1243
1244 len = (size_t )mbuf->used;
1245 p = xmalloc(len);
1246 if (IS_NULL(p)) return NULL;
1247
1248 xmemcpy(p, mbuf->p, len);
1249 return p;
1250 }
1251
1252 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)1253 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
1254 {
1255 return 1;
1256 }
1257
1258 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)1259 compile_cclass_node(CClassNode* cc, regex_t* reg)
1260 {
1261 int r;
1262
1263 if (IS_NULL(cc->mbuf)) {
1264 r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_NOT : OP_CCLASS);
1265 if (r != 0) return r;
1266
1267 COP(reg)->cclass.bsp = xmalloc(SIZE_BITSET);
1268 CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass.bsp);
1269 xmemcpy(COP(reg)->cclass.bsp, cc->bs, SIZE_BITSET);
1270 }
1271 else {
1272 void* p;
1273
1274 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
1275 r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MB_NOT : OP_CCLASS_MB);
1276 if (r != 0) return r;
1277
1278 p = set_multi_byte_cclass(cc->mbuf, reg);
1279 CHECK_NULL_RETURN_MEMERR(p);
1280 COP(reg)->cclass_mb.mb = p;
1281 }
1282 else {
1283 r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MIX_NOT : OP_CCLASS_MIX);
1284 if (r != 0) return r;
1285
1286 COP(reg)->cclass_mix.bsp = xmalloc(SIZE_BITSET);
1287 CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass_mix.bsp);
1288 xmemcpy(COP(reg)->cclass_mix.bsp, cc->bs, SIZE_BITSET);
1289
1290 p = set_multi_byte_cclass(cc->mbuf, reg);
1291 CHECK_NULL_RETURN_MEMERR(p);
1292 COP(reg)->cclass_mix.mb = p;
1293 }
1294 }
1295
1296 return 0;
1297 }
1298
1299 static void
set_addr_in_repeat_range(regex_t * reg)1300 set_addr_in_repeat_range(regex_t* reg)
1301 {
1302 int i;
1303
1304 for (i = 0; i < reg->num_repeat; i++) {
1305 RepeatRange* p = reg->repeat_range + i;
1306 int offset = p->u.offset;
1307 p->u.pcode = reg->ops + offset;
1308 }
1309 }
1310
1311 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper,int ops_index)1312 entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index)
1313 {
1314 #define REPEAT_RANGE_ALLOC 4
1315
1316 RepeatRange* p;
1317
1318 if (reg->repeat_range_alloc == 0) {
1319 p = (RepeatRange* )xmalloc(sizeof(RepeatRange) * REPEAT_RANGE_ALLOC);
1320 CHECK_NULL_RETURN_MEMERR(p);
1321 reg->repeat_range = p;
1322 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
1323 }
1324 else if (reg->repeat_range_alloc <= id) {
1325 int n;
1326 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
1327 p = (RepeatRange* )xrealloc(reg->repeat_range, sizeof(RepeatRange) * n);
1328 CHECK_NULL_RETURN_MEMERR(p);
1329 reg->repeat_range = p;
1330 reg->repeat_range_alloc = n;
1331 }
1332 else {
1333 p = reg->repeat_range;
1334 }
1335
1336 p[id].lower = lower;
1337 p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper);
1338 p[id].u.offset = ops_index;
1339 return 0;
1340 }
1341
1342 static int
compile_range_repeat_node(QuantNode * qn,int target_len,int emptiness,regex_t * reg,ScanEnv * env)1343 compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness,
1344 regex_t* reg, ScanEnv* env)
1345 {
1346 int r;
1347 int num_repeat = reg->num_repeat++;
1348
1349 r = add_op(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
1350 if (r != 0) return r;
1351
1352 COP(reg)->repeat.id = num_repeat;
1353 COP(reg)->repeat.addr = SIZE_INC + target_len + OPSIZE_REPEAT_INC;
1354
1355 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper,
1356 COP_CURR_OFFSET(reg) + OPSIZE_REPEAT);
1357 if (r != 0) return r;
1358
1359 r = compile_quant_body_with_empty_check(qn, reg, env);
1360 if (r != 0) return r;
1361
1362 r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
1363 if (r != 0) return r;
1364
1365 COP(reg)->repeat_inc.id = num_repeat;
1366 return r;
1367 }
1368
1369 static int
is_anychar_infinite_greedy(QuantNode * qn)1370 is_anychar_infinite_greedy(QuantNode* qn)
1371 {
1372 if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) &&
1373 NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn)))
1374 return 1;
1375 else
1376 return 0;
1377 }
1378
1379 #define QUANTIFIER_EXPAND_LIMIT_SIZE 10
1380 #define CKN_ON (ckn > 0)
1381
1382 static int
compile_length_quantifier_node(QuantNode * qn,regex_t * reg)1383 compile_length_quantifier_node(QuantNode* qn, regex_t* reg)
1384 {
1385 int len, mod_tlen;
1386 int infinite = IS_INFINITE_REPEAT(qn->upper);
1387 enum BodyEmptyType emptiness = qn->emptiness;
1388 int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1389
1390 if (tlen < 0) return tlen;
1391 if (tlen == 0) return 0;
1392
1393 /* anychar repeat */
1394 if (is_anychar_infinite_greedy(qn)) {
1395 if (qn->lower <= 1 ||
1396 len_multiply_cmp((OnigLen )tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) {
1397 if (IS_NOT_NULL(qn->next_head_exact))
1398 return OPSIZE_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
1399 else
1400 return OPSIZE_ANYCHAR_STAR + tlen * qn->lower;
1401 }
1402 }
1403
1404 mod_tlen = tlen;
1405 if (emptiness != BODY_IS_NOT_EMPTY)
1406 mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END;
1407
1408 if (infinite &&
1409 (qn->lower <= 1 ||
1410 len_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1411 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1412 len = OPSIZE_JUMP;
1413 }
1414 else {
1415 len = tlen * qn->lower;
1416 }
1417
1418 if (qn->greedy) {
1419 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1420 if (IS_NOT_NULL(qn->head_exact))
1421 len += OPSIZE_PUSH_OR_JUMP_EXACT1 + mod_tlen + OPSIZE_JUMP;
1422 else
1423 #endif
1424 if (IS_NOT_NULL(qn->next_head_exact))
1425 len += OPSIZE_PUSH_IF_PEEK_NEXT + mod_tlen + OPSIZE_JUMP;
1426 else
1427 len += OPSIZE_PUSH + mod_tlen + OPSIZE_JUMP;
1428 }
1429 else
1430 len += OPSIZE_JUMP + mod_tlen + OPSIZE_PUSH;
1431 }
1432 else if (qn->upper == 0) {
1433 if (qn->include_referred != 0) { /* /(?<n>..){0}/ */
1434 len = OPSIZE_JUMP + tlen;
1435 }
1436 else
1437 len = 0;
1438 }
1439 else if (!infinite && qn->greedy &&
1440 (qn->upper == 1 ||
1441 len_multiply_cmp((OnigLen )tlen + OPSIZE_PUSH, qn->upper,
1442 QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1443 len = tlen * qn->lower;
1444 len += (OPSIZE_PUSH + tlen) * (qn->upper - qn->lower);
1445 }
1446 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1447 len = OPSIZE_PUSH + OPSIZE_JUMP + tlen;
1448 }
1449 else {
1450 len = OPSIZE_REPEAT_INC + mod_tlen + OPSIZE_REPEAT;
1451 }
1452
1453 return len;
1454 }
1455
1456 static int
compile_quantifier_node(QuantNode * qn,regex_t * reg,ScanEnv * env)1457 compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)
1458 {
1459 int i, r, mod_tlen;
1460 int infinite = IS_INFINITE_REPEAT(qn->upper);
1461 enum BodyEmptyType emptiness = qn->emptiness;
1462 int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1463
1464 if (tlen < 0) return tlen;
1465 if (tlen == 0) return 0;
1466
1467 if (is_anychar_infinite_greedy(qn) &&
1468 (qn->lower <= 1 ||
1469 len_multiply_cmp((OnigLen )tlen, qn->lower,
1470 QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1471 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1472 if (r != 0) return r;
1473 if (IS_NOT_NULL(qn->next_head_exact)) {
1474 r = add_op(reg, NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)) ?
1475 OP_ANYCHAR_ML_STAR_PEEK_NEXT : OP_ANYCHAR_STAR_PEEK_NEXT);
1476 if (r != 0) return r;
1477
1478 COP(reg)->anychar_star_peek_next.c = STR_(qn->next_head_exact)->s[0];
1479 return 0;
1480 }
1481 else {
1482 r = add_op(reg, NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)) ?
1483 OP_ANYCHAR_ML_STAR : OP_ANYCHAR_STAR);
1484 return r;
1485 }
1486 }
1487
1488 mod_tlen = tlen;
1489 if (emptiness != BODY_IS_NOT_EMPTY)
1490 mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END;
1491
1492 if (infinite &&
1493 (qn->lower <= 1 ||
1494 len_multiply_cmp((OnigLen )tlen, qn->lower,
1495 QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1496 int addr;
1497
1498 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1499 r = add_op(reg, OP_JUMP);
1500 if (r != 0) return r;
1501 if (qn->greedy) {
1502 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1503 if (IS_NOT_NULL(qn->head_exact))
1504 COP(reg)->jump.addr = OPSIZE_PUSH_OR_JUMP_EXACT1 + SIZE_INC;
1505 else
1506 #endif
1507 if (IS_NOT_NULL(qn->next_head_exact))
1508 COP(reg)->jump.addr = OPSIZE_PUSH_IF_PEEK_NEXT + SIZE_INC;
1509 else
1510 COP(reg)->jump.addr = OPSIZE_PUSH + SIZE_INC;
1511 }
1512 else {
1513 COP(reg)->jump.addr = OPSIZE_JUMP + SIZE_INC;
1514 }
1515 }
1516 else {
1517 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1518 if (r != 0) return r;
1519 }
1520
1521 if (qn->greedy) {
1522 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1523 if (IS_NOT_NULL(qn->head_exact)) {
1524 r = add_op(reg, OP_PUSH_OR_JUMP_EXACT1);
1525 if (r != 0) return r;
1526 COP(reg)->push_or_jump_exact1.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1527 COP(reg)->push_or_jump_exact1.c = STR_(qn->head_exact)->s[0];
1528
1529 r = compile_quant_body_with_empty_check(qn, reg, env);
1530 if (r != 0) return r;
1531
1532 addr = -(mod_tlen + (int )OPSIZE_PUSH_OR_JUMP_EXACT1);
1533 }
1534 else
1535 #endif
1536 if (IS_NOT_NULL(qn->next_head_exact)) {
1537 r = add_op(reg, OP_PUSH_IF_PEEK_NEXT);
1538 if (r != 0) return r;
1539 COP(reg)->push_if_peek_next.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1540 COP(reg)->push_if_peek_next.c = STR_(qn->next_head_exact)->s[0];
1541
1542 r = compile_quant_body_with_empty_check(qn, reg, env);
1543 if (r != 0) return r;
1544
1545 addr = -(mod_tlen + (int )OPSIZE_PUSH_IF_PEEK_NEXT);
1546 }
1547 else {
1548 r = add_op(reg, OP_PUSH);
1549 if (r != 0) return r;
1550 COP(reg)->push.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP;
1551
1552 r = compile_quant_body_with_empty_check(qn, reg, env);
1553 if (r != 0) return r;
1554
1555 addr = -(mod_tlen + (int )OPSIZE_PUSH);
1556 }
1557
1558 r = add_op(reg, OP_JUMP);
1559 if (r != 0) return r;
1560 COP(reg)->jump.addr = addr;
1561 }
1562 else {
1563 r = add_op(reg, OP_JUMP);
1564 if (r != 0) return r;
1565 COP(reg)->jump.addr = mod_tlen + SIZE_INC;
1566
1567 r = compile_quant_body_with_empty_check(qn, reg, env);
1568 if (r != 0) return r;
1569
1570 r = add_op(reg, OP_PUSH);
1571 if (r != 0) return r;
1572 COP(reg)->push.addr = -mod_tlen;
1573 }
1574 }
1575 else if (qn->upper == 0) {
1576 if (qn->include_referred != 0) { /* /(?<n>..){0}/ */
1577 r = add_op(reg, OP_JUMP);
1578 if (r != 0) return r;
1579 COP(reg)->jump.addr = tlen + SIZE_INC;
1580
1581 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1582 }
1583 else {
1584 /* Nothing output */
1585 r = 0;
1586 }
1587 }
1588 else if (! infinite && qn->greedy &&
1589 (qn->upper == 1 ||
1590 len_multiply_cmp((OnigLen )tlen + OPSIZE_PUSH, qn->upper,
1591 QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1592 int n = qn->upper - qn->lower;
1593
1594 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1595 if (r != 0) return r;
1596
1597 for (i = 0; i < n; i++) {
1598 int v = onig_positive_int_multiply(n - i, tlen + OPSIZE_PUSH);
1599 if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1600
1601 r = add_op(reg, OP_PUSH);
1602 if (r != 0) return r;
1603 COP(reg)->push.addr = v;
1604
1605 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1606 if (r != 0) return r;
1607 }
1608 }
1609 else if (! qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1610 r = add_op(reg, OP_PUSH);
1611 if (r != 0) return r;
1612 COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP;
1613
1614 r = add_op(reg, OP_JUMP);
1615 if (r != 0) return r;
1616 COP(reg)->jump.addr = tlen + SIZE_INC;
1617
1618 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1619 }
1620 else {
1621 r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env);
1622 }
1623 return r;
1624 }
1625
1626 static int
compile_length_option_node(BagNode * node,regex_t * reg)1627 compile_length_option_node(BagNode* node, regex_t* reg)
1628 {
1629 int tlen;
1630
1631 tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1632
1633 return tlen;
1634 }
1635
1636 static int
compile_option_node(BagNode * node,regex_t * reg,ScanEnv * env)1637 compile_option_node(BagNode* node, regex_t* reg, ScanEnv* env)
1638 {
1639 int r;
1640
1641 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1642
1643 return r;
1644 }
1645
1646 static int
compile_length_bag_node(BagNode * node,regex_t * reg)1647 compile_length_bag_node(BagNode* node, regex_t* reg)
1648 {
1649 int len;
1650 int tlen;
1651
1652 if (node->type == BAG_OPTION)
1653 return compile_length_option_node(node, reg);
1654
1655 if (NODE_BAG_BODY(node)) {
1656 tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1657 if (tlen < 0) return tlen;
1658 }
1659 else
1660 tlen = 0;
1661
1662 switch (node->type) {
1663 case BAG_MEMORY:
1664 #ifdef USE_CALL
1665
1666 if (node->m.regnum == 0 && NODE_IS_CALLED(node)) {
1667 len = tlen + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN;
1668 return len;
1669 }
1670
1671 if (NODE_IS_CALLED(node)) {
1672 len = OPSIZE_MEM_START_PUSH + tlen
1673 + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN;
1674 if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1675 len += (NODE_IS_RECURSION(node)
1676 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH);
1677 else
1678 len += (NODE_IS_RECURSION(node)
1679 ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END);
1680 }
1681 else if (NODE_IS_RECURSION(node)) {
1682 len = OPSIZE_MEM_START_PUSH;
1683 len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)
1684 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_REC);
1685 }
1686 else
1687 #endif
1688 {
1689 if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum))
1690 len = OPSIZE_MEM_START_PUSH;
1691 else
1692 len = OPSIZE_MEM_START;
1693
1694 len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)
1695 ? OPSIZE_MEM_END_PUSH : OPSIZE_MEM_END);
1696 }
1697 break;
1698
1699 case BAG_STOP_BACKTRACK:
1700 if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1701 int v;
1702 QuantNode* qn;
1703
1704 qn = QUANT_(NODE_BAG_BODY(node));
1705 tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1706 if (tlen < 0) return tlen;
1707
1708 v = onig_positive_int_multiply(qn->lower, tlen);
1709 if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1710 len = v + OPSIZE_PUSH + tlen + OPSIZE_POP + OPSIZE_JUMP;
1711 }
1712 else {
1713 len = OPSIZE_MARK + tlen + OPSIZE_CUT_TO_MARK;
1714 }
1715 break;
1716
1717 case BAG_IF_ELSE:
1718 {
1719 Node* cond = NODE_BAG_BODY(node);
1720 Node* Then = node->te.Then;
1721 Node* Else = node->te.Else;
1722
1723 len = compile_length_tree(cond, reg);
1724 if (len < 0) return len;
1725 len += OPSIZE_PUSH + OPSIZE_MARK + OPSIZE_CUT_TO_MARK;
1726
1727 if (IS_NOT_NULL(Then)) {
1728 tlen = compile_length_tree(Then, reg);
1729 if (tlen < 0) return tlen;
1730 len += tlen;
1731 }
1732
1733 len += OPSIZE_JUMP + OPSIZE_CUT_TO_MARK;
1734
1735 if (IS_NOT_NULL(Else)) {
1736 tlen = compile_length_tree(Else, reg);
1737 if (tlen < 0) return tlen;
1738 len += tlen;
1739 }
1740 }
1741 break;
1742
1743 case BAG_OPTION:
1744 /* never come here, but set for escape warning */
1745 len = 0;
1746 break;
1747 }
1748
1749 return len;
1750 }
1751
1752 static int
compile_bag_memory_node(BagNode * node,regex_t * reg,ScanEnv * env)1753 compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env)
1754 {
1755 int r;
1756
1757 #ifdef USE_CALL
1758 if (NODE_IS_CALLED(node)) {
1759 int len;
1760
1761 r = add_op(reg, OP_CALL);
1762 if (r != 0) return r;
1763
1764 node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + OPSIZE_JUMP;
1765 NODE_STATUS_ADD(node, FIXED_ADDR);
1766 COP(reg)->call.addr = (int )node->m.called_addr;
1767
1768 if (node->m.regnum == 0) {
1769 len = compile_length_tree(NODE_BAG_BODY(node), reg);
1770 len += OPSIZE_RETURN;
1771
1772 r = add_op(reg, OP_JUMP);
1773 if (r != 0) return r;
1774 COP(reg)->jump.addr = len + SIZE_INC;
1775
1776 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1777 if (r != 0) return r;
1778
1779 r = add_op(reg, OP_RETURN);
1780 return r;
1781 }
1782 else {
1783 len = compile_length_tree(NODE_BAG_BODY(node), reg);
1784 len += (OPSIZE_MEM_START_PUSH + OPSIZE_RETURN);
1785 if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1786 len += (NODE_IS_RECURSION(node)
1787 ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH);
1788 else
1789 len += (NODE_IS_RECURSION(node) ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END);
1790
1791 r = add_op(reg, OP_JUMP);
1792 if (r != 0) return r;
1793 COP(reg)->jump.addr = len + SIZE_INC;
1794 }
1795 }
1796 #endif
1797
1798 if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum))
1799 r = add_op(reg, OP_MEM_START_PUSH);
1800 else
1801 r = add_op(reg, OP_MEM_START);
1802 if (r != 0) return r;
1803 COP(reg)->memory_start.num = node->m.regnum;
1804
1805 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1806 if (r != 0) return r;
1807
1808 #ifdef USE_CALL
1809 if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1810 r = add_op(reg, (NODE_IS_RECURSION(node)
1811 ? OP_MEM_END_PUSH_REC : OP_MEM_END_PUSH));
1812 else
1813 r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEM_END_REC : OP_MEM_END));
1814 if (r != 0) return r;
1815 COP(reg)->memory_end.num = node->m.regnum;
1816
1817 if (NODE_IS_CALLED(node)) {
1818 if (r != 0) return r;
1819 r = add_op(reg, OP_RETURN);
1820 }
1821 #else
1822 if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
1823 r = add_op(reg, OP_MEM_END_PUSH);
1824 else
1825 r = add_op(reg, OP_MEM_END);
1826 if (r != 0) return r;
1827 COP(reg)->memory_end.num = node->m.regnum;
1828 #endif
1829
1830 return r;
1831 }
1832
1833 static int
compile_bag_node(BagNode * node,regex_t * reg,ScanEnv * env)1834 compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)
1835 {
1836 int r, len;
1837
1838 switch (node->type) {
1839 case BAG_MEMORY:
1840 r = compile_bag_memory_node(node, reg, env);
1841 break;
1842
1843 case BAG_OPTION:
1844 r = compile_option_node(node, reg, env);
1845 break;
1846
1847 case BAG_STOP_BACKTRACK:
1848 if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1849 QuantNode* qn = QUANT_(NODE_BAG_BODY(node));
1850 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1851 if (r != 0) return r;
1852
1853 len = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1854 if (len < 0) return len;
1855
1856 r = add_op(reg, OP_PUSH);
1857 if (r != 0) return r;
1858 COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP + OPSIZE_JUMP;
1859
1860 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1861 if (r != 0) return r;
1862 r = add_op(reg, OP_POP);
1863 if (r != 0) return r;
1864
1865 r = add_op(reg, OP_JUMP);
1866 if (r != 0) return r;
1867 COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP);
1868 }
1869 else {
1870 MemNumType mid;
1871
1872 ID_ENTRY(env, mid);
1873 r = add_op(reg, OP_MARK);
1874 if (r != 0) return r;
1875 COP(reg)->mark.id = mid;
1876 COP(reg)->mark.save_pos = 0;
1877
1878 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1879 if (r != 0) return r;
1880 r = add_op(reg, OP_CUT_TO_MARK);
1881 if (r != 0) return r;
1882 COP(reg)->cut_to_mark.id = mid;
1883 COP(reg)->cut_to_mark.restore_pos = 0;
1884 }
1885 break;
1886
1887 case BAG_IF_ELSE:
1888 {
1889 int cond_len, then_len, else_len, jump_len;
1890 MemNumType mid;
1891 Node* cond = NODE_BAG_BODY(node);
1892 Node* Then = node->te.Then;
1893 Node* Else = node->te.Else;
1894
1895 ID_ENTRY(env, mid);
1896
1897 r = add_op(reg, OP_MARK);
1898 if (r != 0) return r;
1899 COP(reg)->mark.id = mid;
1900 COP(reg)->mark.save_pos = 0;
1901
1902 cond_len = compile_length_tree(cond, reg);
1903 if (cond_len < 0) return cond_len;
1904 if (IS_NOT_NULL(Then)) {
1905 then_len = compile_length_tree(Then, reg);
1906 if (then_len < 0) return then_len;
1907 }
1908 else
1909 then_len = 0;
1910
1911 jump_len = cond_len + then_len + OPSIZE_CUT_TO_MARK + OPSIZE_JUMP;
1912
1913 r = add_op(reg, OP_PUSH);
1914 if (r != 0) return r;
1915 COP(reg)->push.addr = SIZE_INC + jump_len;
1916
1917 r = compile_tree(cond, reg, env);
1918 if (r != 0) return r;
1919 r = add_op(reg, OP_CUT_TO_MARK);
1920 if (r != 0) return r;
1921 COP(reg)->cut_to_mark.id = mid;
1922 COP(reg)->cut_to_mark.restore_pos = 0;
1923
1924 if (IS_NOT_NULL(Then)) {
1925 r = compile_tree(Then, reg, env);
1926 if (r != 0) return r;
1927 }
1928
1929 if (IS_NOT_NULL(Else)) {
1930 else_len = compile_length_tree(Else, reg);
1931 if (else_len < 0) return else_len;
1932 }
1933 else
1934 else_len = 0;
1935
1936 r = add_op(reg, OP_JUMP);
1937 if (r != 0) return r;
1938 COP(reg)->jump.addr = OPSIZE_CUT_TO_MARK + else_len + SIZE_INC;
1939
1940 r = add_op(reg, OP_CUT_TO_MARK);
1941 if (r != 0) return r;
1942 COP(reg)->cut_to_mark.id = mid;
1943 COP(reg)->cut_to_mark.restore_pos = 0;
1944
1945 if (IS_NOT_NULL(Else)) {
1946 r = compile_tree(Else, reg, env);
1947 }
1948 }
1949 break;
1950 }
1951
1952 return r;
1953 }
1954
1955 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1956 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1957 {
1958 int len;
1959 int tlen = 0;
1960
1961 if (IS_NOT_NULL(NODE_ANCHOR_BODY(node))) {
1962 tlen = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1963 if (tlen < 0) return tlen;
1964 }
1965
1966 switch (node->type) {
1967 case ANCR_PREC_READ:
1968 len = OPSIZE_MARK + tlen + OPSIZE_CUT_TO_MARK;
1969 break;
1970 case ANCR_PREC_READ_NOT:
1971 len = OPSIZE_PUSH + OPSIZE_MARK + tlen + OPSIZE_POP_TO_MARK + OPSIZE_POP + OPSIZE_FAIL;
1972 break;
1973 case ANCR_LOOK_BEHIND:
1974 if (node->char_min_len == node->char_max_len)
1975 len = OPSIZE_MARK + OPSIZE_STEP_BACK_START + tlen + OPSIZE_CUT_TO_MARK;
1976 else {
1977 len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_UPDATE_VAR + OPSIZE_FAIL + OPSIZE_JUMP + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_CUT_TO_MARK + OPSIZE_UPDATE_VAR;
1978
1979 if (IS_NOT_NULL(node->lead_node)) {
1980 int llen = compile_length_tree(node->lead_node, reg);
1981 if (llen < 0) return llen;
1982
1983 len += OPSIZE_MOVE + llen;
1984 }
1985 }
1986 break;
1987 case ANCR_LOOK_BEHIND_NOT:
1988 if (node->char_min_len == node->char_max_len)
1989 len = OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + tlen + OPSIZE_POP_TO_MARK + OPSIZE_FAIL + OPSIZE_POP;
1990 else {
1991 len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_POP;
1992
1993 if (IS_NOT_NULL(node->lead_node)) {
1994 int llen = compile_length_tree(node->lead_node, reg);
1995 if (llen < 0) return llen;
1996
1997 len += OPSIZE_MOVE + llen;
1998 }
1999 }
2000 break;
2001
2002 case ANCR_WORD_BOUNDARY:
2003 case ANCR_NO_WORD_BOUNDARY:
2004 #ifdef USE_WORD_BEGIN_END
2005 case ANCR_WORD_BEGIN:
2006 case ANCR_WORD_END:
2007 #endif
2008 len = OPSIZE_WORD_BOUNDARY;
2009 break;
2010
2011 case ANCR_TEXT_SEGMENT_BOUNDARY:
2012 case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
2013 len = SIZE_OPCODE;
2014 break;
2015
2016 default:
2017 len = SIZE_OPCODE;
2018 break;
2019 }
2020
2021 return len;
2022 }
2023
2024 static int
compile_anchor_look_behind_node(AnchorNode * node,regex_t * reg,ScanEnv * env)2025 compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ScanEnv* env)
2026 {
2027 int r;
2028
2029 if (node->char_min_len == node->char_max_len) {
2030 MemNumType mid;
2031
2032 ID_ENTRY(env, mid);
2033 r = add_op(reg, OP_MARK);
2034 if (r != 0) return r;
2035 COP(reg)->mark.id = mid;
2036 COP(reg)->mark.save_pos = FALSE;
2037
2038 r = add_op(reg, OP_STEP_BACK_START);
2039 if (r != 0) return r;
2040 COP(reg)->step_back_start.initial = node->char_min_len;
2041 COP(reg)->step_back_start.remaining = 0;
2042 COP(reg)->step_back_start.addr = 1;
2043
2044 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2045 if (r != 0) return r;
2046
2047 r = add_op(reg, OP_CUT_TO_MARK);
2048 if (r != 0) return r;
2049 COP(reg)->cut_to_mark.id = mid;
2050 COP(reg)->cut_to_mark.restore_pos = FALSE;
2051 }
2052 else {
2053 MemNumType mid1, mid2;
2054 OnigLen diff;
2055
2056 if (IS_NOT_NULL(node->lead_node)) {
2057 MinMaxCharLen ci;
2058
2059 r = node_char_len(node->lead_node, reg, &ci, env);
2060 if (r < 0) return r;
2061 r = add_op(reg, OP_MOVE);
2062 if (r != 0) return r;
2063 COP(reg)->move.n = -((RelPositionType )ci.min);
2064 r = compile_tree(node->lead_node, reg, env);
2065 if (r != 0) return r;
2066 }
2067
2068 ID_ENTRY(env, mid1);
2069 r = add_op(reg, OP_SAVE_VAL);
2070 if (r != 0) return r;
2071 COP(reg)->save_val.type = SAVE_RIGHT_RANGE;
2072 COP(reg)->save_val.id = mid1;
2073
2074 r = add_op(reg, OP_UPDATE_VAR);
2075 if (r != 0) return r;
2076 COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_TO_S;
2077
2078 ID_ENTRY(env, mid2);
2079 r = add_op(reg, OP_MARK);
2080 if (r != 0) return r;
2081 COP(reg)->mark.id = mid2;
2082 COP(reg)->mark.save_pos = FALSE;
2083
2084 r = add_op(reg, OP_PUSH);
2085 if (r != 0) return r;
2086 COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP;
2087
2088 r = add_op(reg, OP_JUMP);
2089 if (r != 0) return r;
2090 COP(reg)->jump.addr = SIZE_INC + OPSIZE_UPDATE_VAR + OPSIZE_FAIL;
2091
2092 r = add_op(reg, OP_UPDATE_VAR);
2093 if (r != 0) return r;
2094 COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2095 COP(reg)->update_var.id = mid1;
2096 COP(reg)->update_var.clear = FALSE;
2097 r = add_op(reg, OP_FAIL);
2098 if (r != 0) return r;
2099
2100 r = add_op(reg, OP_STEP_BACK_START);
2101 if (r != 0) return r;
2102
2103 if (node->char_max_len != INFINITE_LEN)
2104 diff = node->char_max_len - node->char_min_len;
2105 else
2106 diff = INFINITE_LEN;
2107
2108 COP(reg)->step_back_start.initial = node->char_min_len;
2109 COP(reg)->step_back_start.remaining = diff;
2110 COP(reg)->step_back_start.addr = 2;
2111
2112 r = add_op(reg, OP_STEP_BACK_NEXT);
2113 if (r != 0) return r;
2114
2115 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2116 if (r != 0) return r;
2117
2118 r = add_op(reg, OP_CHECK_POSITION);
2119 if (r != 0) return r;
2120 COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE;
2121
2122 r = add_op(reg, OP_CUT_TO_MARK);
2123 if (r != 0) return r;
2124 COP(reg)->cut_to_mark.id = mid2;
2125 COP(reg)->cut_to_mark.restore_pos = FALSE;
2126
2127 r = add_op(reg, OP_UPDATE_VAR);
2128 if (r != 0) return r;
2129 COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2130 COP(reg)->update_var.id = mid1;
2131 COP(reg)->update_var.clear = TRUE;
2132 }
2133
2134 return r;
2135 }
2136
2137 static int
compile_anchor_look_behind_not_node(AnchorNode * node,regex_t * reg,ScanEnv * env)2138 compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg,
2139 ScanEnv* env)
2140 {
2141 int r;
2142 int len;
2143
2144 len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
2145
2146 if (node->char_min_len == node->char_max_len) {
2147 MemNumType mid;
2148
2149 ID_ENTRY(env, mid);
2150 r = add_op(reg, OP_MARK);
2151 if (r != 0) return r;
2152 COP(reg)->mark.id = mid;
2153 COP(reg)->mark.save_pos = FALSE;
2154
2155 r = add_op(reg, OP_PUSH);
2156 if (r != 0) return r;
2157 COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + len + OPSIZE_POP_TO_MARK + OPSIZE_FAIL;
2158
2159 r = add_op(reg, OP_STEP_BACK_START);
2160 if (r != 0) return r;
2161 COP(reg)->step_back_start.initial = node->char_min_len;
2162 COP(reg)->step_back_start.remaining = 0;
2163 COP(reg)->step_back_start.addr = 1;
2164
2165 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2166 if (r != 0) return r;
2167
2168 r = add_op(reg, OP_POP_TO_MARK);
2169 if (r != 0) return r;
2170 COP(reg)->pop_to_mark.id = mid;
2171 r = add_op(reg, OP_FAIL);
2172 if (r != 0) return r;
2173 r = add_op(reg, OP_POP);
2174 }
2175 else {
2176 MemNumType mid1, mid2;
2177 OnigLen diff;
2178
2179 ID_ENTRY(env, mid1);
2180 r = add_op(reg, OP_SAVE_VAL);
2181 if (r != 0) return r;
2182 COP(reg)->save_val.type = SAVE_RIGHT_RANGE;
2183 COP(reg)->save_val.id = mid1;
2184
2185 r = add_op(reg, OP_UPDATE_VAR);
2186 if (r != 0) return r;
2187 COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_TO_S;
2188
2189 ID_ENTRY(env, mid2);
2190 r = add_op(reg, OP_MARK);
2191 if (r != 0) return r;
2192 COP(reg)->mark.id = mid2;
2193 COP(reg)->mark.save_pos = FALSE;
2194
2195 r = add_op(reg, OP_PUSH);
2196 if (r != 0) return r;
2197 COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + len + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL;
2198
2199 if (IS_NOT_NULL(node->lead_node)) {
2200 int clen;
2201 MinMaxCharLen ci;
2202
2203 clen = compile_length_tree(node->lead_node, reg);
2204 COP(reg)->push.addr += OPSIZE_MOVE + clen;
2205
2206 r = node_char_len(node->lead_node, reg, &ci, env);
2207 if (r < 0) return r;
2208 r = add_op(reg, OP_MOVE);
2209 if (r != 0) return r;
2210 COP(reg)->move.n = -((RelPositionType )ci.min);
2211
2212 r = compile_tree(node->lead_node, reg, env);
2213 if (r != 0) return r;
2214 }
2215
2216 r = add_op(reg, OP_STEP_BACK_START);
2217 if (r != 0) return r;
2218
2219 if (node->char_max_len != INFINITE_LEN)
2220 diff = node->char_max_len - node->char_min_len;
2221 else
2222 diff = INFINITE_LEN;
2223
2224 COP(reg)->step_back_start.initial = node->char_min_len;
2225 COP(reg)->step_back_start.remaining = diff;
2226 COP(reg)->step_back_start.addr = 2;
2227
2228 r = add_op(reg, OP_STEP_BACK_NEXT);
2229 if (r != 0) return r;
2230
2231 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2232 if (r != 0) return r;
2233
2234 r = add_op(reg, OP_CHECK_POSITION);
2235 if (r != 0) return r;
2236 COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE;
2237
2238 r = add_op(reg, OP_POP_TO_MARK);
2239 if (r != 0) return r;
2240 COP(reg)->pop_to_mark.id = mid2;
2241
2242 r = add_op(reg, OP_UPDATE_VAR);
2243 if (r != 0) return r;
2244 COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2245 COP(reg)->update_var.id = mid1;
2246 COP(reg)->update_var.clear = FALSE;
2247
2248 r = add_op(reg, OP_POP); /* pop save val */
2249 if (r != 0) return r;
2250 r = add_op(reg, OP_FAIL);
2251 if (r != 0) return r;
2252
2253 r = add_op(reg, OP_UPDATE_VAR);
2254 if (r != 0) return r;
2255 COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
2256 COP(reg)->update_var.id = mid1;
2257 COP(reg)->update_var.clear = FALSE;
2258
2259 r = add_op(reg, OP_POP); /* pop mark */
2260 if (r != 0) return r;
2261 r = add_op(reg, OP_POP); /* pop save val */
2262 }
2263
2264 return r;
2265 }
2266
2267 static int
compile_anchor_node(AnchorNode * node,regex_t * reg,ScanEnv * env)2268 compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env)
2269 {
2270 int r, len;
2271 enum OpCode op;
2272 MemNumType mid;
2273
2274 switch (node->type) {
2275 case ANCR_BEGIN_BUF: r = add_op(reg, OP_BEGIN_BUF); break;
2276 case ANCR_END_BUF: r = add_op(reg, OP_END_BUF); break;
2277 case ANCR_BEGIN_LINE: r = add_op(reg, OP_BEGIN_LINE); break;
2278 case ANCR_END_LINE: r = add_op(reg, OP_END_LINE); break;
2279 case ANCR_SEMI_END_BUF: r = add_op(reg, OP_SEMI_END_BUF); break;
2280 case ANCR_BEGIN_POSITION:
2281 r = add_op(reg, OP_CHECK_POSITION);
2282 if (r != 0) return r;
2283 COP(reg)->check_position.type = CHECK_POSITION_SEARCH_START;
2284 break;
2285
2286 case ANCR_WORD_BOUNDARY:
2287 op = OP_WORD_BOUNDARY;
2288 word:
2289 r = add_op(reg, op);
2290 if (r != 0) return r;
2291 COP(reg)->word_boundary.mode = (ModeType )node->ascii_mode;
2292 break;
2293
2294 case ANCR_NO_WORD_BOUNDARY:
2295 op = OP_NO_WORD_BOUNDARY; goto word;
2296 break;
2297 #ifdef USE_WORD_BEGIN_END
2298 case ANCR_WORD_BEGIN:
2299 op = OP_WORD_BEGIN; goto word;
2300 break;
2301 case ANCR_WORD_END:
2302 op = OP_WORD_END; goto word;
2303 break;
2304 #endif
2305
2306 case ANCR_TEXT_SEGMENT_BOUNDARY:
2307 case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
2308 {
2309 enum TextSegmentBoundaryType type;
2310
2311 r = add_op(reg, OP_TEXT_SEGMENT_BOUNDARY);
2312 if (r != 0) return r;
2313
2314 type = EXTENDED_GRAPHEME_CLUSTER_BOUNDARY;
2315 #ifdef USE_UNICODE_WORD_BREAK
2316 if (NODE_IS_TEXT_SEGMENT_WORD(node))
2317 type = WORD_BOUNDARY;
2318 #endif
2319
2320 COP(reg)->text_segment_boundary.type = type;
2321 COP(reg)->text_segment_boundary.not =
2322 (node->type == ANCR_NO_TEXT_SEGMENT_BOUNDARY ? 1 : 0);
2323 }
2324 break;
2325
2326 case ANCR_PREC_READ:
2327 {
2328 ID_ENTRY(env, mid);
2329 r = add_op(reg, OP_MARK);
2330 if (r != 0) return r;
2331 COP(reg)->mark.id = mid;
2332 COP(reg)->mark.save_pos = TRUE;
2333
2334 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2335 if (r != 0) return r;
2336
2337 r = add_op(reg, OP_CUT_TO_MARK);
2338 if (r != 0) return r;
2339 COP(reg)->cut_to_mark.id = mid;
2340 COP(reg)->cut_to_mark.restore_pos = TRUE;
2341 }
2342 break;
2343
2344 case ANCR_PREC_READ_NOT:
2345 {
2346 len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
2347 if (len < 0) return len;
2348
2349 ID_ENTRY(env, mid);
2350 r = add_op(reg, OP_PUSH);
2351 if (r != 0) return r;
2352 COP(reg)->push.addr = SIZE_INC + OPSIZE_MARK + len +
2353 OPSIZE_POP_TO_MARK + OPSIZE_POP + OPSIZE_FAIL;
2354
2355 r = add_op(reg, OP_MARK);
2356 if (r != 0) return r;
2357 COP(reg)->mark.id = mid;
2358 COP(reg)->mark.save_pos = FALSE;
2359
2360 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
2361 if (r != 0) return r;
2362
2363 r = add_op(reg, OP_POP_TO_MARK);
2364 if (r != 0) return r;
2365 COP(reg)->pop_to_mark.id = mid;
2366
2367 r = add_op(reg, OP_POP);
2368 if (r != 0) return r;
2369 r = add_op(reg, OP_FAIL);
2370 }
2371 break;
2372
2373 case ANCR_LOOK_BEHIND:
2374 r = compile_anchor_look_behind_node(node, reg, env);
2375 break;
2376
2377 case ANCR_LOOK_BEHIND_NOT:
2378 r = compile_anchor_look_behind_not_node(node, reg, env);
2379 break;
2380
2381 default:
2382 return ONIGERR_TYPE_BUG;
2383 break;
2384 }
2385
2386 return r;
2387 }
2388
2389 static int
compile_gimmick_node(GimmickNode * node,regex_t * reg)2390 compile_gimmick_node(GimmickNode* node, regex_t* reg)
2391 {
2392 int r = 0;
2393
2394 switch (node->type) {
2395 case GIMMICK_FAIL:
2396 r = add_op(reg, OP_FAIL);
2397 break;
2398
2399 case GIMMICK_SAVE:
2400 r = add_op(reg, OP_SAVE_VAL);
2401 if (r != 0) return r;
2402 COP(reg)->save_val.type = node->detail_type;
2403 COP(reg)->save_val.id = node->id;
2404 break;
2405
2406 case GIMMICK_UPDATE_VAR:
2407 r = add_op(reg, OP_UPDATE_VAR);
2408 if (r != 0) return r;
2409 COP(reg)->update_var.type = node->detail_type;
2410 COP(reg)->update_var.id = node->id;
2411 COP(reg)->update_var.clear = FALSE;
2412 break;
2413
2414 #ifdef USE_CALLOUT
2415 case GIMMICK_CALLOUT:
2416 switch (node->detail_type) {
2417 case ONIG_CALLOUT_OF_CONTENTS:
2418 case ONIG_CALLOUT_OF_NAME:
2419 {
2420 if (node->detail_type == ONIG_CALLOUT_OF_NAME) {
2421 r = add_op(reg, OP_CALLOUT_NAME);
2422 if (r != 0) return r;
2423 COP(reg)->callout_name.id = node->id;
2424 COP(reg)->callout_name.num = node->num;
2425 }
2426 else {
2427 r = add_op(reg, OP_CALLOUT_CONTENTS);
2428 if (r != 0) return r;
2429 COP(reg)->callout_contents.num = node->num;
2430 }
2431 }
2432 break;
2433
2434 default:
2435 r = ONIGERR_TYPE_BUG;
2436 break;
2437 }
2438 #endif
2439 }
2440
2441 return r;
2442 }
2443
2444 static int
compile_length_gimmick_node(GimmickNode * node,regex_t * reg)2445 compile_length_gimmick_node(GimmickNode* node, regex_t* reg)
2446 {
2447 int len;
2448
2449 switch (node->type) {
2450 case GIMMICK_FAIL:
2451 len = OPSIZE_FAIL;
2452 break;
2453
2454 case GIMMICK_SAVE:
2455 len = OPSIZE_SAVE_VAL;
2456 break;
2457
2458 case GIMMICK_UPDATE_VAR:
2459 len = OPSIZE_UPDATE_VAR;
2460 break;
2461
2462 #ifdef USE_CALLOUT
2463 case GIMMICK_CALLOUT:
2464 switch (node->detail_type) {
2465 case ONIG_CALLOUT_OF_CONTENTS:
2466 len = OPSIZE_CALLOUT_CONTENTS;
2467 break;
2468 case ONIG_CALLOUT_OF_NAME:
2469 len = OPSIZE_CALLOUT_NAME;
2470 break;
2471
2472 default:
2473 len = ONIGERR_TYPE_BUG;
2474 break;
2475 }
2476 break;
2477 #endif
2478 }
2479
2480 return len;
2481 }
2482
2483 static int
compile_length_tree(Node * node,regex_t * reg)2484 compile_length_tree(Node* node, regex_t* reg)
2485 {
2486 int len, r;
2487
2488 switch (NODE_TYPE(node)) {
2489 case NODE_LIST:
2490 len = 0;
2491 do {
2492 r = compile_length_tree(NODE_CAR(node), reg);
2493 if (r < 0) return r;
2494 len += r;
2495 } while (IS_NOT_NULL(node = NODE_CDR(node)));
2496 r = len;
2497 break;
2498
2499 case NODE_ALT:
2500 {
2501 int n;
2502
2503 n = r = 0;
2504 do {
2505 r += compile_length_tree(NODE_CAR(node), reg);
2506 n++;
2507 } while (IS_NOT_NULL(node = NODE_CDR(node)));
2508 r += (OPSIZE_PUSH + OPSIZE_JUMP) * (n - 1);
2509 }
2510 break;
2511
2512 case NODE_STRING:
2513 if (NODE_STRING_IS_CRUDE(node))
2514 r = compile_length_string_crude_node(STR_(node), reg);
2515 else
2516 r = compile_length_string_node(node, reg);
2517 break;
2518
2519 case NODE_CCLASS:
2520 r = compile_length_cclass_node(CCLASS_(node), reg);
2521 break;
2522
2523 case NODE_CTYPE:
2524 r = SIZE_OPCODE;
2525 break;
2526
2527 case NODE_BACKREF:
2528 r = OPSIZE_BACKREF;
2529 break;
2530
2531 #ifdef USE_CALL
2532 case NODE_CALL:
2533 r = OPSIZE_CALL;
2534 break;
2535 #endif
2536
2537 case NODE_QUANT:
2538 r = compile_length_quantifier_node(QUANT_(node), reg);
2539 break;
2540
2541 case NODE_BAG:
2542 r = compile_length_bag_node(BAG_(node), reg);
2543 break;
2544
2545 case NODE_ANCHOR:
2546 r = compile_length_anchor_node(ANCHOR_(node), reg);
2547 break;
2548
2549 case NODE_GIMMICK:
2550 r = compile_length_gimmick_node(GIMMICK_(node), reg);
2551 break;
2552
2553 default:
2554 return ONIGERR_TYPE_BUG;
2555 break;
2556 }
2557
2558 return r;
2559 }
2560
2561 static int
compile_tree(Node * node,regex_t * reg,ScanEnv * env)2562 compile_tree(Node* node, regex_t* reg, ScanEnv* env)
2563 {
2564 int n, len, pos, r = 0;
2565
2566 switch (NODE_TYPE(node)) {
2567 case NODE_LIST:
2568 do {
2569 r = compile_tree(NODE_CAR(node), reg, env);
2570 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2571 break;
2572
2573 case NODE_ALT:
2574 {
2575 Node* x = node;
2576 len = 0;
2577 do {
2578 len += compile_length_tree(NODE_CAR(x), reg);
2579 if (IS_NOT_NULL(NODE_CDR(x))) {
2580 len += OPSIZE_PUSH + OPSIZE_JUMP;
2581 }
2582 } while (IS_NOT_NULL(x = NODE_CDR(x)));
2583 pos = COP_CURR_OFFSET(reg) + 1 + len; /* goal position */
2584
2585 do {
2586 len = compile_length_tree(NODE_CAR(node), reg);
2587 if (IS_NOT_NULL(NODE_CDR(node))) {
2588 enum OpCode push = NODE_IS_SUPER(node) ? OP_PUSH_SUPER : OP_PUSH;
2589 r = add_op(reg, push);
2590 if (r != 0) break;
2591 COP(reg)->push.addr = SIZE_INC + len + OPSIZE_JUMP;
2592 }
2593 r = compile_tree(NODE_CAR(node), reg, env);
2594 if (r != 0) break;
2595 if (IS_NOT_NULL(NODE_CDR(node))) {
2596 len = pos - (COP_CURR_OFFSET(reg) + 1);
2597 r = add_op(reg, OP_JUMP);
2598 if (r != 0) break;
2599 COP(reg)->jump.addr = len;
2600 }
2601 } while (IS_NOT_NULL(node = NODE_CDR(node)));
2602 }
2603 break;
2604
2605 case NODE_STRING:
2606 if (NODE_STRING_IS_CRUDE(node))
2607 r = compile_string_crude_node(STR_(node), reg);
2608 else
2609 r = compile_string_node(node, reg);
2610 break;
2611
2612 case NODE_CCLASS:
2613 r = compile_cclass_node(CCLASS_(node), reg);
2614 break;
2615
2616 case NODE_CTYPE:
2617 {
2618 int op;
2619
2620 switch (CTYPE_(node)->ctype) {
2621 case CTYPE_ANYCHAR:
2622 r = add_op(reg, NODE_IS_MULTILINE(node) ? OP_ANYCHAR_ML : OP_ANYCHAR);
2623 break;
2624
2625 case ONIGENC_CTYPE_WORD:
2626 if (CTYPE_(node)->ascii_mode == 0) {
2627 op = CTYPE_(node)->not != 0 ? OP_NO_WORD : OP_WORD;
2628 }
2629 else {
2630 op = CTYPE_(node)->not != 0 ? OP_NO_WORD_ASCII : OP_WORD_ASCII;
2631 }
2632 r = add_op(reg, op);
2633 break;
2634
2635 default:
2636 return ONIGERR_TYPE_BUG;
2637 break;
2638 }
2639 }
2640 break;
2641
2642 case NODE_BACKREF:
2643 {
2644 BackRefNode* br = BACKREF_(node);
2645
2646 if (NODE_IS_CHECKER(node)) {
2647 #ifdef USE_BACKREF_WITH_LEVEL
2648 if (NODE_IS_NEST_LEVEL(node)) {
2649 r = add_op(reg, OP_BACKREF_CHECK_WITH_LEVEL);
2650 if (r != 0) return r;
2651 COP(reg)->backref_general.nest_level = br->nest_level;
2652 }
2653 else
2654 #endif
2655 {
2656 r = add_op(reg, OP_BACKREF_CHECK);
2657 if (r != 0) return r;
2658 }
2659 goto add_bacref_mems;
2660 }
2661 else {
2662 #ifdef USE_BACKREF_WITH_LEVEL
2663 if (NODE_IS_NEST_LEVEL(node)) {
2664 if (NODE_IS_IGNORECASE(node))
2665 r = add_op(reg, OP_BACKREF_WITH_LEVEL_IC);
2666 else
2667 r = add_op(reg, OP_BACKREF_WITH_LEVEL);
2668
2669 if (r != 0) return r;
2670 COP(reg)->backref_general.nest_level = br->nest_level;
2671 goto add_bacref_mems;
2672 }
2673 else
2674 #endif
2675 if (br->back_num == 1) {
2676 n = br->back_static[0];
2677 if (NODE_IS_IGNORECASE(node)) {
2678 r = add_op(reg, OP_BACKREF_N_IC);
2679 if (r != 0) return r;
2680 COP(reg)->backref_n.n1 = n;
2681 }
2682 else {
2683 switch (n) {
2684 case 1: r = add_op(reg, OP_BACKREF1); break;
2685 case 2: r = add_op(reg, OP_BACKREF2); break;
2686 default:
2687 r = add_op(reg, OP_BACKREF_N);
2688 if (r != 0) return r;
2689 COP(reg)->backref_n.n1 = n;
2690 break;
2691 }
2692 }
2693 }
2694 else {
2695 int num;
2696 int* p;
2697
2698 r = add_op(reg, NODE_IS_IGNORECASE(node) ?
2699 OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI);
2700 if (r != 0) return r;
2701
2702 add_bacref_mems:
2703 num = br->back_num;
2704 COP(reg)->backref_general.num = num;
2705 if (num == 1) {
2706 COP(reg)->backref_general.n1 = br->back_static[0];
2707 }
2708 else {
2709 int i, j;
2710 MemNumType* ns;
2711
2712 ns = xmalloc(sizeof(MemNumType) * num);
2713 CHECK_NULL_RETURN_MEMERR(ns);
2714 COP(reg)->backref_general.ns = ns;
2715 p = BACKREFS_P(br);
2716 for (i = num - 1, j = 0; i >= 0; i--, j++) {
2717 ns[j] = p[i];
2718 }
2719 }
2720 }
2721 }
2722 }
2723 break;
2724
2725 #ifdef USE_CALL
2726 case NODE_CALL:
2727 r = compile_call(CALL_(node), reg, env);
2728 break;
2729 #endif
2730
2731 case NODE_QUANT:
2732 r = compile_quantifier_node(QUANT_(node), reg, env);
2733 break;
2734
2735 case NODE_BAG:
2736 r = compile_bag_node(BAG_(node), reg, env);
2737 break;
2738
2739 case NODE_ANCHOR:
2740 r = compile_anchor_node(ANCHOR_(node), reg, env);
2741 break;
2742
2743 case NODE_GIMMICK:
2744 r = compile_gimmick_node(GIMMICK_(node), reg);
2745 break;
2746
2747 default:
2748 #ifdef ONIG_DEBUG
2749 fprintf(DBGFP, "compile_tree: undefined node type %d\n", NODE_TYPE(node));
2750 #endif
2751 break;
2752 }
2753
2754 return r;
2755 }
2756
2757 static int
make_named_capture_number_map(Node ** plink,GroupNumMap * map,int * counter)2758 make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter)
2759 {
2760 int r = 0;
2761 Node* node = *plink;
2762
2763 switch (NODE_TYPE(node)) {
2764 case NODE_LIST:
2765 case NODE_ALT:
2766 do {
2767 r = make_named_capture_number_map(&(NODE_CAR(node)), map, counter);
2768 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2769 break;
2770
2771 case NODE_QUANT:
2772 {
2773 Node** ptarget = &(NODE_BODY(node));
2774 Node* old = *ptarget;
2775 r = make_named_capture_number_map(ptarget, map, counter);
2776 if (r != 0) return r;
2777 if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) {
2778 r = onig_reduce_nested_quantifier(node);
2779 }
2780 }
2781 break;
2782
2783 case NODE_BAG:
2784 {
2785 BagNode* en = BAG_(node);
2786 if (en->type == BAG_MEMORY) {
2787 if (NODE_IS_NAMED_GROUP(node)) {
2788 (*counter)++;
2789 map[en->m.regnum].new_val = *counter;
2790 en->m.regnum = *counter;
2791 r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter);
2792 }
2793 else {
2794 *plink = NODE_BODY(node);
2795 NODE_BODY(node) = NULL_NODE;
2796 onig_node_free(node);
2797 r = make_named_capture_number_map(plink, map, counter);
2798 }
2799 }
2800 else if (en->type == BAG_IF_ELSE) {
2801 r = make_named_capture_number_map(&(NODE_BAG_BODY(en)), map, counter);
2802 if (r != 0) return r;
2803 if (IS_NOT_NULL(en->te.Then)) {
2804 r = make_named_capture_number_map(&(en->te.Then), map, counter);
2805 if (r != 0) return r;
2806 }
2807 if (IS_NOT_NULL(en->te.Else)) {
2808 r = make_named_capture_number_map(&(en->te.Else), map, counter);
2809 if (r != 0) return r;
2810 }
2811 }
2812 else
2813 r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter);
2814 }
2815 break;
2816
2817 case NODE_ANCHOR:
2818 if (IS_NOT_NULL(NODE_BODY(node)))
2819 r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter);
2820 break;
2821
2822 default:
2823 break;
2824 }
2825
2826 return r;
2827 }
2828
2829 static int
renumber_backref_node(Node * node,GroupNumMap * map)2830 renumber_backref_node(Node* node, GroupNumMap* map)
2831 {
2832 int i, pos, n, old_num;
2833 int *backs;
2834 BackRefNode* bn = BACKREF_(node);
2835
2836 if (! NODE_IS_BY_NAME(node))
2837 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2838
2839 old_num = bn->back_num;
2840 if (IS_NULL(bn->back_dynamic))
2841 backs = bn->back_static;
2842 else
2843 backs = bn->back_dynamic;
2844
2845 for (i = 0, pos = 0; i < old_num; i++) {
2846 n = map[backs[i]].new_val;
2847 if (n > 0) {
2848 backs[pos] = n;
2849 pos++;
2850 }
2851 }
2852
2853 bn->back_num = pos;
2854 return 0;
2855 }
2856
2857 static int
renumber_backref_traverse(Node * node,GroupNumMap * map)2858 renumber_backref_traverse(Node* node, GroupNumMap* map)
2859 {
2860 int r = 0;
2861
2862 switch (NODE_TYPE(node)) {
2863 case NODE_LIST:
2864 case NODE_ALT:
2865 do {
2866 r = renumber_backref_traverse(NODE_CAR(node), map);
2867 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2868 break;
2869
2870 case NODE_QUANT:
2871 r = renumber_backref_traverse(NODE_BODY(node), map);
2872 break;
2873
2874 case NODE_BAG:
2875 {
2876 BagNode* en = BAG_(node);
2877
2878 r = renumber_backref_traverse(NODE_BODY(node), map);
2879 if (r != 0) return r;
2880
2881 if (en->type == BAG_IF_ELSE) {
2882 if (IS_NOT_NULL(en->te.Then)) {
2883 r = renumber_backref_traverse(en->te.Then, map);
2884 if (r != 0) return r;
2885 }
2886 if (IS_NOT_NULL(en->te.Else)) {
2887 r = renumber_backref_traverse(en->te.Else, map);
2888 if (r != 0) return r;
2889 }
2890 }
2891 }
2892 break;
2893
2894 case NODE_BACKREF:
2895 r = renumber_backref_node(node, map);
2896 break;
2897
2898 case NODE_ANCHOR:
2899 if (IS_NOT_NULL(NODE_BODY(node)))
2900 r = renumber_backref_traverse(NODE_BODY(node), map);
2901 break;
2902
2903 default:
2904 break;
2905 }
2906
2907 return r;
2908 }
2909
2910 static int
numbered_ref_check(Node * node)2911 numbered_ref_check(Node* node)
2912 {
2913 int r = 0;
2914
2915 switch (NODE_TYPE(node)) {
2916 case NODE_LIST:
2917 case NODE_ALT:
2918 do {
2919 r = numbered_ref_check(NODE_CAR(node));
2920 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2921 break;
2922
2923 case NODE_ANCHOR:
2924 if (IS_NULL(NODE_BODY(node)))
2925 break;
2926 /* fall */
2927 case NODE_QUANT:
2928 r = numbered_ref_check(NODE_BODY(node));
2929 break;
2930
2931 case NODE_BAG:
2932 {
2933 BagNode* en = BAG_(node);
2934
2935 r = numbered_ref_check(NODE_BODY(node));
2936 if (r != 0) return r;
2937
2938 if (en->type == BAG_IF_ELSE) {
2939 if (IS_NOT_NULL(en->te.Then)) {
2940 r = numbered_ref_check(en->te.Then);
2941 if (r != 0) return r;
2942 }
2943 if (IS_NOT_NULL(en->te.Else)) {
2944 r = numbered_ref_check(en->te.Else);
2945 if (r != 0) return r;
2946 }
2947 }
2948 }
2949
2950 break;
2951
2952 case NODE_BACKREF:
2953 if (! NODE_IS_BY_NAME(node))
2954 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2955 break;
2956
2957 default:
2958 break;
2959 }
2960
2961 return r;
2962 }
2963
2964 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)2965 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
2966 {
2967 int r, i, pos, counter;
2968 MemStatusType loc;
2969 GroupNumMap* map;
2970
2971 map = (GroupNumMap* )xalloca(sizeof(GroupNumMap) * (env->num_mem + 1));
2972 CHECK_NULL_RETURN_MEMERR(map);
2973 for (i = 1; i <= env->num_mem; i++) {
2974 map[i].new_val = 0;
2975 }
2976 counter = 0;
2977 r = make_named_capture_number_map(root, map, &counter);
2978 if (r != 0) return r;
2979
2980 r = renumber_backref_traverse(*root, map);
2981 if (r != 0) return r;
2982
2983 for (i = 1, pos = 1; i <= env->num_mem; i++) {
2984 if (map[i].new_val > 0) {
2985 SCANENV_MEMENV(env)[pos] = SCANENV_MEMENV(env)[i];
2986 pos++;
2987 }
2988 }
2989
2990 loc = env->cap_history;
2991 MEM_STATUS_CLEAR(env->cap_history);
2992 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
2993 if (MEM_STATUS_AT(loc, i)) {
2994 MEM_STATUS_ON_SIMPLE(env->cap_history, map[i].new_val);
2995 }
2996 }
2997
2998 env->num_mem = env->num_named;
2999 reg->num_mem = env->num_named;
3000
3001 return onig_renumber_name_table(reg, map);
3002 }
3003
3004 #ifdef USE_CALL
3005 static int
fix_unset_addr_list(UnsetAddrList * uslist,regex_t * reg)3006 fix_unset_addr_list(UnsetAddrList* uslist, regex_t* reg)
3007 {
3008 int i, offset;
3009 BagNode* en;
3010 AbsAddrType addr;
3011 AbsAddrType* paddr;
3012
3013 for (i = 0; i < uslist->num; i++) {
3014 if (! NODE_IS_FIXED_ADDR(uslist->us[i].target)) {
3015 if (NODE_IS_CALLED(uslist->us[i].target))
3016 return ONIGERR_PARSER_BUG;
3017 else {
3018 /* CASE: called node doesn't have called address.
3019 ex. /((|a\g<1>)(.){0}){0}\g<3>/
3020 group-1 doesn't called, but compiled into bytecodes,
3021 because group-3 is referred from outside.
3022 */
3023 continue;
3024 }
3025 }
3026
3027 en = BAG_(uslist->us[i].target);
3028 addr = en->m.called_addr;
3029 offset = uslist->us[i].offset;
3030
3031 paddr = (AbsAddrType* )((char* )reg->ops + offset);
3032 *paddr = addr;
3033 }
3034 return 0;
3035 }
3036 #endif
3037
3038 /* x is not included y ==> 1 : 0 */
3039 static int
is_exclusive(Node * x,Node * y,regex_t * reg)3040 is_exclusive(Node* x, Node* y, regex_t* reg)
3041 {
3042 int i, len;
3043 OnigCodePoint code;
3044 UChar *p;
3045 NodeType ytype;
3046
3047 retry:
3048 ytype = NODE_TYPE(y);
3049 switch (NODE_TYPE(x)) {
3050 case NODE_CTYPE:
3051 {
3052 if (CTYPE_(x)->ctype == CTYPE_ANYCHAR ||
3053 CTYPE_(y)->ctype == CTYPE_ANYCHAR)
3054 break;
3055
3056 switch (ytype) {
3057 case NODE_CTYPE:
3058 if (CTYPE_(y)->ctype == CTYPE_(x)->ctype &&
3059 CTYPE_(y)->not != CTYPE_(x)->not &&
3060 CTYPE_(y)->ascii_mode == CTYPE_(x)->ascii_mode)
3061 return 1;
3062 else
3063 return 0;
3064 break;
3065
3066 case NODE_CCLASS:
3067 swap:
3068 {
3069 Node* tmp;
3070 tmp = x; x = y; y = tmp;
3071 goto retry;
3072 }
3073 break;
3074
3075 case NODE_STRING:
3076 goto swap;
3077 break;
3078
3079 default:
3080 break;
3081 }
3082 }
3083 break;
3084
3085 case NODE_CCLASS:
3086 {
3087 int range;
3088 CClassNode* xc = CCLASS_(x);
3089
3090 switch (ytype) {
3091 case NODE_CTYPE:
3092 switch (CTYPE_(y)->ctype) {
3093 case CTYPE_ANYCHAR:
3094 return 0;
3095 break;
3096
3097 case ONIGENC_CTYPE_WORD:
3098 if (CTYPE_(y)->not == 0) {
3099 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
3100 range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
3101 for (i = 0; i < range; i++) {
3102 if (BITSET_AT(xc->bs, i)) {
3103 if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
3104 }
3105 }
3106 return 1;
3107 }
3108 return 0;
3109 }
3110 else {
3111 if (IS_NOT_NULL(xc->mbuf)) return 0;
3112 if (IS_NCCLASS_NOT(xc)) return 0;
3113
3114 range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
3115 for (i = 0; i < range; i++) {
3116 if (! ONIGENC_IS_CODE_WORD(reg->enc, i)) {
3117 if (BITSET_AT(xc->bs, i))
3118 return 0;
3119 }
3120 }
3121 for (i = range; i < SINGLE_BYTE_SIZE; i++) {
3122 if (BITSET_AT(xc->bs, i)) return 0;
3123 }
3124 return 1;
3125 }
3126 break;
3127
3128 default:
3129 break;
3130 }
3131 break;
3132
3133 case NODE_CCLASS:
3134 {
3135 int v;
3136 CClassNode* yc = CCLASS_(y);
3137
3138 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
3139 v = BITSET_AT(xc->bs, i);
3140 if ((v != 0 && !IS_NCCLASS_NOT(xc)) || (v == 0 && IS_NCCLASS_NOT(xc))) {
3141 v = BITSET_AT(yc->bs, i);
3142 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
3143 (v == 0 && IS_NCCLASS_NOT(yc)))
3144 return 0;
3145 }
3146 }
3147 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
3148 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
3149 return 1;
3150 return 0;
3151 }
3152 break;
3153
3154 case NODE_STRING:
3155 goto swap;
3156 break;
3157
3158 default:
3159 break;
3160 }
3161 }
3162 break;
3163
3164 case NODE_STRING:
3165 {
3166 StrNode* xs = STR_(x);
3167
3168 if (NODE_STRING_LEN(x) == 0)
3169 break;
3170
3171 switch (ytype) {
3172 case NODE_CTYPE:
3173 switch (CTYPE_(y)->ctype) {
3174 case CTYPE_ANYCHAR:
3175 break;
3176
3177 case ONIGENC_CTYPE_WORD:
3178 if (CTYPE_(y)->ascii_mode == 0) {
3179 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
3180 return CTYPE_(y)->not;
3181 else
3182 return !(CTYPE_(y)->not);
3183 }
3184 else {
3185 if (ONIGENC_IS_MBC_WORD_ASCII(reg->enc, xs->s, xs->end))
3186 return CTYPE_(y)->not;
3187 else
3188 return !(CTYPE_(y)->not);
3189 }
3190 break;
3191 default:
3192 break;
3193 }
3194 break;
3195
3196 case NODE_CCLASS:
3197 {
3198 CClassNode* cc = CCLASS_(y);
3199
3200 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
3201 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
3202 return onig_is_code_in_cc(reg->enc, code, cc) == 0;
3203 }
3204 break;
3205
3206 case NODE_STRING:
3207 {
3208 UChar *q;
3209 StrNode* ys = STR_(y);
3210
3211 len = NODE_STRING_LEN(x);
3212 if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y);
3213
3214 for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
3215 if (*p != *q) return 1;
3216 }
3217 }
3218 break;
3219
3220 default:
3221 break;
3222 }
3223 }
3224 break;
3225
3226 default:
3227 break;
3228 }
3229
3230 return 0;
3231 }
3232
3233 static Node*
get_tree_head_literal(Node * node,int exact,regex_t * reg)3234 get_tree_head_literal(Node* node, int exact, regex_t* reg)
3235 {
3236 Node* n = NULL_NODE;
3237
3238 switch (NODE_TYPE(node)) {
3239 case NODE_BACKREF:
3240 case NODE_ALT:
3241 #ifdef USE_CALL
3242 case NODE_CALL:
3243 #endif
3244 break;
3245
3246 case NODE_CTYPE:
3247 if (CTYPE_(node)->ctype == CTYPE_ANYCHAR)
3248 break;
3249 /* fall */
3250 case NODE_CCLASS:
3251 if (exact == 0) {
3252 n = node;
3253 }
3254 break;
3255
3256 case NODE_LIST:
3257 n = get_tree_head_literal(NODE_CAR(node), exact, reg);
3258 break;
3259
3260 case NODE_STRING:
3261 {
3262 StrNode* sn = STR_(node);
3263
3264 if (sn->end <= sn->s)
3265 break;
3266
3267 if (exact == 0 ||
3268 ! NODE_IS_IGNORECASE(node) || NODE_STRING_IS_CRUDE(node)) {
3269 n = node;
3270 }
3271 }
3272 break;
3273
3274 case NODE_QUANT:
3275 {
3276 QuantNode* qn = QUANT_(node);
3277 if (qn->lower > 0) {
3278 if (IS_NOT_NULL(qn->head_exact))
3279 n = qn->head_exact;
3280 else
3281 n = get_tree_head_literal(NODE_BODY(node), exact, reg);
3282 }
3283 }
3284 break;
3285
3286 case NODE_BAG:
3287 {
3288 BagNode* en = BAG_(node);
3289 switch (en->type) {
3290 case BAG_OPTION:
3291 case BAG_MEMORY:
3292 case BAG_STOP_BACKTRACK:
3293 case BAG_IF_ELSE:
3294 n = get_tree_head_literal(NODE_BODY(node), exact, reg);
3295 break;
3296 }
3297 }
3298 break;
3299
3300 case NODE_ANCHOR:
3301 if (ANCHOR_(node)->type == ANCR_PREC_READ)
3302 n = get_tree_head_literal(NODE_BODY(node), exact, reg);
3303 break;
3304
3305 case NODE_GIMMICK:
3306 default:
3307 break;
3308 }
3309
3310 return n;
3311 }
3312
3313 enum GetValue {
3314 GET_VALUE_NONE = -1,
3315 GET_VALUE_IGNORE = 0,
3316 GET_VALUE_FOUND = 1
3317 };
3318
3319 static int
get_tree_tail_literal(Node * node,Node ** rnode,regex_t * reg)3320 get_tree_tail_literal(Node* node, Node** rnode, regex_t* reg)
3321 {
3322 int r;
3323
3324 switch (NODE_TYPE(node)) {
3325 case NODE_LIST:
3326 if (IS_NULL(NODE_CDR(node))) {
3327 r = get_tree_tail_literal(NODE_CAR(node), rnode, reg);
3328 }
3329 else {
3330 r = get_tree_tail_literal(NODE_CDR(node), rnode, reg);
3331 if (r == GET_VALUE_IGNORE) {
3332 r = get_tree_tail_literal(NODE_CAR(node), rnode, reg);
3333 }
3334 }
3335 break;
3336
3337 #ifdef USE_CALL
3338 case NODE_CALL:
3339 r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3340 break;
3341 #endif
3342
3343 case NODE_CTYPE:
3344 if (CTYPE_(node)->ctype == CTYPE_ANYCHAR) {
3345 r = GET_VALUE_NONE;
3346 break;
3347 }
3348 /* fall */
3349 case NODE_CCLASS:
3350 *rnode = node;
3351 r = GET_VALUE_FOUND;
3352 break;
3353
3354 case NODE_STRING:
3355 {
3356 StrNode* sn = STR_(node);
3357
3358 if (sn->end <= sn->s) {
3359 r = GET_VALUE_IGNORE;
3360 break;
3361 }
3362
3363 if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) {
3364 r = GET_VALUE_NONE;
3365 break;
3366 }
3367
3368 *rnode = node;
3369 r = GET_VALUE_FOUND;
3370 }
3371 break;
3372
3373 case NODE_QUANT:
3374 {
3375 QuantNode* qn = QUANT_(node);
3376 if (qn->lower != 0) {
3377 r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3378 }
3379 else
3380 r = GET_VALUE_NONE;
3381 }
3382 break;
3383
3384 case NODE_BAG:
3385 {
3386 BagNode* en = BAG_(node);
3387
3388 if (en->type == BAG_MEMORY) {
3389 if (NODE_IS_MARK1(node))
3390 r = GET_VALUE_NONE;
3391 else {
3392 NODE_STATUS_ADD(node, MARK1);
3393 r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3394 NODE_STATUS_REMOVE(node, MARK1);
3395 }
3396 }
3397 else {
3398 r = get_tree_tail_literal(NODE_BODY(node), rnode, reg);
3399 }
3400 }
3401 break;
3402
3403 case NODE_ANCHOR:
3404 case NODE_GIMMICK:
3405 r = GET_VALUE_IGNORE;
3406 break;
3407
3408 case NODE_ALT:
3409 case NODE_BACKREF:
3410 default:
3411 r = GET_VALUE_NONE;
3412 break;
3413 }
3414
3415 return r;
3416 }
3417
3418 static int
check_called_node_in_look_behind(Node * node,int not)3419 check_called_node_in_look_behind(Node* node, int not)
3420 {
3421 int r;
3422
3423 r = 0;
3424
3425 switch (NODE_TYPE(node)) {
3426 case NODE_LIST:
3427 case NODE_ALT:
3428 do {
3429 r = check_called_node_in_look_behind(NODE_CAR(node), not);
3430 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3431 break;
3432
3433 case NODE_QUANT:
3434 r = check_called_node_in_look_behind(NODE_BODY(node), not);
3435 break;
3436
3437 case NODE_BAG:
3438 {
3439 BagNode* en = BAG_(node);
3440
3441 if (en->type == BAG_MEMORY) {
3442 if (NODE_IS_MARK1(node))
3443 return 0;
3444 else {
3445 NODE_STATUS_ADD(node, MARK1);
3446 r = check_called_node_in_look_behind(NODE_BODY(node), not);
3447 NODE_STATUS_REMOVE(node, MARK1);
3448 }
3449 }
3450 else {
3451 r = check_called_node_in_look_behind(NODE_BODY(node), not);
3452 if (r == 0 && en->type == BAG_IF_ELSE) {
3453 if (IS_NOT_NULL(en->te.Then)) {
3454 r = check_called_node_in_look_behind(en->te.Then, not);
3455 if (r != 0) break;
3456 }
3457 if (IS_NOT_NULL(en->te.Else)) {
3458 r = check_called_node_in_look_behind(en->te.Else, not);
3459 }
3460 }
3461 }
3462 }
3463 break;
3464
3465 case NODE_ANCHOR:
3466 if (IS_NOT_NULL(NODE_BODY(node)))
3467 r = check_called_node_in_look_behind(NODE_BODY(node), not);
3468 break;
3469
3470 case NODE_GIMMICK:
3471 if (NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) != 0)
3472 return 1;
3473 break;
3474
3475 default:
3476 break;
3477 }
3478
3479 return r;
3480 }
3481
3482 /* allowed node types in look-behind */
3483 #define ALLOWED_TYPE_IN_LB \
3484 ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \
3485 | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \
3486 | NODE_BIT_CALL | NODE_BIT_BACKREF | NODE_BIT_GIMMICK)
3487
3488 #define ALLOWED_BAG_IN_LB ( 1<<BAG_MEMORY | 1<<BAG_OPTION | 1<<BAG_STOP_BACKTRACK | 1<<BAG_IF_ELSE )
3489 #define ALLOWED_BAG_IN_LB_NOT ( 1<<BAG_OPTION | 1<<BAG_STOP_BACKTRACK | 1<<BAG_IF_ELSE )
3490
3491 #define ALLOWED_ANCHOR_IN_LB \
3492 ( ANCR_LOOK_BEHIND | ANCR_BEGIN_LINE | ANCR_END_LINE | ANCR_BEGIN_BUF \
3493 | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY | ANCR_NO_WORD_BOUNDARY \
3494 | ANCR_WORD_BEGIN | ANCR_WORD_END \
3495 | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
3496
3497 #define ALLOWED_ANCHOR_IN_LB_NOT \
3498 ( ANCR_LOOK_BEHIND | ANCR_LOOK_BEHIND_NOT | ANCR_BEGIN_LINE \
3499 | ANCR_END_LINE | ANCR_BEGIN_BUF | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY \
3500 | ANCR_NO_WORD_BOUNDARY | ANCR_WORD_BEGIN | ANCR_WORD_END \
3501 | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
3502
3503
3504 static int
check_node_in_look_behind(Node * node,int not,int * used)3505 check_node_in_look_behind(Node* node, int not, int* used)
3506 {
3507 static unsigned int
3508 bag_mask[2] = { ALLOWED_BAG_IN_LB, ALLOWED_BAG_IN_LB_NOT };
3509
3510 static unsigned int
3511 anchor_mask[2] = { ALLOWED_ANCHOR_IN_LB, ALLOWED_ANCHOR_IN_LB_NOT };
3512
3513 NodeType type;
3514 int r = 0;
3515
3516 type = NODE_TYPE(node);
3517 if ((NODE_TYPE2BIT(type) & ALLOWED_TYPE_IN_LB) == 0)
3518 return 1;
3519
3520 switch (type) {
3521 case NODE_LIST:
3522 case NODE_ALT:
3523 do {
3524 r = check_node_in_look_behind(NODE_CAR(node), not, used);
3525 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3526 break;
3527
3528 case NODE_QUANT:
3529 r = check_node_in_look_behind(NODE_BODY(node), not, used);
3530 break;
3531
3532 case NODE_BAG:
3533 {
3534 BagNode* en = BAG_(node);
3535 if (((1<<en->type) & bag_mask[not]) == 0)
3536 return 1;
3537
3538 r = check_node_in_look_behind(NODE_BODY(node), not, used);
3539 if (r != 0) break;
3540
3541 if (en->type == BAG_MEMORY) {
3542 if (NODE_IS_BACKREF(node) || NODE_IS_CALLED(node)) *used = TRUE;
3543 }
3544 else if (en->type == BAG_IF_ELSE) {
3545 if (IS_NOT_NULL(en->te.Then)) {
3546 r = check_node_in_look_behind(en->te.Then, not, used);
3547 if (r != 0) break;
3548 }
3549 if (IS_NOT_NULL(en->te.Else)) {
3550 r = check_node_in_look_behind(en->te.Else, not, used);
3551 }
3552 }
3553 }
3554 break;
3555
3556 case NODE_ANCHOR:
3557 type = ANCHOR_(node)->type;
3558 if ((type & anchor_mask[not]) == 0)
3559 return 1;
3560
3561 if (IS_NOT_NULL(NODE_BODY(node)))
3562 r = check_node_in_look_behind(NODE_BODY(node), not, used);
3563 break;
3564
3565 case NODE_GIMMICK:
3566 if (NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) != 0)
3567 return 1;
3568 break;
3569
3570 case NODE_CALL:
3571 r = check_called_node_in_look_behind(NODE_BODY(node), not);
3572 break;
3573
3574 default:
3575 break;
3576 }
3577 return r;
3578 }
3579
3580 static OnigLen
node_min_byte_len(Node * node,ScanEnv * env)3581 node_min_byte_len(Node* node, ScanEnv* env)
3582 {
3583 OnigLen len;
3584 OnigLen tmin;
3585
3586 len = 0;
3587 switch (NODE_TYPE(node)) {
3588 case NODE_BACKREF:
3589 if (! NODE_IS_CHECKER(node)) {
3590 int i;
3591 int* backs;
3592 MemEnv* mem_env = SCANENV_MEMENV(env);
3593 BackRefNode* br = BACKREF_(node);
3594 if (NODE_IS_RECURSION(node)) break;
3595
3596 backs = BACKREFS_P(br);
3597 len = node_min_byte_len(mem_env[backs[0]].mem_node, env);
3598 for (i = 1; i < br->back_num; i++) {
3599 tmin = node_min_byte_len(mem_env[backs[i]].mem_node, env);
3600 if (len > tmin) len = tmin;
3601 }
3602 }
3603 break;
3604
3605 #ifdef USE_CALL
3606 case NODE_CALL:
3607 {
3608 Node* t = NODE_BODY(node);
3609 if (NODE_IS_RECURSION(node)) {
3610 if (NODE_IS_FIXED_MIN(t))
3611 len = BAG_(t)->min_len;
3612 }
3613 else
3614 len = node_min_byte_len(t, env);
3615 }
3616 break;
3617 #endif
3618
3619 case NODE_LIST:
3620 do {
3621 tmin = node_min_byte_len(NODE_CAR(node), env);
3622 len = distance_add(len, tmin);
3623 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3624 break;
3625
3626 case NODE_ALT:
3627 {
3628 Node *x, *y;
3629 y = node;
3630 do {
3631 x = NODE_CAR(y);
3632 tmin = node_min_byte_len(x, env);
3633 if (y == node) len = tmin;
3634 else if (len > tmin) len = tmin;
3635 } while (IS_NOT_NULL(y = NODE_CDR(y)));
3636 }
3637 break;
3638
3639 case NODE_STRING:
3640 {
3641 StrNode* sn = STR_(node);
3642 len = (int )(sn->end - sn->s);
3643 }
3644 break;
3645
3646 case NODE_CTYPE:
3647 case NODE_CCLASS:
3648 len = ONIGENC_MBC_MINLEN(env->enc);
3649 break;
3650
3651 case NODE_QUANT:
3652 {
3653 QuantNode* qn = QUANT_(node);
3654
3655 if (qn->lower > 0) {
3656 len = node_min_byte_len(NODE_BODY(node), env);
3657 len = distance_multiply(len, qn->lower);
3658 }
3659 }
3660 break;
3661
3662 case NODE_BAG:
3663 {
3664 BagNode* en = BAG_(node);
3665 switch (en->type) {
3666 case BAG_MEMORY:
3667 if (NODE_IS_FIXED_MIN(node))
3668 len = en->min_len;
3669 else {
3670 if (NODE_IS_MARK1(node))
3671 len = 0; /* recursive */
3672 else {
3673 NODE_STATUS_ADD(node, MARK1);
3674 len = node_min_byte_len(NODE_BODY(node), env);
3675 NODE_STATUS_REMOVE(node, MARK1);
3676
3677 en->min_len = len;
3678 NODE_STATUS_ADD(node, FIXED_MIN);
3679 }
3680 }
3681 break;
3682
3683 case BAG_OPTION:
3684 case BAG_STOP_BACKTRACK:
3685 len = node_min_byte_len(NODE_BODY(node), env);
3686 break;
3687 case BAG_IF_ELSE:
3688 {
3689 OnigLen elen;
3690
3691 len = node_min_byte_len(NODE_BODY(node), env);
3692 if (IS_NOT_NULL(en->te.Then))
3693 len += node_min_byte_len(en->te.Then, env);
3694 if (IS_NOT_NULL(en->te.Else))
3695 elen = node_min_byte_len(en->te.Else, env);
3696 else elen = 0;
3697
3698 if (elen < len) len = elen;
3699 }
3700 break;
3701 }
3702 }
3703 break;
3704
3705 case NODE_GIMMICK:
3706 {
3707 GimmickNode* g = GIMMICK_(node);
3708 if (g->type == GIMMICK_FAIL) {
3709 len = INFINITE_LEN;
3710 break;
3711 }
3712 }
3713 /* fall */
3714 case NODE_ANCHOR:
3715 default:
3716 break;
3717 }
3718
3719 return len;
3720 }
3721
3722 static OnigLen
node_max_byte_len(Node * node,ScanEnv * env)3723 node_max_byte_len(Node* node, ScanEnv* env)
3724 {
3725 OnigLen len;
3726 OnigLen tmax;
3727
3728 len = 0;
3729 switch (NODE_TYPE(node)) {
3730 case NODE_LIST:
3731 do {
3732 tmax = node_max_byte_len(NODE_CAR(node), env);
3733 len = distance_add(len, tmax);
3734 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3735 break;
3736
3737 case NODE_ALT:
3738 do {
3739 tmax = node_max_byte_len(NODE_CAR(node), env);
3740 if (len < tmax) len = tmax;
3741 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3742 break;
3743
3744 case NODE_STRING:
3745 {
3746 StrNode* sn = STR_(node);
3747 len = (OnigLen )(sn->end - sn->s);
3748 }
3749 break;
3750
3751 case NODE_CTYPE:
3752 case NODE_CCLASS:
3753 len = ONIGENC_MBC_MAXLEN_DIST(env->enc);
3754 break;
3755
3756 case NODE_BACKREF:
3757 if (! NODE_IS_CHECKER(node)) {
3758 int i;
3759 int* backs;
3760 MemEnv* mem_env = SCANENV_MEMENV(env);
3761 BackRefNode* br = BACKREF_(node);
3762 if (NODE_IS_RECURSION(node)) {
3763 #ifdef USE_BACKREF_WITH_LEVEL
3764 if (NODE_IS_NEST_LEVEL(node)) {
3765 len = INFINITE_LEN;
3766 }
3767 #endif
3768 break;
3769 }
3770 backs = BACKREFS_P(br);
3771 for (i = 0; i < br->back_num; i++) {
3772 tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env);
3773 if (len < tmax) len = tmax;
3774 }
3775 }
3776 break;
3777
3778 #ifdef USE_CALL
3779 case NODE_CALL:
3780 if (! NODE_IS_RECURSION(node))
3781 len = node_max_byte_len(NODE_BODY(node), env);
3782 else
3783 len = INFINITE_LEN;
3784 break;
3785 #endif
3786
3787 case NODE_QUANT:
3788 {
3789 QuantNode* qn = QUANT_(node);
3790
3791 if (qn->upper != 0) {
3792 len = node_max_byte_len(NODE_BODY(node), env);
3793 if (len != 0) {
3794 if (! IS_INFINITE_REPEAT(qn->upper))
3795 len = distance_multiply(len, qn->upper);
3796 else
3797 len = INFINITE_LEN;
3798 }
3799 }
3800 }
3801 break;
3802
3803 case NODE_BAG:
3804 {
3805 BagNode* en = BAG_(node);
3806 switch (en->type) {
3807 case BAG_MEMORY:
3808 if (NODE_IS_FIXED_MAX(node))
3809 len = en->max_len;
3810 else {
3811 if (NODE_IS_MARK1(node))
3812 len = INFINITE_LEN;
3813 else {
3814 NODE_STATUS_ADD(node, MARK1);
3815 len = node_max_byte_len(NODE_BODY(node), env);
3816 NODE_STATUS_REMOVE(node, MARK1);
3817
3818 en->max_len = len;
3819 NODE_STATUS_ADD(node, FIXED_MAX);
3820 }
3821 }
3822 break;
3823
3824 case BAG_OPTION:
3825 case BAG_STOP_BACKTRACK:
3826 len = node_max_byte_len(NODE_BODY(node), env);
3827 break;
3828 case BAG_IF_ELSE:
3829 {
3830 OnigLen tlen, elen;
3831
3832 len = node_max_byte_len(NODE_BODY(node), env);
3833 if (IS_NOT_NULL(en->te.Then)) {
3834 tlen = node_max_byte_len(en->te.Then, env);
3835 len = distance_add(len, tlen);
3836 }
3837 if (IS_NOT_NULL(en->te.Else))
3838 elen = node_max_byte_len(en->te.Else, env);
3839 else elen = 0;
3840
3841 if (elen > len) len = elen;
3842 }
3843 break;
3844 }
3845 }
3846 break;
3847
3848 case NODE_ANCHOR:
3849 case NODE_GIMMICK:
3850 default:
3851 break;
3852 }
3853
3854 return len;
3855 }
3856
3857 static int
check_backrefs(Node * node,ScanEnv * env)3858 check_backrefs(Node* node, ScanEnv* env)
3859 {
3860 int r;
3861
3862 switch (NODE_TYPE(node)) {
3863 case NODE_LIST:
3864 case NODE_ALT:
3865 do {
3866 r = check_backrefs(NODE_CAR(node), env);
3867 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3868 break;
3869
3870 case NODE_ANCHOR:
3871 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3872 r = 0;
3873 break;
3874 }
3875 /* fall */
3876 case NODE_QUANT:
3877 r = check_backrefs(NODE_BODY(node), env);
3878 break;
3879
3880 case NODE_BAG:
3881 r = check_backrefs(NODE_BODY(node), env);
3882 {
3883 BagNode* en = BAG_(node);
3884
3885 if (en->type == BAG_IF_ELSE) {
3886 if (r != 0) return r;
3887 if (IS_NOT_NULL(en->te.Then)) {
3888 r = check_backrefs(en->te.Then, env);
3889 if (r != 0) return r;
3890 }
3891 if (IS_NOT_NULL(en->te.Else)) {
3892 r = check_backrefs(en->te.Else, env);
3893 }
3894 }
3895 }
3896 break;
3897
3898 case NODE_BACKREF:
3899 {
3900 int i;
3901 BackRefNode* br = BACKREF_(node);
3902 int* backs = BACKREFS_P(br);
3903 MemEnv* mem_env = SCANENV_MEMENV(env);
3904
3905 for (i = 0; i < br->back_num; i++) {
3906 if (backs[i] > env->num_mem)
3907 return ONIGERR_INVALID_BACKREF;
3908
3909 NODE_STATUS_ADD(mem_env[backs[i]].mem_node, BACKREF);
3910 }
3911 r = 0;
3912 }
3913 break;
3914
3915 default:
3916 r = 0;
3917 break;
3918 }
3919
3920 return r;
3921 }
3922
3923 static int
set_empty_repeat_node_trav(Node * node,Node * empty,ScanEnv * env)3924 set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env)
3925 {
3926 int r;
3927
3928 switch (NODE_TYPE(node)) {
3929 case NODE_LIST:
3930 case NODE_ALT:
3931 do {
3932 r = set_empty_repeat_node_trav(NODE_CAR(node), empty, env);
3933 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3934 break;
3935
3936 case NODE_ANCHOR:
3937 {
3938 AnchorNode* an = ANCHOR_(node);
3939
3940 if (! ANCHOR_HAS_BODY(an)) {
3941 r = 0;
3942 break;
3943 }
3944
3945 switch (an->type) {
3946 case ANCR_PREC_READ:
3947 case ANCR_LOOK_BEHIND:
3948 empty = NULL_NODE;
3949 break;
3950 default:
3951 break;
3952 }
3953 r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3954 }
3955 break;
3956
3957 case NODE_QUANT:
3958 {
3959 QuantNode* qn = QUANT_(node);
3960
3961 if (qn->emptiness != BODY_IS_NOT_EMPTY) empty = node;
3962 r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3963 }
3964 break;
3965
3966 case NODE_BAG:
3967 if (IS_NOT_NULL(NODE_BODY(node))) {
3968 r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env);
3969 if (r != 0) return r;
3970 }
3971 {
3972 BagNode* en = BAG_(node);
3973
3974 if (en->type == BAG_MEMORY) {
3975 if (NODE_IS_BACKREF(node)) {
3976 if (IS_NOT_NULL(empty))
3977 SCANENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty;
3978 }
3979 }
3980 else if (en->type == BAG_IF_ELSE) {
3981 if (IS_NOT_NULL(en->te.Then)) {
3982 r = set_empty_repeat_node_trav(en->te.Then, empty, env);
3983 if (r != 0) return r;
3984 }
3985 if (IS_NOT_NULL(en->te.Else)) {
3986 r = set_empty_repeat_node_trav(en->te.Else, empty, env);
3987 }
3988 }
3989 }
3990 break;
3991
3992 default:
3993 r = 0;
3994 break;
3995 }
3996
3997 return r;
3998 }
3999
4000 static int
is_ancestor_node(Node * node,Node * me)4001 is_ancestor_node(Node* node, Node* me)
4002 {
4003 Node* parent;
4004
4005 while ((parent = NODE_PARENT(me)) != NULL_NODE) {
4006 if (parent == node) return 1;
4007 me = parent;
4008 }
4009 return 0;
4010 }
4011
4012 static void
set_empty_status_check_trav(Node * node,ScanEnv * env)4013 set_empty_status_check_trav(Node* node, ScanEnv* env)
4014 {
4015 switch (NODE_TYPE(node)) {
4016 case NODE_LIST:
4017 case NODE_ALT:
4018 do {
4019 set_empty_status_check_trav(NODE_CAR(node), env);
4020 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4021 break;
4022
4023 case NODE_ANCHOR:
4024 {
4025 AnchorNode* an = ANCHOR_(node);
4026
4027 if (! ANCHOR_HAS_BODY(an)) break;
4028 set_empty_status_check_trav(NODE_BODY(node), env);
4029 }
4030 break;
4031
4032 case NODE_QUANT:
4033 set_empty_status_check_trav(NODE_BODY(node), env);
4034 break;
4035
4036 case NODE_BAG:
4037 if (IS_NOT_NULL(NODE_BODY(node)))
4038 set_empty_status_check_trav(NODE_BODY(node), env);
4039 {
4040 BagNode* en = BAG_(node);
4041
4042 if (en->type == BAG_IF_ELSE) {
4043 if (IS_NOT_NULL(en->te.Then)) {
4044 set_empty_status_check_trav(en->te.Then, env);
4045 }
4046 if (IS_NOT_NULL(en->te.Else)) {
4047 set_empty_status_check_trav(en->te.Else, env);
4048 }
4049 }
4050 }
4051 break;
4052
4053 case NODE_BACKREF:
4054 {
4055 int i;
4056 int* backs;
4057 MemEnv* mem_env = SCANENV_MEMENV(env);
4058 BackRefNode* br = BACKREF_(node);
4059 backs = BACKREFS_P(br);
4060 for (i = 0; i < br->back_num; i++) {
4061 Node* ernode = mem_env[backs[i]].empty_repeat_node;
4062 if (IS_NOT_NULL(ernode)) {
4063 if (! is_ancestor_node(ernode, node)) {
4064 MEM_STATUS_LIMIT_ON(env->reg->empty_status_mem, backs[i]);
4065 NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK);
4066 NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK);
4067 }
4068 }
4069 }
4070 }
4071 break;
4072
4073 default:
4074 break;
4075 }
4076 }
4077
4078 static void
set_parent_node_trav(Node * node,Node * parent)4079 set_parent_node_trav(Node* node, Node* parent)
4080 {
4081 NODE_PARENT(node) = parent;
4082
4083 switch (NODE_TYPE(node)) {
4084 case NODE_LIST:
4085 case NODE_ALT:
4086 do {
4087 set_parent_node_trav(NODE_CAR(node), node);
4088 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4089 break;
4090
4091 case NODE_ANCHOR:
4092 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) break;
4093 set_parent_node_trav(NODE_BODY(node), node);
4094 break;
4095
4096 case NODE_QUANT:
4097 set_parent_node_trav(NODE_BODY(node), node);
4098 break;
4099
4100 case NODE_BAG:
4101 if (IS_NOT_NULL(NODE_BODY(node)))
4102 set_parent_node_trav(NODE_BODY(node), node);
4103 {
4104 BagNode* en = BAG_(node);
4105
4106 if (en->type == BAG_IF_ELSE) {
4107 if (IS_NOT_NULL(en->te.Then))
4108 set_parent_node_trav(en->te.Then, node);
4109 if (IS_NOT_NULL(en->te.Else)) {
4110 set_parent_node_trav(en->te.Else, node);
4111 }
4112 }
4113 }
4114 break;
4115
4116 default:
4117 break;
4118 }
4119 }
4120
4121
4122 #ifdef USE_CALL
4123
4124 #define RECURSION_EXIST (1<<0)
4125 #define RECURSION_MUST (1<<1)
4126 #define RECURSION_INFINITE (1<<2)
4127
4128 static int
infinite_recursive_call_check(Node * node,ScanEnv * env,int head)4129 infinite_recursive_call_check(Node* node, ScanEnv* env, int head)
4130 {
4131 int ret;
4132 int r = 0;
4133
4134 switch (NODE_TYPE(node)) {
4135 case NODE_LIST:
4136 {
4137 Node *x;
4138 OnigLen min;
4139
4140 x = node;
4141 do {
4142 ret = infinite_recursive_call_check(NODE_CAR(x), env, head);
4143 if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
4144 r |= ret;
4145 if (head != 0) {
4146 min = node_min_byte_len(NODE_CAR(x), env);
4147 if (min != 0) head = 0;
4148 }
4149 } while (IS_NOT_NULL(x = NODE_CDR(x)));
4150 }
4151 break;
4152
4153 case NODE_ALT:
4154 {
4155 int must;
4156
4157 must = RECURSION_MUST;
4158 do {
4159 ret = infinite_recursive_call_check(NODE_CAR(node), env, head);
4160 if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
4161
4162 r |= (ret & RECURSION_EXIST);
4163 must &= ret;
4164 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4165 r |= must;
4166 }
4167 break;
4168
4169 case NODE_QUANT:
4170 r = infinite_recursive_call_check(NODE_BODY(node), env, head);
4171 if (r < 0) return r;
4172 if ((r & RECURSION_MUST) != 0) {
4173 if (QUANT_(node)->lower == 0)
4174 r &= ~RECURSION_MUST;
4175 }
4176 break;
4177
4178 case NODE_ANCHOR:
4179 if (! ANCHOR_HAS_BODY(ANCHOR_(node)))
4180 break;
4181 /* fall */
4182 case NODE_CALL:
4183 r = infinite_recursive_call_check(NODE_BODY(node), env, head);
4184 break;
4185
4186 case NODE_BAG:
4187 {
4188 BagNode* en = BAG_(node);
4189
4190 if (en->type == BAG_MEMORY) {
4191 if (NODE_IS_MARK2(node))
4192 return 0;
4193 else if (NODE_IS_MARK1(node))
4194 return (head == 0 ? RECURSION_EXIST | RECURSION_MUST
4195 : RECURSION_EXIST | RECURSION_MUST | RECURSION_INFINITE);
4196 else {
4197 NODE_STATUS_ADD(node, MARK2);
4198 r = infinite_recursive_call_check(NODE_BODY(node), env, head);
4199 NODE_STATUS_REMOVE(node, MARK2);
4200 }
4201 }
4202 else if (en->type == BAG_IF_ELSE) {
4203 int eret;
4204
4205 ret = infinite_recursive_call_check(NODE_BODY(node), env, head);
4206 if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
4207 r |= ret;
4208 if (IS_NOT_NULL(en->te.Then)) {
4209 OnigLen min;
4210 if (head != 0) {
4211 min = node_min_byte_len(NODE_BODY(node), env);
4212 }
4213 else min = 0;
4214
4215 ret = infinite_recursive_call_check(en->te.Then, env, min != 0 ? 0:head);
4216 if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
4217 r |= ret;
4218 }
4219 if (IS_NOT_NULL(en->te.Else)) {
4220 eret = infinite_recursive_call_check(en->te.Else, env, head);
4221 if (eret < 0 || (eret & RECURSION_INFINITE) != 0) return eret;
4222 r |= (eret & RECURSION_EXIST);
4223 if ((eret & RECURSION_MUST) == 0)
4224 r &= ~RECURSION_MUST;
4225 }
4226 else {
4227 r &= ~RECURSION_MUST;
4228 }
4229 }
4230 else {
4231 r = infinite_recursive_call_check(NODE_BODY(node), env, head);
4232 }
4233 }
4234 break;
4235
4236 default:
4237 break;
4238 }
4239
4240 return r;
4241 }
4242
4243 static int
infinite_recursive_call_check_trav(Node * node,ScanEnv * env)4244 infinite_recursive_call_check_trav(Node* node, ScanEnv* env)
4245 {
4246 int r;
4247
4248 switch (NODE_TYPE(node)) {
4249 case NODE_LIST:
4250 case NODE_ALT:
4251 do {
4252 r = infinite_recursive_call_check_trav(NODE_CAR(node), env);
4253 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4254 break;
4255
4256 case NODE_ANCHOR:
4257 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
4258 r = 0;
4259 break;
4260 }
4261 /* fall */
4262 case NODE_QUANT:
4263 r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
4264 break;
4265
4266 case NODE_BAG:
4267 {
4268 BagNode* en = BAG_(node);
4269
4270 if (en->type == BAG_MEMORY) {
4271 if (NODE_IS_RECURSION(node) && NODE_IS_CALLED(node)) {
4272 int ret;
4273
4274 NODE_STATUS_ADD(node, MARK1);
4275
4276 ret = infinite_recursive_call_check(NODE_BODY(node), env, 1);
4277 if (ret < 0) return ret;
4278 else if ((ret & (RECURSION_MUST | RECURSION_INFINITE)) != 0)
4279 return ONIGERR_NEVER_ENDING_RECURSION;
4280
4281 NODE_STATUS_REMOVE(node, MARK1);
4282 }
4283 }
4284 else if (en->type == BAG_IF_ELSE) {
4285 if (IS_NOT_NULL(en->te.Then)) {
4286 r = infinite_recursive_call_check_trav(en->te.Then, env);
4287 if (r != 0) return r;
4288 }
4289 if (IS_NOT_NULL(en->te.Else)) {
4290 r = infinite_recursive_call_check_trav(en->te.Else, env);
4291 if (r != 0) return r;
4292 }
4293 }
4294 }
4295
4296 r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
4297 break;
4298
4299 default:
4300 r = 0;
4301 break;
4302 }
4303
4304 return r;
4305 }
4306
4307 static int
recursive_call_check(Node * node)4308 recursive_call_check(Node* node)
4309 {
4310 int r;
4311
4312 switch (NODE_TYPE(node)) {
4313 case NODE_LIST:
4314 case NODE_ALT:
4315 r = 0;
4316 do {
4317 r |= recursive_call_check(NODE_CAR(node));
4318 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4319 break;
4320
4321 case NODE_ANCHOR:
4322 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
4323 r = 0;
4324 break;
4325 }
4326 /* fall */
4327 case NODE_QUANT:
4328 r = recursive_call_check(NODE_BODY(node));
4329 break;
4330
4331 case NODE_CALL:
4332 r = recursive_call_check(NODE_BODY(node));
4333 if (r != 0) {
4334 if (NODE_IS_MARK1(NODE_BODY(node)))
4335 NODE_STATUS_ADD(node, RECURSION);
4336 }
4337 break;
4338
4339 case NODE_BAG:
4340 {
4341 BagNode* en = BAG_(node);
4342
4343 if (en->type == BAG_MEMORY) {
4344 if (NODE_IS_MARK2(node))
4345 return 0;
4346 else if (NODE_IS_MARK1(node))
4347 return 1; /* recursion */
4348 else {
4349 NODE_STATUS_ADD(node, MARK2);
4350 r = recursive_call_check(NODE_BODY(node));
4351 NODE_STATUS_REMOVE(node, MARK2);
4352 }
4353 }
4354 else if (en->type == BAG_IF_ELSE) {
4355 r = 0;
4356 if (IS_NOT_NULL(en->te.Then)) {
4357 r |= recursive_call_check(en->te.Then);
4358 }
4359 if (IS_NOT_NULL(en->te.Else)) {
4360 r |= recursive_call_check(en->te.Else);
4361 }
4362 r |= recursive_call_check(NODE_BODY(node));
4363 }
4364 else {
4365 r = recursive_call_check(NODE_BODY(node));
4366 }
4367 }
4368 break;
4369
4370 default:
4371 r = 0;
4372 break;
4373 }
4374
4375 return r;
4376 }
4377
4378 #define IN_RECURSION (1<<0)
4379 #define FOUND_CALLED_NODE 1
4380
4381 static int
recursive_call_check_trav(Node * node,ScanEnv * env,int state)4382 recursive_call_check_trav(Node* node, ScanEnv* env, int state)
4383 {
4384 int r = 0;
4385
4386 switch (NODE_TYPE(node)) {
4387 case NODE_LIST:
4388 case NODE_ALT:
4389 {
4390 int ret;
4391 do {
4392 ret = recursive_call_check_trav(NODE_CAR(node), env, state);
4393 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
4394 else if (ret < 0) return ret;
4395 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4396 }
4397 break;
4398
4399 case NODE_QUANT:
4400 r = recursive_call_check_trav(NODE_BODY(node), env, state);
4401 if (QUANT_(node)->upper == 0) {
4402 if (r == FOUND_CALLED_NODE)
4403 QUANT_(node)->include_referred = 1;
4404 }
4405 break;
4406
4407 case NODE_ANCHOR:
4408 {
4409 AnchorNode* an = ANCHOR_(node);
4410 if (ANCHOR_HAS_BODY(an))
4411 r = recursive_call_check_trav(NODE_ANCHOR_BODY(an), env, state);
4412 }
4413 break;
4414
4415 case NODE_BAG:
4416 {
4417 int ret;
4418 int state1;
4419 BagNode* en = BAG_(node);
4420
4421 if (en->type == BAG_MEMORY) {
4422 if (NODE_IS_CALLED(node) || (state & IN_RECURSION) != 0) {
4423 if (! NODE_IS_RECURSION(node)) {
4424 NODE_STATUS_ADD(node, MARK1);
4425 r = recursive_call_check(NODE_BODY(node));
4426 if (r != 0) {
4427 NODE_STATUS_ADD(node, RECURSION);
4428 MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
4429 }
4430 NODE_STATUS_REMOVE(node, MARK1);
4431 }
4432
4433 if (NODE_IS_CALLED(node))
4434 r = FOUND_CALLED_NODE;
4435 }
4436 }
4437
4438 state1 = state;
4439 if (NODE_IS_RECURSION(node))
4440 state1 |= IN_RECURSION;
4441
4442 ret = recursive_call_check_trav(NODE_BODY(node), env, state1);
4443 if (ret == FOUND_CALLED_NODE)
4444 r = FOUND_CALLED_NODE;
4445
4446 if (en->type == BAG_IF_ELSE) {
4447 if (IS_NOT_NULL(en->te.Then)) {
4448 ret = recursive_call_check_trav(en->te.Then, env, state1);
4449 if (ret == FOUND_CALLED_NODE)
4450 r = FOUND_CALLED_NODE;
4451 }
4452 if (IS_NOT_NULL(en->te.Else)) {
4453 ret = recursive_call_check_trav(en->te.Else, env, state1);
4454 if (ret == FOUND_CALLED_NODE)
4455 r = FOUND_CALLED_NODE;
4456 }
4457 }
4458 }
4459 break;
4460
4461 default:
4462 break;
4463 }
4464
4465 return r;
4466 }
4467
4468 #endif
4469
4470 static void
remove_from_list(Node * prev,Node * a)4471 remove_from_list(Node* prev, Node* a)
4472 {
4473 if (NODE_CDR(prev) != a) return ;
4474
4475 NODE_CDR(prev) = NODE_CDR(a);
4476 NODE_CDR(a) = NULL_NODE;
4477 }
4478
4479 static int
reduce_string_list(Node * node)4480 reduce_string_list(Node* node)
4481 {
4482 int r = 0;
4483
4484 switch (NODE_TYPE(node)) {
4485 case NODE_LIST:
4486 {
4487 Node* prev;
4488 Node* curr;
4489 Node* prev_node;
4490 Node* next_node;
4491
4492 prev = NULL_NODE;
4493 do {
4494 next_node = NODE_CDR(node);
4495 curr = NODE_CAR(node);
4496 if (NODE_TYPE(curr) == NODE_STRING) {
4497 if (IS_NULL(prev)
4498 || STR_(curr)->flag != STR_(prev)->flag
4499 || NODE_STATUS(curr) != NODE_STATUS(prev)) {
4500 prev = curr;
4501 prev_node = node;
4502 }
4503 else {
4504 r = node_str_node_cat(prev, curr);
4505 if (r != 0) return r;
4506 remove_from_list(prev_node, node);
4507 onig_node_free(node);
4508 }
4509 }
4510 else {
4511 prev = NULL_NODE;
4512 prev_node = node;
4513 }
4514
4515 node = next_node;
4516 } while (r == 0 && IS_NOT_NULL(node));
4517 }
4518 break;
4519
4520 case NODE_ALT:
4521 do {
4522 r = reduce_string_list(NODE_CAR(node));
4523 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4524 break;
4525
4526 case NODE_ANCHOR:
4527 if (IS_NULL(NODE_BODY(node)))
4528 break;
4529 /* fall */
4530 case NODE_QUANT:
4531 r = reduce_string_list(NODE_BODY(node));
4532 break;
4533
4534 case NODE_BAG:
4535 {
4536 BagNode* en = BAG_(node);
4537
4538 r = reduce_string_list(NODE_BODY(node));
4539 if (r != 0) return r;
4540
4541 if (en->type == BAG_IF_ELSE) {
4542 if (IS_NOT_NULL(en->te.Then)) {
4543 r = reduce_string_list(en->te.Then);
4544 if (r != 0) return r;
4545 }
4546 if (IS_NOT_NULL(en->te.Else)) {
4547 r = reduce_string_list(en->te.Else);
4548 if (r != 0) return r;
4549 }
4550 }
4551 }
4552 break;
4553
4554 default:
4555 break;
4556 }
4557
4558 return r;
4559 }
4560
4561
4562 #define IN_ALT (1<<0)
4563 #define IN_NOT (1<<1)
4564 #define IN_REAL_REPEAT (1<<2)
4565 #define IN_VAR_REPEAT (1<<3)
4566 #define IN_ZERO_REPEAT (1<<4)
4567 #define IN_MULTI_ENTRY (1<<5)
4568 #define IN_LOOK_BEHIND (1<<6)
4569
4570
4571 /* divide different length alternatives in look-behind.
4572 (?<=A|B) ==> (?<=A)|(?<=B)
4573 (?<!A|B) ==> (?<!A)(?<!B)
4574 */
4575 static int
divide_look_behind_alternatives(Node * node)4576 divide_look_behind_alternatives(Node* node)
4577 {
4578 int r;
4579 int anc_type;
4580 Node *head, *np, *insert_node;
4581 AnchorNode* an;
4582
4583 an = ANCHOR_(node);
4584 anc_type = an->type;
4585
4586 head = NODE_ANCHOR_BODY(an);
4587 np = NODE_CAR(head);
4588 node_swap(node, head);
4589 NODE_CAR(node) = head;
4590 NODE_BODY(head) = np;
4591
4592 np = node;
4593 while (IS_NOT_NULL(np = NODE_CDR(np))) {
4594 r = onig_node_copy(&insert_node, head);
4595 if (r != 0) return r;
4596 CHECK_NULL_RETURN_MEMERR(insert_node);
4597 NODE_BODY(insert_node) = NODE_CAR(np);
4598 NODE_CAR(np) = insert_node;
4599 }
4600
4601 if (anc_type == ANCR_LOOK_BEHIND_NOT) {
4602 np = node;
4603 do {
4604 NODE_SET_TYPE(np, NODE_LIST); /* alt -> list */
4605 } while (IS_NOT_NULL(np = NODE_CDR(np)));
4606 }
4607 return 0;
4608 }
4609
4610 static int
node_reduce_in_look_behind(Node * node)4611 node_reduce_in_look_behind(Node* node)
4612 {
4613 NodeType type;
4614 Node* body;
4615
4616 if (NODE_TYPE(node) != NODE_QUANT) return 0;
4617
4618 body = NODE_BODY(node);
4619 type = NODE_TYPE(body);
4620 if (type == NODE_STRING || type == NODE_CTYPE ||
4621 type == NODE_CCLASS || type == NODE_BACKREF) {
4622 QuantNode* qn = QUANT_(node);
4623 qn->upper = qn->lower;
4624 if (qn->upper == 0)
4625 return 1; /* removed */
4626 }
4627
4628 return 0;
4629 }
4630
4631 static int
list_reduce_in_look_behind(Node * node)4632 list_reduce_in_look_behind(Node* node)
4633 {
4634 int r;
4635
4636 switch (NODE_TYPE(node)) {
4637 case NODE_QUANT:
4638 r = node_reduce_in_look_behind(node);
4639 if (r > 0) r = 0;
4640 break;
4641
4642 case NODE_LIST:
4643 do {
4644 r = node_reduce_in_look_behind(NODE_CAR(node));
4645 if (r <= 0) break;
4646 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4647 break;
4648
4649 default:
4650 r = 0;
4651 break;
4652 }
4653
4654 return r;
4655 }
4656
4657 static int
alt_reduce_in_look_behind(Node * node,regex_t * reg,ScanEnv * env)4658 alt_reduce_in_look_behind(Node* node, regex_t* reg, ScanEnv* env)
4659 {
4660 int r;
4661
4662 switch (NODE_TYPE(node)) {
4663 case NODE_ALT:
4664 do {
4665 r = list_reduce_in_look_behind(NODE_CAR(node));
4666 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4667 break;
4668
4669 default:
4670 r = list_reduce_in_look_behind(node);
4671 break;
4672 }
4673
4674 return r;
4675 }
4676
4677 static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env);
4678
4679 static int
tune_look_behind(Node * node,regex_t * reg,int state,ScanEnv * env)4680 tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env)
4681 {
4682 int r;
4683 int state1;
4684 int used;
4685 MinMaxCharLen ci;
4686 Node* body;
4687 AnchorNode* an = ANCHOR_(node);
4688
4689 used = FALSE;
4690 r = check_node_in_look_behind(NODE_ANCHOR_BODY(an),
4691 an->type == ANCR_LOOK_BEHIND_NOT ? 1 : 0,
4692 &used);
4693 if (r < 0) return r;
4694 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4695
4696 if (an->type == ANCR_LOOK_BEHIND_NOT)
4697 state1 = state | IN_NOT | IN_LOOK_BEHIND;
4698 else
4699 state1 = state | IN_LOOK_BEHIND;
4700
4701 body = NODE_ANCHOR_BODY(an);
4702 /* Execute tune_tree(body) before call node_char_len().
4703 Because case-fold expansion must be done before node_char_len().
4704 */
4705 r = tune_tree(body, reg, state1, env);
4706 if (r != 0) return r;
4707
4708 r = alt_reduce_in_look_behind(body, reg, env);
4709 if (r != 0) return r;
4710
4711 r = node_char_len(body, reg, &ci, env);
4712 if (r >= 0) {
4713 /* #177: overflow in onigenc_step_back() */
4714 if ((ci.max != INFINITE_LEN && ci.max > LOOK_BEHIND_MAX_CHAR_LEN)
4715 || ci.min > LOOK_BEHIND_MAX_CHAR_LEN) {
4716 return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4717 }
4718
4719 if (ci.min == 0 && ci.min_is_sure != 0 && used == FALSE) {
4720 if (an->type == ANCR_LOOK_BEHIND_NOT)
4721 r = onig_node_reset_fail(node);
4722 else
4723 r = onig_node_reset_empty(node);
4724
4725 return r;
4726 }
4727
4728 if (r == CHAR_LEN_TOP_ALT_FIXED) {
4729 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND)) {
4730 r = divide_look_behind_alternatives(node);
4731 if (r == 0)
4732 r = tune_tree(node, reg, state, env);
4733 }
4734 else if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND))
4735 goto normal;
4736 else
4737 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4738 }
4739 else { /* CHAR_LEN_NORMAL */
4740 normal:
4741 if (ci.min == INFINITE_LEN) {
4742 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4743 }
4744 else {
4745 if (ci.min != ci.max &&
4746 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND)) {
4747 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4748 }
4749 else {
4750 Node* tail;
4751
4752 /* check lead_node is already set by double call after
4753 divide_look_behind_alternatives() */
4754 if (IS_NULL(an->lead_node)) {
4755 an->char_min_len = ci.min;
4756 an->char_max_len = ci.max;
4757 r = get_tree_tail_literal(body, &tail, reg);
4758 if (r == GET_VALUE_FOUND) {
4759 r = onig_node_copy(&(an->lead_node), tail);
4760 if (r != 0) return r;
4761 }
4762 }
4763 r = ONIG_NORMAL;
4764 }
4765 }
4766 }
4767 }
4768
4769 return r;
4770 }
4771
4772 static int
tune_next(Node * node,Node * next_node,regex_t * reg)4773 tune_next(Node* node, Node* next_node, regex_t* reg)
4774 {
4775 NodeType type;
4776
4777 retry:
4778 type = NODE_TYPE(node);
4779 if (type == NODE_QUANT) {
4780 QuantNode* qn = QUANT_(node);
4781 if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) {
4782 #ifdef USE_QUANT_PEEK_NEXT
4783 Node* n = get_tree_head_literal(next_node, 1, reg);
4784 /* '\0': for UTF-16BE etc... */
4785 if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') {
4786 qn->next_head_exact = n;
4787 }
4788 #endif
4789 /* automatic posseivation a*b ==> (?>a*)b */
4790 if (qn->lower <= 1) {
4791 if (is_strict_real_node(NODE_BODY(node))) {
4792 Node *x, *y;
4793 x = get_tree_head_literal(NODE_BODY(node), 0, reg);
4794 if (IS_NOT_NULL(x)) {
4795 y = get_tree_head_literal(next_node, 0, reg);
4796 if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) {
4797 Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK);
4798 CHECK_NULL_RETURN_MEMERR(en);
4799 NODE_STATUS_ADD(en, STRICT_REAL_REPEAT);
4800 node_swap(node, en);
4801 NODE_BODY(node) = en;
4802 }
4803 }
4804 }
4805 }
4806 }
4807 }
4808 else if (type == NODE_BAG) {
4809 BagNode* en = BAG_(node);
4810 if (en->type == BAG_MEMORY) {
4811 node = NODE_BODY(node);
4812 goto retry;
4813 }
4814 }
4815 return 0;
4816 }
4817
4818
4819 static int
is_all_code_len_1_items(int n,OnigCaseFoldCodeItem items[])4820 is_all_code_len_1_items(int n, OnigCaseFoldCodeItem items[])
4821 {
4822 int i;
4823
4824 for (i = 0; i < n; i++) {
4825 OnigCaseFoldCodeItem* item = items + i;
4826 if (item->code_len != 1) return 0;
4827 }
4828
4829 return 1;
4830 }
4831
4832 static int
get_min_max_byte_len_case_fold_items(int n,OnigCaseFoldCodeItem items[],OnigLen * rmin,OnigLen * rmax)4833 get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[],
4834 OnigLen* rmin, OnigLen* rmax)
4835 {
4836 int i;
4837 OnigLen len, minlen, maxlen;
4838
4839 minlen = INFINITE_LEN;
4840 maxlen = 0;
4841 for (i = 0; i < n; i++) {
4842 OnigCaseFoldCodeItem* item = items + i;
4843
4844 len = item->byte_len;
4845 if (len < minlen) minlen = len;
4846 if (len > maxlen) maxlen = len;
4847 }
4848
4849 *rmin = minlen;
4850 *rmax = maxlen;
4851 return 0;
4852 }
4853
4854 static int
make_code_list_to_string(Node ** rnode,OnigEncoding enc,int n,OnigCodePoint codes[])4855 make_code_list_to_string(Node** rnode, OnigEncoding enc,
4856 int n, OnigCodePoint codes[])
4857 {
4858 int r, i, len;
4859 Node* node;
4860 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4861
4862 *rnode = NULL_NODE;
4863 node = onig_node_new_str(NULL, NULL);
4864 CHECK_NULL_RETURN_MEMERR(node);
4865
4866 for (i = 0; i < n; i++) {
4867 len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf);
4868 if (len < 0) {
4869 r = len;
4870 goto err;
4871 }
4872
4873 r = onig_node_str_cat(node, buf, buf + len);
4874 if (r != 0) goto err;
4875 }
4876
4877 *rnode = node;
4878 return 0;
4879
4880 err:
4881 onig_node_free(node);
4882 return r;
4883 }
4884
4885 static int
unravel_cf_node_add(Node ** rlist,Node * add)4886 unravel_cf_node_add(Node** rlist, Node* add)
4887 {
4888 Node *list;
4889
4890 list = *rlist;
4891 if (IS_NULL(list)) {
4892 list = onig_node_new_list(add, NULL);
4893 CHECK_NULL_RETURN_MEMERR(list);
4894 *rlist = list;
4895 }
4896 else {
4897 Node* r = node_list_add(list, add);
4898 CHECK_NULL_RETURN_MEMERR(r);
4899 }
4900
4901 return 0;
4902 }
4903
4904 static int
unravel_cf_string_add(Node ** rlist,Node ** rsn,UChar * s,UChar * end,unsigned int flag)4905 unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end,
4906 unsigned int flag)
4907 {
4908 int r;
4909 Node *sn, *list;
4910
4911 list = *rlist;
4912 sn = *rsn;
4913
4914 if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) {
4915 r = onig_node_str_cat(sn, s, end);
4916 }
4917 else {
4918 sn = onig_node_new_str(s, end);
4919 CHECK_NULL_RETURN_MEMERR(sn);
4920
4921 STR_(sn)->flag = flag;
4922 r = unravel_cf_node_add(&list, sn);
4923 }
4924
4925 if (r == 0) {
4926 *rlist = list;
4927 *rsn = sn;
4928 }
4929 return r;
4930 }
4931
4932 static int
unravel_cf_string_alt_or_cc_add(Node ** rlist,int n,OnigCaseFoldCodeItem items[],OnigEncoding enc,OnigCaseFoldType case_fold_flag,UChar * s,UChar * end)4933 unravel_cf_string_alt_or_cc_add(Node** rlist, int n,
4934 OnigCaseFoldCodeItem items[], OnigEncoding enc,
4935 OnigCaseFoldType case_fold_flag, UChar* s, UChar* end)
4936 {
4937 int r, i;
4938 Node* node;
4939
4940 if (is_all_code_len_1_items(n, items)) {
4941 OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */
4942
4943 codes[0] = ONIGENC_MBC_TO_CODE(enc, s, end);
4944 for (i = 0; i < n; i++) {
4945 OnigCaseFoldCodeItem* item = items + i;
4946 codes[i+1] = item->code[0];
4947 }
4948 r = onig_new_cclass_with_code_list(&node, enc, n + 1, codes);
4949 if (r != 0) return r;
4950 }
4951 else {
4952 Node *snode, *alt, *curr;
4953
4954 snode = onig_node_new_str(s, end);
4955 CHECK_NULL_RETURN_MEMERR(snode);
4956 node = curr = onig_node_new_alt(snode, NULL_NODE);
4957 if (IS_NULL(curr)) {
4958 onig_node_free(snode);
4959 return ONIGERR_MEMORY;
4960 }
4961
4962 r = 0;
4963 for (i = 0; i < n; i++) {
4964 OnigCaseFoldCodeItem* item = items + i;
4965 r = make_code_list_to_string(&snode, enc, item->code_len, item->code);
4966 if (r != 0) {
4967 onig_node_free(node);
4968 return r;
4969 }
4970
4971 alt = onig_node_new_alt(snode, NULL_NODE);
4972 if (IS_NULL(alt)) {
4973 onig_node_free(snode);
4974 onig_node_free(node);
4975 return ONIGERR_MEMORY;
4976 }
4977
4978 NODE_CDR(curr) = alt;
4979 curr = alt;
4980 }
4981 }
4982
4983 r = unravel_cf_node_add(rlist, node);
4984 if (r != 0) onig_node_free(node);
4985 return r;
4986 }
4987
4988 static int
unravel_cf_look_behind_add(Node ** rlist,Node ** rsn,int n,OnigCaseFoldCodeItem items[],OnigEncoding enc,UChar * s,OnigLen one_len)4989 unravel_cf_look_behind_add(Node** rlist, Node** rsn,
4990 int n, OnigCaseFoldCodeItem items[], OnigEncoding enc,
4991 UChar* s, OnigLen one_len)
4992 {
4993 int r, i, found;
4994
4995 found = 0;
4996 for (i = 0; i < n; i++) {
4997 OnigCaseFoldCodeItem* item = items + i;
4998 if (item->byte_len == one_len) {
4999 if (item->code_len == 1) {
5000 found = 1;
5001 }
5002 }
5003 }
5004
5005 if (found == 0) {
5006 r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */);
5007 }
5008 else {
5009 Node* node;
5010 OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */
5011
5012 found = 0;
5013 codes[found++] = ONIGENC_MBC_TO_CODE(enc, s, s + one_len);
5014 for (i = 0; i < n; i++) {
5015 OnigCaseFoldCodeItem* item = items + i;
5016 if (item->byte_len == one_len) {
5017 if (item->code_len == 1) {
5018 codes[found++] = item->code[0];
5019 }
5020 }
5021 }
5022 r = onig_new_cclass_with_code_list(&node, enc, found, codes);
5023 if (r != 0) return r;
5024
5025 r = unravel_cf_node_add(rlist, node);
5026 if (r != 0) onig_node_free(node);
5027
5028 *rsn = NULL_NODE;
5029 }
5030
5031 return r;
5032 }
5033
5034 static int
unravel_case_fold_string(Node * node,regex_t * reg,int state)5035 unravel_case_fold_string(Node* node, regex_t* reg, int state)
5036 {
5037 int r, n, in_look_behind;
5038 OnigLen min_len, max_len, one_len;
5039 UChar *start, *end, *p, *q;
5040 StrNode* snode;
5041 Node *sn, *list;
5042 OnigEncoding enc;
5043 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
5044
5045 if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0;
5046
5047 NODE_STATUS_REMOVE(node, IGNORECASE);
5048 snode = STR_(node);
5049 start = snode->s;
5050 end = snode->end;
5051 if (start >= end) return 0;
5052
5053 in_look_behind = (state & IN_LOOK_BEHIND) != 0;
5054 enc = reg->enc;
5055
5056 list = sn = NULL_NODE;
5057 p = start;
5058 while (p < end) {
5059 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end,
5060 items);
5061 if (n < 0) {
5062 r = n;
5063 goto err;
5064 }
5065
5066 one_len = (OnigLen )enclen(enc, p);
5067 if (n == 0) {
5068 q = p + one_len;
5069 r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */);
5070 if (r != 0) goto err;
5071 }
5072 else {
5073 if (in_look_behind != 0) {
5074 q = p + one_len;
5075 if (items[0].byte_len != one_len) {
5076 r = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, q,
5077 items);
5078 if (r < 0) goto err;
5079 n = r;
5080 }
5081 r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len);
5082 if (r != 0) goto err;
5083 }
5084 else {
5085 get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len);
5086 if (min_len != max_len) {
5087 r = ONIGERR_PARSER_BUG;
5088 goto err;
5089 }
5090
5091 q = p + max_len;
5092 r = unravel_cf_string_alt_or_cc_add(&list, n, items, enc,
5093 reg->case_fold_flag, p, q);
5094 if (r != 0) goto err;
5095 sn = NULL_NODE;
5096 }
5097 }
5098
5099 p = q;
5100 }
5101
5102 if (IS_NOT_NULL(list)) {
5103 if (node_list_len(list) == 1) {
5104 node_swap(node, NODE_CAR(list));
5105 }
5106 else {
5107 node_swap(node, list);
5108 }
5109 onig_node_free(list);
5110 }
5111 else {
5112 node_swap(node, sn);
5113 onig_node_free(sn);
5114 }
5115 return 0;
5116
5117 err:
5118 if (IS_NOT_NULL(list))
5119 onig_node_free(list);
5120 else if (IS_NOT_NULL(sn))
5121 onig_node_free(sn);
5122
5123 return r;
5124 }
5125
5126 #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
5127 static enum BodyEmptyType
quantifiers_memory_node_info(Node * node)5128 quantifiers_memory_node_info(Node* node)
5129 {
5130 int r = BODY_MAY_BE_EMPTY;
5131
5132 switch (NODE_TYPE(node)) {
5133 case NODE_LIST:
5134 case NODE_ALT:
5135 {
5136 int v;
5137 do {
5138 v = quantifiers_memory_node_info(NODE_CAR(node));
5139 if (v > r) r = v;
5140 } while (IS_NOT_NULL(node = NODE_CDR(node)));
5141 }
5142 break;
5143
5144 #ifdef USE_CALL
5145 case NODE_CALL:
5146 if (NODE_IS_RECURSION(node)) {
5147 return BODY_MAY_BE_EMPTY_REC; /* tiny version */
5148 }
5149 else
5150 r = quantifiers_memory_node_info(NODE_BODY(node));
5151 break;
5152 #endif
5153
5154 case NODE_QUANT:
5155 {
5156 QuantNode* qn = QUANT_(node);
5157 if (qn->upper != 0) {
5158 r = quantifiers_memory_node_info(NODE_BODY(node));
5159 }
5160 }
5161 break;
5162
5163 case NODE_BAG:
5164 {
5165 BagNode* en = BAG_(node);
5166 switch (en->type) {
5167 case BAG_MEMORY:
5168 if (NODE_IS_RECURSION(node)) {
5169 return BODY_MAY_BE_EMPTY_REC;
5170 }
5171 return BODY_MAY_BE_EMPTY_MEM;
5172 break;
5173
5174 case BAG_OPTION:
5175 case BAG_STOP_BACKTRACK:
5176 r = quantifiers_memory_node_info(NODE_BODY(node));
5177 break;
5178 case BAG_IF_ELSE:
5179 {
5180 int v;
5181 r = quantifiers_memory_node_info(NODE_BODY(node));
5182 if (IS_NOT_NULL(en->te.Then)) {
5183 v = quantifiers_memory_node_info(en->te.Then);
5184 if (v > r) r = v;
5185 }
5186 if (IS_NOT_NULL(en->te.Else)) {
5187 v = quantifiers_memory_node_info(en->te.Else);
5188 if (v > r) r = v;
5189 }
5190 }
5191 break;
5192 }
5193 }
5194 break;
5195
5196 case NODE_BACKREF:
5197 case NODE_STRING:
5198 case NODE_CTYPE:
5199 case NODE_CCLASS:
5200 case NODE_ANCHOR:
5201 case NODE_GIMMICK:
5202 default:
5203 break;
5204 }
5205
5206 return r;
5207 }
5208 #endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */
5209
5210
5211 #ifdef USE_CALL
5212
5213 #ifdef __GNUC__
5214 __inline
5215 #endif
5216 static int
tune_call_node_call(CallNode * cn,ScanEnv * env,int state)5217 tune_call_node_call(CallNode* cn, ScanEnv* env, int state)
5218 {
5219 MemEnv* mem_env = SCANENV_MEMENV(env);
5220
5221 if (cn->by_number != 0) {
5222 int gnum = cn->group_num;
5223
5224 if (env->num_named > 0 &&
5225 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5226 ! OPTON_CAPTURE_GROUP(env->options)) {
5227 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
5228 }
5229
5230 if (gnum > env->num_mem) {
5231 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_GROUP_REFERENCE,
5232 cn->name, cn->name_end);
5233 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
5234 }
5235
5236 set_call_attr:
5237 NODE_CALL_BODY(cn) = mem_env[cn->group_num].mem_node;
5238 if (IS_NULL(NODE_CALL_BODY(cn))) {
5239 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
5240 cn->name, cn->name_end);
5241 return ONIGERR_UNDEFINED_NAME_REFERENCE;
5242 }
5243 }
5244 else {
5245 int *refs;
5246
5247 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs);
5248 if (n <= 0) {
5249 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
5250 cn->name, cn->name_end);
5251 return ONIGERR_UNDEFINED_NAME_REFERENCE;
5252 }
5253 else if (n > 1) {
5254 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL,
5255 cn->name, cn->name_end);
5256 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
5257 }
5258 else {
5259 cn->group_num = refs[0];
5260 goto set_call_attr;
5261 }
5262 }
5263
5264 return 0;
5265 }
5266
5267 static void
tune_call2_call(Node * node)5268 tune_call2_call(Node* node)
5269 {
5270 switch (NODE_TYPE(node)) {
5271 case NODE_LIST:
5272 case NODE_ALT:
5273 do {
5274 tune_call2_call(NODE_CAR(node));
5275 } while (IS_NOT_NULL(node = NODE_CDR(node)));
5276 break;
5277
5278 case NODE_QUANT:
5279 tune_call2_call(NODE_BODY(node));
5280 break;
5281
5282 case NODE_ANCHOR:
5283 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
5284 tune_call2_call(NODE_BODY(node));
5285 break;
5286
5287 case NODE_BAG:
5288 {
5289 BagNode* en = BAG_(node);
5290
5291 if (en->type == BAG_MEMORY) {
5292 if (! NODE_IS_MARK1(node)) {
5293 NODE_STATUS_ADD(node, MARK1);
5294 tune_call2_call(NODE_BODY(node));
5295 NODE_STATUS_REMOVE(node, MARK1);
5296 }
5297 }
5298 else if (en->type == BAG_IF_ELSE) {
5299 tune_call2_call(NODE_BODY(node));
5300 if (IS_NOT_NULL(en->te.Then))
5301 tune_call2_call(en->te.Then);
5302 if (IS_NOT_NULL(en->te.Else))
5303 tune_call2_call(en->te.Else);
5304 }
5305 else {
5306 tune_call2_call(NODE_BODY(node));
5307 }
5308 }
5309 break;
5310
5311 case NODE_CALL:
5312 if (! NODE_IS_MARK1(node)) {
5313 NODE_STATUS_ADD(node, MARK1);
5314 {
5315 CallNode* cn = CALL_(node);
5316 Node* called = NODE_CALL_BODY(cn);
5317
5318 cn->entry_count++;
5319
5320 NODE_STATUS_ADD(called, CALLED);
5321 BAG_(called)->m.entry_count++;
5322 tune_call2_call(called);
5323 }
5324 NODE_STATUS_REMOVE(node, MARK1);
5325 }
5326 break;
5327
5328 default:
5329 break;
5330 }
5331 }
5332
5333 static int
tune_call(Node * node,ScanEnv * env,int state)5334 tune_call(Node* node, ScanEnv* env, int state)
5335 {
5336 int r;
5337
5338 switch (NODE_TYPE(node)) {
5339 case NODE_LIST:
5340 case NODE_ALT:
5341 do {
5342 r = tune_call(NODE_CAR(node), env, state);
5343 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5344 break;
5345
5346 case NODE_QUANT:
5347 if (QUANT_(node)->upper == 0)
5348 state |= IN_ZERO_REPEAT;
5349
5350 r = tune_call(NODE_BODY(node), env, state);
5351 break;
5352
5353 case NODE_ANCHOR:
5354 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
5355 r = tune_call(NODE_BODY(node), env, state);
5356 else
5357 r = 0;
5358 break;
5359
5360 case NODE_BAG:
5361 {
5362 BagNode* en = BAG_(node);
5363
5364 if (en->type == BAG_MEMORY) {
5365 if ((state & IN_ZERO_REPEAT) != 0) {
5366 NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
5367 BAG_(node)->m.entry_count--;
5368 }
5369 r = tune_call(NODE_BODY(node), env, state);
5370 }
5371 else if (en->type == BAG_IF_ELSE) {
5372 r = tune_call(NODE_BODY(node), env, state);
5373 if (r != 0) return r;
5374 if (IS_NOT_NULL(en->te.Then)) {
5375 r = tune_call(en->te.Then, env, state);
5376 if (r != 0) return r;
5377 }
5378 if (IS_NOT_NULL(en->te.Else))
5379 r = tune_call(en->te.Else, env, state);
5380 }
5381 else
5382 r = tune_call(NODE_BODY(node), env, state);
5383 }
5384 break;
5385
5386 case NODE_CALL:
5387 if ((state & IN_ZERO_REPEAT) != 0) {
5388 NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
5389 CALL_(node)->entry_count--;
5390 }
5391
5392 r = tune_call_node_call(CALL_(node), env, state);
5393 break;
5394
5395 default:
5396 r = 0;
5397 break;
5398 }
5399
5400 return r;
5401 }
5402
5403 static int
tune_call2(Node * node)5404 tune_call2(Node* node)
5405 {
5406 int r = 0;
5407
5408 switch (NODE_TYPE(node)) {
5409 case NODE_LIST:
5410 case NODE_ALT:
5411 do {
5412 r = tune_call2(NODE_CAR(node));
5413 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5414 break;
5415
5416 case NODE_QUANT:
5417 if (QUANT_(node)->upper != 0)
5418 r = tune_call2(NODE_BODY(node));
5419 break;
5420
5421 case NODE_ANCHOR:
5422 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
5423 r = tune_call2(NODE_BODY(node));
5424 break;
5425
5426 case NODE_BAG:
5427 if (! NODE_IS_IN_ZERO_REPEAT(node))
5428 r = tune_call2(NODE_BODY(node));
5429
5430 {
5431 BagNode* en = BAG_(node);
5432
5433 if (r != 0) return r;
5434 if (en->type == BAG_IF_ELSE) {
5435 if (IS_NOT_NULL(en->te.Then)) {
5436 r = tune_call2(en->te.Then);
5437 if (r != 0) return r;
5438 }
5439 if (IS_NOT_NULL(en->te.Else))
5440 r = tune_call2(en->te.Else);
5441 }
5442 }
5443 break;
5444
5445 case NODE_CALL:
5446 if (! NODE_IS_IN_ZERO_REPEAT(node)) {
5447 tune_call2_call(node);
5448 }
5449 break;
5450
5451 default:
5452 break;
5453 }
5454
5455 return r;
5456 }
5457
5458
5459 static void
tune_called_state_call(Node * node,int state)5460 tune_called_state_call(Node* node, int state)
5461 {
5462 switch (NODE_TYPE(node)) {
5463 case NODE_ALT:
5464 state |= IN_ALT;
5465 /* fall */
5466 case NODE_LIST:
5467 do {
5468 tune_called_state_call(NODE_CAR(node), state);
5469 } while (IS_NOT_NULL(node = NODE_CDR(node)));
5470 break;
5471
5472 case NODE_QUANT:
5473 {
5474 QuantNode* qn = QUANT_(node);
5475
5476 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5477 state |= IN_REAL_REPEAT;
5478 if (qn->lower != qn->upper)
5479 state |= IN_VAR_REPEAT;
5480
5481 tune_called_state_call(NODE_QUANT_BODY(qn), state);
5482 }
5483 break;
5484
5485 case NODE_ANCHOR:
5486 {
5487 AnchorNode* an = ANCHOR_(node);
5488
5489 switch (an->type) {
5490 case ANCR_PREC_READ_NOT:
5491 case ANCR_LOOK_BEHIND_NOT:
5492 state |= IN_NOT;
5493 /* fall */
5494 case ANCR_PREC_READ:
5495 case ANCR_LOOK_BEHIND:
5496 tune_called_state_call(NODE_ANCHOR_BODY(an), state);
5497 break;
5498 default:
5499 break;
5500 }
5501 }
5502 break;
5503
5504 case NODE_BAG:
5505 {
5506 BagNode* en = BAG_(node);
5507
5508 if (en->type == BAG_MEMORY) {
5509 if (NODE_IS_MARK1(node)) {
5510 if ((~en->m.called_state & state) != 0) {
5511 en->m.called_state |= state;
5512 tune_called_state_call(NODE_BODY(node), state);
5513 }
5514 }
5515 else {
5516 NODE_STATUS_ADD(node, MARK1);
5517 en->m.called_state |= state;
5518 tune_called_state_call(NODE_BODY(node), state);
5519 NODE_STATUS_REMOVE(node, MARK1);
5520 }
5521 }
5522 else if (en->type == BAG_IF_ELSE) {
5523 state |= IN_ALT;
5524 tune_called_state_call(NODE_BODY(node), state);
5525 if (IS_NOT_NULL(en->te.Then)) {
5526 tune_called_state_call(en->te.Then, state);
5527 }
5528 if (IS_NOT_NULL(en->te.Else))
5529 tune_called_state_call(en->te.Else, state);
5530 }
5531 else {
5532 tune_called_state_call(NODE_BODY(node), state);
5533 }
5534 }
5535 break;
5536
5537 case NODE_CALL:
5538 tune_called_state_call(NODE_BODY(node), state);
5539 break;
5540
5541 default:
5542 break;
5543 }
5544 }
5545
5546 static void
tune_called_state(Node * node,int state)5547 tune_called_state(Node* node, int state)
5548 {
5549 switch (NODE_TYPE(node)) {
5550 case NODE_ALT:
5551 state |= IN_ALT;
5552 /* fall */
5553 case NODE_LIST:
5554 do {
5555 tune_called_state(NODE_CAR(node), state);
5556 } while (IS_NOT_NULL(node = NODE_CDR(node)));
5557 break;
5558
5559 #ifdef USE_CALL
5560 case NODE_CALL:
5561 tune_called_state_call(node, state);
5562 break;
5563 #endif
5564
5565 case NODE_BAG:
5566 {
5567 BagNode* en = BAG_(node);
5568
5569 switch (en->type) {
5570 case BAG_MEMORY:
5571 if (en->m.entry_count > 1)
5572 state |= IN_MULTI_ENTRY;
5573
5574 en->m.called_state |= state;
5575 /* fall */
5576 case BAG_OPTION:
5577 case BAG_STOP_BACKTRACK:
5578 tune_called_state(NODE_BODY(node), state);
5579 break;
5580 case BAG_IF_ELSE:
5581 state |= IN_ALT;
5582 tune_called_state(NODE_BODY(node), state);
5583 if (IS_NOT_NULL(en->te.Then))
5584 tune_called_state(en->te.Then, state);
5585 if (IS_NOT_NULL(en->te.Else))
5586 tune_called_state(en->te.Else, state);
5587 break;
5588 }
5589 }
5590 break;
5591
5592 case NODE_QUANT:
5593 {
5594 QuantNode* qn = QUANT_(node);
5595
5596 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5597 state |= IN_REAL_REPEAT;
5598 if (qn->lower != qn->upper)
5599 state |= IN_VAR_REPEAT;
5600
5601 tune_called_state(NODE_QUANT_BODY(qn), state);
5602 }
5603 break;
5604
5605 case NODE_ANCHOR:
5606 {
5607 AnchorNode* an = ANCHOR_(node);
5608
5609 switch (an->type) {
5610 case ANCR_PREC_READ_NOT:
5611 case ANCR_LOOK_BEHIND_NOT:
5612 state |= IN_NOT;
5613 /* fall */
5614 case ANCR_PREC_READ:
5615 case ANCR_LOOK_BEHIND:
5616 tune_called_state(NODE_ANCHOR_BODY(an), state);
5617 break;
5618 default:
5619 break;
5620 }
5621 }
5622 break;
5623
5624 case NODE_BACKREF:
5625 case NODE_STRING:
5626 case NODE_CTYPE:
5627 case NODE_CCLASS:
5628 case NODE_GIMMICK:
5629 default:
5630 break;
5631 }
5632 }
5633
5634 #endif /* USE_CALL */
5635
5636
5637 #ifdef __GNUC__
5638 __inline
5639 #endif
5640 static int
tune_anchor(Node * node,regex_t * reg,int state,ScanEnv * env)5641 tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env)
5642 {
5643 int r;
5644 AnchorNode* an = ANCHOR_(node);
5645
5646 switch (an->type) {
5647 case ANCR_PREC_READ:
5648 r = tune_tree(NODE_ANCHOR_BODY(an), reg, state, env);
5649 break;
5650 case ANCR_PREC_READ_NOT:
5651 r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env);
5652 break;
5653
5654 case ANCR_LOOK_BEHIND:
5655 case ANCR_LOOK_BEHIND_NOT:
5656 r = tune_look_behind(node, reg, state, env);
5657 break;
5658
5659 default:
5660 r = 0;
5661 break;
5662 }
5663
5664 return r;
5665 }
5666
5667 #ifdef __GNUC__
5668 __inline
5669 #endif
5670 static int
tune_quant(Node * node,regex_t * reg,int state,ScanEnv * env)5671 tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env)
5672 {
5673 int r;
5674 QuantNode* qn = QUANT_(node);
5675 Node* body = NODE_BODY(node);
5676
5677 if ((state & IN_REAL_REPEAT) != 0) {
5678 NODE_STATUS_ADD(node, IN_REAL_REPEAT);
5679 }
5680 if ((state & IN_MULTI_ENTRY) != 0) {
5681 NODE_STATUS_ADD(node, IN_MULTI_ENTRY);
5682 }
5683
5684 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) {
5685 OnigLen d = node_min_byte_len(body, env);
5686 if (d == 0) {
5687 #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
5688 qn->emptiness = quantifiers_memory_node_info(body);
5689 #else
5690 qn->emptiness = BODY_MAY_BE_EMPTY;
5691 #endif
5692 }
5693 }
5694
5695 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
5696 state |= IN_REAL_REPEAT;
5697 if (qn->lower != qn->upper)
5698 state |= IN_VAR_REPEAT;
5699
5700 r = tune_tree(body, reg, state, env);
5701 if (r != 0) return r;
5702
5703 /* expand string */
5704 #define EXPAND_STRING_MAX_LENGTH 100
5705 if (NODE_TYPE(body) == NODE_STRING) {
5706 if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper &&
5707 qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
5708 int len = NODE_STRING_LEN(body);
5709
5710 if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
5711 int i, n = qn->lower;
5712 node_conv_to_str_node(node, body);
5713 for (i = 0; i < n; i++) {
5714 r = node_str_node_cat(node, body);
5715 if (r != 0) return r;
5716 }
5717 onig_node_free(body);
5718 return r;
5719 }
5720 }
5721 }
5722
5723 if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) {
5724 if (NODE_TYPE(body) == NODE_QUANT) {
5725 QuantNode* tqn = QUANT_(body);
5726 if (IS_NOT_NULL(tqn->head_exact)) {
5727 qn->head_exact = tqn->head_exact;
5728 tqn->head_exact = NULL;
5729 }
5730 }
5731 else {
5732 qn->head_exact = get_tree_head_literal(NODE_BODY(node), 1, reg);
5733 }
5734 }
5735
5736 return r;
5737 }
5738
5739 /* tune_tree does the following work.
5740 1. check empty loop. (set qn->emptiness)
5741 2. expand ignore-case in char class.
5742 3. set memory status bit flags. (reg->mem_stats)
5743 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
5744 5. find invalid patterns in look-behind.
5745 6. expand repeated string.
5746 */
5747 static int
tune_tree(Node * node,regex_t * reg,int state,ScanEnv * env)5748 tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
5749 {
5750 int r = 0;
5751
5752 switch (NODE_TYPE(node)) {
5753 case NODE_LIST:
5754 {
5755 Node* prev = NULL_NODE;
5756 do {
5757 r = tune_tree(NODE_CAR(node), reg, state, env);
5758 if (IS_NOT_NULL(prev) && r == 0) {
5759 r = tune_next(prev, NODE_CAR(node), reg);
5760 }
5761 prev = NODE_CAR(node);
5762 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5763 }
5764 break;
5765
5766 case NODE_ALT:
5767 do {
5768 r = tune_tree(NODE_CAR(node), reg, (state | IN_ALT), env);
5769 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
5770 break;
5771
5772 case NODE_STRING:
5773 if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) {
5774 r = unravel_case_fold_string(node, reg, state);
5775 }
5776 break;
5777
5778 case NODE_BACKREF:
5779 {
5780 int i;
5781 int* p;
5782 BackRefNode* br = BACKREF_(node);
5783 p = BACKREFS_P(br);
5784 for (i = 0; i < br->back_num; i++) {
5785 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
5786 MEM_STATUS_ON(env->backrefed_mem, p[i]);
5787 #if 0
5788 #ifdef USE_BACKREF_WITH_LEVEL
5789 if (NODE_IS_NEST_LEVEL(node)) {
5790 MEM_STATUS_ON(env->backtrack_mem, p[i]);
5791 }
5792 #endif
5793 #else
5794 /* More precisely, it should be checked whether alt/repeat exists before
5795 the subject capture node, and then this backreference position
5796 exists before (or in) the capture node. */
5797 MEM_STATUS_ON(env->backtrack_mem, p[i]);
5798 #endif
5799 }
5800 }
5801 break;
5802
5803 case NODE_BAG:
5804 {
5805 BagNode* en = BAG_(node);
5806
5807 switch (en->type) {
5808 case BAG_OPTION:
5809 {
5810 OnigOptionType options = reg->options;
5811 reg->options = BAG_(node)->o.options;
5812 r = tune_tree(NODE_BODY(node), reg, state, env);
5813 reg->options = options;
5814 }
5815 break;
5816
5817 case BAG_MEMORY:
5818 #ifdef USE_CALL
5819 state |= en->m.called_state;
5820 #endif
5821
5822 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_MULTI_ENTRY)) != 0
5823 || NODE_IS_RECURSION(node)) {
5824 MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
5825 }
5826 r = tune_tree(NODE_BODY(node), reg, state, env);
5827 break;
5828
5829 case BAG_STOP_BACKTRACK:
5830 {
5831 Node* target = NODE_BODY(node);
5832 r = tune_tree(target, reg, state, env);
5833 if (NODE_TYPE(target) == NODE_QUANT) {
5834 QuantNode* tqn = QUANT_(target);
5835 if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 &&
5836 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
5837 if (is_strict_real_node(NODE_BODY(target)))
5838 NODE_STATUS_ADD(node, STRICT_REAL_REPEAT);
5839 }
5840 }
5841 }
5842 break;
5843
5844 case BAG_IF_ELSE:
5845 r = tune_tree(NODE_BODY(node), reg, (state | IN_ALT), env);
5846 if (r != 0) return r;
5847 if (IS_NOT_NULL(en->te.Then)) {
5848 r = tune_tree(en->te.Then, reg, (state | IN_ALT), env);
5849 if (r != 0) return r;
5850 }
5851 if (IS_NOT_NULL(en->te.Else))
5852 r = tune_tree(en->te.Else, reg, (state | IN_ALT), env);
5853 break;
5854 }
5855 }
5856 break;
5857
5858 case NODE_QUANT:
5859 r = tune_quant(node, reg, state, env);
5860 break;
5861
5862 case NODE_ANCHOR:
5863 r = tune_anchor(node, reg, state, env);
5864 break;
5865
5866 #ifdef USE_CALL
5867 case NODE_CALL:
5868 #endif
5869 case NODE_CTYPE:
5870 case NODE_CCLASS:
5871 case NODE_GIMMICK:
5872 default:
5873 break;
5874 }
5875
5876 return r;
5877 }
5878
5879 static int
set_sunday_quick_search_or_bmh_skip_table(regex_t * reg,int case_expand,UChar * s,UChar * end,UChar skip[],int * roffset)5880 set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand,
5881 UChar* s, UChar* end,
5882 UChar skip[], int* roffset)
5883 {
5884 int i, j, k, len, offset;
5885 int n, clen;
5886 UChar* p;
5887 OnigEncoding enc;
5888 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
5889 UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
5890
5891 enc = reg->enc;
5892 offset = ENC_GET_SKIP_OFFSET(enc);
5893 if (offset == ENC_SKIP_OFFSET_1_OR_0) {
5894 UChar* p = s;
5895 while (1) {
5896 len = enclen(enc, p);
5897 if (p + len >= end) {
5898 if (len == 1) offset = 1;
5899 else offset = 0;
5900 break;
5901 }
5902 p += len;
5903 }
5904 }
5905
5906 len = (int )(end - s);
5907 if (len + offset >= UCHAR_MAX)
5908 return ONIGERR_PARSER_BUG;
5909
5910 *roffset = offset;
5911
5912 for (i = 0; i < CHAR_MAP_SIZE; i++) {
5913 skip[i] = (UChar )(len + offset);
5914 }
5915
5916 for (p = s; p < end; ) {
5917 int z;
5918
5919 clen = enclen(enc, p);
5920 if (p + clen > end) clen = (int )(end - p);
5921
5922 len = (int )(end - p);
5923 for (j = 0; j < clen; j++) {
5924 z = len - j + (offset - 1);
5925 if (z <= 0) break;
5926 skip[p[j]] = z;
5927 }
5928
5929 if (case_expand != 0) {
5930 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
5931 p, end, items);
5932 for (k = 0; k < n; k++) {
5933 ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf);
5934 for (j = 0; j < clen; j++) {
5935 z = len - j + (offset - 1);
5936 if (z <= 0) break;
5937 if (skip[buf[j]] > z)
5938 skip[buf[j]] = z;
5939 }
5940 }
5941 }
5942
5943 p += clen;
5944 }
5945
5946 return 0;
5947 }
5948
5949
5950 #define OPT_EXACT_MAXLEN 24
5951
5952 #if OPT_EXACT_MAXLEN >= UCHAR_MAX
5953 #error Too big OPT_EXACT_MAXLEN
5954 #endif
5955
5956 typedef struct {
5957 MinMaxLen mm;
5958 OnigEncoding enc;
5959 OnigCaseFoldType case_fold_flag;
5960 ScanEnv* scan_env;
5961 } OptEnv;
5962
5963 typedef struct {
5964 int left;
5965 int right;
5966 } OptAnc;
5967
5968 typedef struct {
5969 MinMaxLen mm; /* position */
5970 OptAnc anc;
5971 int reach_end;
5972 int len;
5973 UChar s[OPT_EXACT_MAXLEN];
5974 } OptStr;
5975
5976 typedef struct {
5977 MinMaxLen mm; /* position */
5978 OptAnc anc;
5979 int value; /* weighted value */
5980 UChar map[CHAR_MAP_SIZE];
5981 } OptMap;
5982
5983 typedef struct {
5984 MinMaxLen len;
5985 OptAnc anc;
5986 OptStr sb; /* boundary */
5987 OptStr sm; /* middle */
5988 OptStr spr; /* prec read (?=...) */
5989 OptMap map; /* boundary */
5990 } OptNode;
5991
5992
5993 static int
map_position_value(OnigEncoding enc,int i)5994 map_position_value(OnigEncoding enc, int i)
5995 {
5996 static const short int Vals[] = {
5997 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
5998 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5999 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
6000 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
6001 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6002 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
6003 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6004 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
6005 };
6006
6007 if (i < (int )(sizeof(Vals)/sizeof(Vals[0]))) {
6008 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
6009 return 20;
6010 else
6011 return (int )Vals[i];
6012 }
6013 else
6014 return 4; /* Take it easy. */
6015 }
6016
6017 static int
distance_value(MinMaxLen * mm)6018 distance_value(MinMaxLen* mm)
6019 {
6020 /* 1000 / (min-max-dist + 1) */
6021 static const short int dist_vals[] = {
6022 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
6023 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
6024 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
6025 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
6026 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
6027 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
6028 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
6029 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
6030 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
6031 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
6032 };
6033
6034 OnigLen d;
6035
6036 if (mm->max == INFINITE_LEN) return 0;
6037
6038 d = mm->max - mm->min;
6039 if (d < (OnigLen )(sizeof(dist_vals)/sizeof(dist_vals[0])))
6040 /* return dist_vals[d] * 16 / (mm->min + 12); */
6041 return (int )dist_vals[d];
6042 else
6043 return 1;
6044 }
6045
6046 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)6047 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
6048 {
6049 if (v2 <= 0) return -1;
6050 if (v1 <= 0) return 1;
6051
6052 v1 *= distance_value(d1);
6053 v2 *= distance_value(d2);
6054
6055 if (v2 > v1) return 1;
6056 if (v2 < v1) return -1;
6057
6058 if (d2->min < d1->min) return 1;
6059 if (d2->min > d1->min) return -1;
6060 return 0;
6061 }
6062
6063 static void
copy_opt_env(OptEnv * to,OptEnv * from)6064 copy_opt_env(OptEnv* to, OptEnv* from)
6065 {
6066 *to = *from;
6067 }
6068
6069 static void
clear_opt_anc_info(OptAnc * a)6070 clear_opt_anc_info(OptAnc* a)
6071 {
6072 a->left = 0;
6073 a->right = 0;
6074 }
6075
6076 static void
copy_opt_anc_info(OptAnc * to,OptAnc * from)6077 copy_opt_anc_info(OptAnc* to, OptAnc* from)
6078 {
6079 *to = *from;
6080 }
6081
6082 static void
concat_opt_anc_info(OptAnc * to,OptAnc * left,OptAnc * right,OnigLen left_len,OnigLen right_len)6083 concat_opt_anc_info(OptAnc* to, OptAnc* left, OptAnc* right,
6084 OnigLen left_len, OnigLen right_len)
6085 {
6086 clear_opt_anc_info(to);
6087
6088 to->left = left->left;
6089 if (left_len == 0) {
6090 to->left |= right->left;
6091 }
6092
6093 to->right = right->right;
6094 if (right_len == 0) {
6095 to->right |= left->right;
6096 }
6097 else {
6098 to->right |= (left->right & ANCR_PREC_READ_NOT);
6099 }
6100 }
6101
6102 static int
is_left(int a)6103 is_left(int a)
6104 {
6105 if (a == ANCR_END_BUF || a == ANCR_SEMI_END_BUF ||
6106 a == ANCR_END_LINE || a == ANCR_PREC_READ || a == ANCR_PREC_READ_NOT)
6107 return 0;
6108
6109 return 1;
6110 }
6111
6112 static int
is_set_opt_anc_info(OptAnc * to,int anc)6113 is_set_opt_anc_info(OptAnc* to, int anc)
6114 {
6115 if ((to->left & anc) != 0) return 1;
6116
6117 return ((to->right & anc) != 0 ? 1 : 0);
6118 }
6119
6120 static void
add_opt_anc_info(OptAnc * to,int anc)6121 add_opt_anc_info(OptAnc* to, int anc)
6122 {
6123 if (is_left(anc))
6124 to->left |= anc;
6125 else
6126 to->right |= anc;
6127 }
6128
6129 static void
remove_opt_anc_info(OptAnc * to,int anc)6130 remove_opt_anc_info(OptAnc* to, int anc)
6131 {
6132 if (is_left(anc))
6133 to->left &= ~anc;
6134 else
6135 to->right &= ~anc;
6136 }
6137
6138 static void
alt_merge_opt_anc_info(OptAnc * to,OptAnc * add)6139 alt_merge_opt_anc_info(OptAnc* to, OptAnc* add)
6140 {
6141 to->left &= add->left;
6142 to->right &= add->right;
6143 }
6144
6145 static int
is_full_opt_exact(OptStr * e)6146 is_full_opt_exact(OptStr* e)
6147 {
6148 return e->len >= OPT_EXACT_MAXLEN;
6149 }
6150
6151 static void
clear_opt_exact(OptStr * e)6152 clear_opt_exact(OptStr* e)
6153 {
6154 mml_clear(&e->mm);
6155 clear_opt_anc_info(&e->anc);
6156 e->reach_end = 0;
6157 e->len = 0;
6158 e->s[0] = '\0';
6159 }
6160
6161 static void
copy_opt_exact(OptStr * to,OptStr * from)6162 copy_opt_exact(OptStr* to, OptStr* from)
6163 {
6164 *to = *from;
6165 }
6166
6167 static int
concat_opt_exact(OptStr * to,OptStr * add,OnigEncoding enc)6168 concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc)
6169 {
6170 int i, j, len, r;
6171 UChar *p, *end;
6172 OptAnc tanc;
6173
6174 r = 0;
6175 p = add->s;
6176 end = p + add->len;
6177 for (i = to->len; p < end; ) {
6178 len = enclen(enc, p);
6179 if (i + len > OPT_EXACT_MAXLEN) {
6180 r = 1; /* 1:full */
6181 break;
6182 }
6183 for (j = 0; j < len && p < end; j++)
6184 to->s[i++] = *p++;
6185 }
6186
6187 to->len = i;
6188 to->reach_end = (p == end ? add->reach_end : 0);
6189
6190 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
6191 if (! to->reach_end) tanc.right = 0;
6192 copy_opt_anc_info(&to->anc, &tanc);
6193
6194 return r;
6195 }
6196
6197 static void
concat_opt_exact_str(OptStr * to,UChar * s,UChar * end,OnigEncoding enc)6198 concat_opt_exact_str(OptStr* to, UChar* s, UChar* end, OnigEncoding enc)
6199 {
6200 int i, j, len;
6201 UChar *p;
6202
6203 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
6204 len = enclen(enc, p);
6205 if (i + len > OPT_EXACT_MAXLEN) break;
6206 for (j = 0; j < len && p < end; j++)
6207 to->s[i++] = *p++;
6208 }
6209
6210 to->len = i;
6211
6212 if (p >= end)
6213 to->reach_end = 1;
6214 }
6215
6216 static void
alt_merge_opt_exact(OptStr * to,OptStr * add,OptEnv * env)6217 alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env)
6218 {
6219 int i, j, len;
6220
6221 if (add->len == 0 || to->len == 0) {
6222 clear_opt_exact(to);
6223 return ;
6224 }
6225
6226 if (! mml_is_equal(&to->mm, &add->mm)) {
6227 clear_opt_exact(to);
6228 return ;
6229 }
6230
6231 for (i = 0; i < to->len && i < add->len; ) {
6232 if (to->s[i] != add->s[i]) break;
6233 len = enclen(env->enc, to->s + i);
6234
6235 for (j = 1; j < len; j++) {
6236 if (to->s[i+j] != add->s[i+j]) break;
6237 }
6238 if (j < len) break;
6239 i += len;
6240 }
6241
6242 if (! add->reach_end || i < add->len || i < to->len) {
6243 to->reach_end = 0;
6244 }
6245 to->len = i;
6246
6247 alt_merge_opt_anc_info(&to->anc, &add->anc);
6248 if (! to->reach_end) to->anc.right = 0;
6249 }
6250
6251 static void
select_opt_exact(OnigEncoding enc,OptStr * now,OptStr * alt)6252 select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt)
6253 {
6254 int vn, va;
6255
6256 vn = now->len;
6257 va = alt->len;
6258
6259 if (va == 0) {
6260 return ;
6261 }
6262 else if (vn == 0) {
6263 copy_opt_exact(now, alt);
6264 return ;
6265 }
6266 else if (vn <= 2 && va <= 2) {
6267 /* ByteValTable[x] is big value --> low price */
6268 va = map_position_value(enc, now->s[0]);
6269 vn = map_position_value(enc, alt->s[0]);
6270
6271 if (now->len > 1) vn += 5;
6272 if (alt->len > 1) va += 5;
6273 }
6274
6275 vn *= 2;
6276 va *= 2;
6277
6278 if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0)
6279 copy_opt_exact(now, alt);
6280 }
6281
6282 static void
clear_opt_map(OptMap * map)6283 clear_opt_map(OptMap* map)
6284 {
6285 static const OptMap clean_info = {
6286 {0, 0}, {0, 0}, 0,
6287 {
6288 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6289 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6290 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6291 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6292 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6293 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6294 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6295 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6296 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6297 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6298 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6299 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6300 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6301 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6302 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6303 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6304 }
6305 };
6306
6307 xmemcpy(map, &clean_info, sizeof(OptMap));
6308 }
6309
6310 static void
copy_opt_map(OptMap * to,OptMap * from)6311 copy_opt_map(OptMap* to, OptMap* from)
6312 {
6313 *to = *from;
6314 }
6315
6316 static void
add_char_opt_map(OptMap * m,UChar c,OnigEncoding enc)6317 add_char_opt_map(OptMap* m, UChar c, OnigEncoding enc)
6318 {
6319 if (m->map[c] == 0) {
6320 m->map[c] = 1;
6321 m->value += map_position_value(enc, c);
6322 }
6323 }
6324
6325 static void
select_opt_map(OptMap * now,OptMap * alt)6326 select_opt_map(OptMap* now, OptMap* alt)
6327 {
6328 static int z = 1<<15; /* 32768: something big value */
6329
6330 int vn, va;
6331
6332 if (alt->value == 0) return ;
6333 if (now->value == 0) {
6334 copy_opt_map(now, alt);
6335 return ;
6336 }
6337
6338 vn = z / now->value;
6339 va = z / alt->value;
6340 if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0)
6341 copy_opt_map(now, alt);
6342 }
6343
6344 static int
comp_opt_exact_or_map(OptStr * e,OptMap * m)6345 comp_opt_exact_or_map(OptStr* e, OptMap* m)
6346 {
6347 #define COMP_EM_BASE 20
6348 int ae, am;
6349 int case_value;
6350
6351 if (m->value <= 0) return -1;
6352
6353 case_value = 3;
6354 ae = COMP_EM_BASE * e->len * case_value;
6355 am = COMP_EM_BASE * 5 * 2 / m->value;
6356 return comp_distance_value(&e->mm, &m->mm, ae, am);
6357 }
6358
6359 static void
alt_merge_opt_map(OnigEncoding enc,OptMap * to,OptMap * add)6360 alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add)
6361 {
6362 int i, val;
6363
6364 /* if (! mml_is_equal(&to->mm, &add->mm)) return ; */
6365 if (to->value == 0) return ;
6366 if (add->value == 0 || to->mm.max < add->mm.min) {
6367 clear_opt_map(to);
6368 return ;
6369 }
6370
6371 mml_alt_merge(&to->mm, &add->mm);
6372
6373 val = 0;
6374 for (i = 0; i < CHAR_MAP_SIZE; i++) {
6375 if (add->map[i])
6376 to->map[i] = 1;
6377
6378 if (to->map[i])
6379 val += map_position_value(enc, i);
6380 }
6381 to->value = val;
6382
6383 alt_merge_opt_anc_info(&to->anc, &add->anc);
6384 }
6385
6386 static void
set_bound_node_opt_info(OptNode * opt,MinMaxLen * plen)6387 set_bound_node_opt_info(OptNode* opt, MinMaxLen* plen)
6388 {
6389 mml_copy(&(opt->sb.mm), plen);
6390 mml_copy(&(opt->spr.mm), plen);
6391 mml_copy(&(opt->map.mm), plen);
6392 }
6393
6394 static void
clear_node_opt_info(OptNode * opt)6395 clear_node_opt_info(OptNode* opt)
6396 {
6397 mml_clear(&opt->len);
6398 clear_opt_anc_info(&opt->anc);
6399 clear_opt_exact(&opt->sb);
6400 clear_opt_exact(&opt->sm);
6401 clear_opt_exact(&opt->spr);
6402 clear_opt_map(&opt->map);
6403 }
6404
6405 static void
copy_node_opt_info(OptNode * to,OptNode * from)6406 copy_node_opt_info(OptNode* to, OptNode* from)
6407 {
6408 *to = *from;
6409 }
6410
6411 static void
concat_left_node_opt_info(OnigEncoding enc,OptNode * to,OptNode * add)6412 concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add)
6413 {
6414 int sb_reach, sm_reach;
6415 OptAnc tanc;
6416
6417 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
6418 copy_opt_anc_info(&to->anc, &tanc);
6419
6420 if (add->sb.len > 0 && to->len.max == 0) {
6421 concat_opt_anc_info(&tanc, &to->anc, &add->sb.anc, to->len.max, add->len.max);
6422 copy_opt_anc_info(&add->sb.anc, &tanc);
6423 }
6424
6425 if (add->map.value > 0 && to->len.max == 0) {
6426 if (add->map.mm.max == 0)
6427 add->map.anc.left |= to->anc.left;
6428 }
6429
6430 sb_reach = to->sb.reach_end;
6431 sm_reach = to->sm.reach_end;
6432
6433 if (add->len.max != 0)
6434 to->sb.reach_end = to->sm.reach_end = 0;
6435
6436 if (add->sb.len > 0) {
6437 if (sb_reach) {
6438 concat_opt_exact(&to->sb, &add->sb, enc);
6439 clear_opt_exact(&add->sb);
6440 }
6441 else if (sm_reach) {
6442 concat_opt_exact(&to->sm, &add->sb, enc);
6443 clear_opt_exact(&add->sb);
6444 }
6445 }
6446 select_opt_exact(enc, &to->sm, &add->sb);
6447 select_opt_exact(enc, &to->sm, &add->sm);
6448
6449 if (to->spr.len > 0) {
6450 if (add->len.max > 0) {
6451 if (to->spr.mm.max == 0)
6452 select_opt_exact(enc, &to->sb, &to->spr);
6453 else
6454 select_opt_exact(enc, &to->sm, &to->spr);
6455 }
6456 }
6457 else if (add->spr.len > 0) {
6458 copy_opt_exact(&to->spr, &add->spr);
6459 }
6460
6461 select_opt_map(&to->map, &add->map);
6462 mml_add(&to->len, &add->len);
6463 }
6464
6465 static void
alt_merge_node_opt_info(OptNode * to,OptNode * add,OptEnv * env)6466 alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env)
6467 {
6468 alt_merge_opt_anc_info(&to->anc, &add->anc);
6469 alt_merge_opt_exact(&to->sb, &add->sb, env);
6470 alt_merge_opt_exact(&to->sm, &add->sm, env);
6471 alt_merge_opt_exact(&to->spr, &add->spr, env);
6472 alt_merge_opt_map(env->enc, &to->map, &add->map);
6473
6474 mml_alt_merge(&to->len, &add->len);
6475 }
6476
6477
6478 #define MAX_NODE_OPT_INFO_REF_COUNT 5
6479
6480 static int
optimize_nodes(Node * node,OptNode * opt,OptEnv * env)6481 optimize_nodes(Node* node, OptNode* opt, OptEnv* env)
6482 {
6483 int i;
6484 int r;
6485 OptNode xo;
6486 OnigEncoding enc;
6487
6488 r = 0;
6489 enc = env->enc;
6490 clear_node_opt_info(opt);
6491 set_bound_node_opt_info(opt, &env->mm);
6492
6493 switch (NODE_TYPE(node)) {
6494 case NODE_LIST:
6495 {
6496 OptEnv nenv;
6497 Node* nd = node;
6498
6499 copy_opt_env(&nenv, env);
6500 do {
6501 r = optimize_nodes(NODE_CAR(nd), &xo, &nenv);
6502 if (r == 0) {
6503 mml_add(&nenv.mm, &xo.len);
6504 concat_left_node_opt_info(enc, opt, &xo);
6505 }
6506 } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd)));
6507 }
6508 break;
6509
6510 case NODE_ALT:
6511 {
6512 Node* nd = node;
6513
6514 do {
6515 r = optimize_nodes(NODE_CAR(nd), &xo, env);
6516 if (r == 0) {
6517 if (nd == node) copy_node_opt_info(opt, &xo);
6518 else alt_merge_node_opt_info(opt, &xo, env);
6519 }
6520 } while ((r == 0) && IS_NOT_NULL(nd = NODE_CDR(nd)));
6521 }
6522 break;
6523
6524 case NODE_STRING:
6525 {
6526 StrNode* sn = STR_(node);
6527 int slen = (int )(sn->end - sn->s);
6528
6529 concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc);
6530 if (slen > 0) {
6531 add_char_opt_map(&opt->map, *(sn->s), enc);
6532 }
6533 mml_set_min_max(&opt->len, slen, slen);
6534 }
6535 break;
6536
6537 case NODE_CCLASS:
6538 {
6539 int z;
6540 CClassNode* cc = CCLASS_(node);
6541
6542 /* no need to check ignore case. (set in tune_tree()) */
6543
6544 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
6545 OnigLen min = ONIGENC_MBC_MINLEN(enc);
6546 OnigLen max = ONIGENC_MBC_MAXLEN_DIST(enc);
6547
6548 mml_set_min_max(&opt->len, min, max);
6549 }
6550 else {
6551 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
6552 z = BITSET_AT(cc->bs, i);
6553 if ((z && ! IS_NCCLASS_NOT(cc)) || (! z && IS_NCCLASS_NOT(cc))) {
6554 add_char_opt_map(&opt->map, (UChar )i, enc);
6555 }
6556 }
6557 mml_set_min_max(&opt->len, 1, 1);
6558 }
6559 }
6560 break;
6561
6562 case NODE_CTYPE:
6563 {
6564 int min, max;
6565 int range;
6566
6567 max = ONIGENC_MBC_MAXLEN_DIST(enc);
6568
6569 if (max == 1) {
6570 min = 1;
6571
6572 switch (CTYPE_(node)->ctype) {
6573 case CTYPE_ANYCHAR:
6574 break;
6575
6576 case ONIGENC_CTYPE_WORD:
6577 range = CTYPE_(node)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
6578 if (CTYPE_(node)->not != 0) {
6579 for (i = 0; i < range; i++) {
6580 if (! ONIGENC_IS_CODE_WORD(enc, i)) {
6581 add_char_opt_map(&opt->map, (UChar )i, enc);
6582 }
6583 }
6584 for (i = range; i < SINGLE_BYTE_SIZE; i++) {
6585 add_char_opt_map(&opt->map, (UChar )i, enc);
6586 }
6587 }
6588 else {
6589 for (i = 0; i < range; i++) {
6590 if (ONIGENC_IS_CODE_WORD(enc, i)) {
6591 add_char_opt_map(&opt->map, (UChar )i, enc);
6592 }
6593 }
6594 }
6595 break;
6596 }
6597 }
6598 else {
6599 min = ONIGENC_MBC_MINLEN(enc);
6600 }
6601 mml_set_min_max(&opt->len, min, max);
6602 }
6603 break;
6604
6605 case NODE_ANCHOR:
6606 switch (ANCHOR_(node)->type) {
6607 case ANCR_BEGIN_BUF:
6608 case ANCR_BEGIN_POSITION:
6609 case ANCR_BEGIN_LINE:
6610 case ANCR_END_BUF:
6611 case ANCR_SEMI_END_BUF:
6612 case ANCR_END_LINE:
6613 case ANCR_PREC_READ_NOT:
6614 case ANCR_LOOK_BEHIND:
6615 add_opt_anc_info(&opt->anc, ANCHOR_(node)->type);
6616 break;
6617
6618 case ANCR_PREC_READ:
6619 {
6620 r = optimize_nodes(NODE_BODY(node), &xo, env);
6621 if (r == 0) {
6622 if (xo.sb.len > 0)
6623 copy_opt_exact(&opt->spr, &xo.sb);
6624 else if (xo.sm.len > 0)
6625 copy_opt_exact(&opt->spr, &xo.sm);
6626
6627 opt->spr.reach_end = 0;
6628
6629 if (xo.map.value > 0)
6630 copy_opt_map(&opt->map, &xo.map);
6631 }
6632 }
6633 break;
6634
6635 case ANCR_LOOK_BEHIND_NOT:
6636 break;
6637 }
6638 break;
6639
6640 case NODE_BACKREF:
6641 if (! NODE_IS_CHECKER(node)) {
6642 OnigLen min, max;
6643
6644 min = node_min_byte_len(node, env->scan_env);
6645 max = node_max_byte_len(node, env->scan_env);
6646 mml_set_min_max(&opt->len, min, max);
6647 }
6648 break;
6649
6650 #ifdef USE_CALL
6651 case NODE_CALL:
6652 if (NODE_IS_RECURSION(node))
6653 mml_set_min_max(&opt->len, 0, INFINITE_LEN);
6654 else {
6655 r = optimize_nodes(NODE_BODY(node), opt, env);
6656 }
6657 break;
6658 #endif
6659
6660 case NODE_QUANT:
6661 {
6662 OnigLen min, max;
6663 QuantNode* qn = QUANT_(node);
6664
6665 /* Issue #175
6666 ex. /\g<1>{0}(?<=|())/
6667
6668 Empty and unused nodes in look-behind is removed in
6669 tune_look_behind().
6670 Called group nodes are assigned to be not called if the caller side is
6671 inside of zero-repetition.
6672 As a result, the nodes are considered unused.
6673 */
6674 if (qn->upper == 0) {
6675 mml_set_min_max(&opt->len, 0, 0);
6676 break;
6677 }
6678
6679 r = optimize_nodes(NODE_BODY(node), &xo, env);
6680 if (r != 0) break;
6681
6682 if (qn->lower > 0) {
6683 copy_node_opt_info(opt, &xo);
6684 if (xo.sb.len > 0) {
6685 if (xo.sb.reach_end) {
6686 for (i = 2; i <= qn->lower && ! is_full_opt_exact(&opt->sb); i++) {
6687 int rc = concat_opt_exact(&opt->sb, &xo.sb, enc);
6688 if (rc > 0) break;
6689 }
6690 if (i < qn->lower) opt->sb.reach_end = 0;
6691 }
6692 }
6693
6694 if (qn->lower != qn->upper) {
6695 opt->sb.reach_end = 0;
6696 opt->sm.reach_end = 0;
6697 }
6698 if (qn->lower > 1)
6699 opt->sm.reach_end = 0;
6700 }
6701
6702 if (IS_INFINITE_REPEAT(qn->upper)) {
6703 if (env->mm.max == 0 &&
6704 NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) {
6705 if (NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)))
6706 add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML);
6707 else
6708 add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF);
6709 }
6710
6711 max = (xo.len.max > 0 ? INFINITE_LEN : 0);
6712 }
6713 else {
6714 max = distance_multiply(xo.len.max, qn->upper);
6715 }
6716
6717 min = distance_multiply(xo.len.min, qn->lower);
6718 mml_set_min_max(&opt->len, min, max);
6719 }
6720 break;
6721
6722 case NODE_BAG:
6723 {
6724 BagNode* en = BAG_(node);
6725
6726 switch (en->type) {
6727 case BAG_STOP_BACKTRACK:
6728 case BAG_OPTION:
6729 r = optimize_nodes(NODE_BODY(node), opt, env);
6730 break;
6731
6732 case BAG_MEMORY:
6733 #ifdef USE_CALL
6734 en->opt_count++;
6735 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
6736 OnigLen min, max;
6737
6738 min = 0;
6739 max = INFINITE_LEN;
6740 if (NODE_IS_FIXED_MIN(node)) min = en->min_len;
6741 if (NODE_IS_FIXED_MAX(node)) max = en->max_len;
6742 mml_set_min_max(&opt->len, min, max);
6743 }
6744 else
6745 #endif
6746 {
6747 r = optimize_nodes(NODE_BODY(node), opt, env);
6748 if (is_set_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK)) {
6749 if (MEM_STATUS_AT0(env->scan_env->backrefed_mem, en->m.regnum))
6750 remove_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK);
6751 }
6752 }
6753 break;
6754
6755 case BAG_IF_ELSE:
6756 {
6757 OptEnv nenv;
6758
6759 copy_opt_env(&nenv, env);
6760 r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv);
6761 if (r == 0) {
6762 mml_add(&nenv.mm, &xo.len);
6763 concat_left_node_opt_info(enc, opt, &xo);
6764 if (IS_NOT_NULL(en->te.Then)) {
6765 r = optimize_nodes(en->te.Then, &xo, &nenv);
6766 if (r == 0) {
6767 concat_left_node_opt_info(enc, opt, &xo);
6768 }
6769 }
6770
6771 if (IS_NOT_NULL(en->te.Else)) {
6772 r = optimize_nodes(en->te.Else, &xo, env);
6773 if (r == 0)
6774 alt_merge_node_opt_info(opt, &xo, env);
6775 }
6776 }
6777 }
6778 break;
6779 }
6780 }
6781 break;
6782
6783 case NODE_GIMMICK:
6784 break;
6785
6786 default:
6787 #ifdef ONIG_DEBUG
6788 fprintf(DBGFP, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node));
6789 #endif
6790 r = ONIGERR_TYPE_BUG;
6791 break;
6792 }
6793
6794 return r;
6795 }
6796
6797 static int
set_optimize_exact(regex_t * reg,OptStr * e)6798 set_optimize_exact(regex_t* reg, OptStr* e)
6799 {
6800 int r;
6801 int allow_reverse;
6802
6803 if (e->len == 0) return 0;
6804
6805 reg->exact = (UChar* )xmalloc(e->len);
6806 CHECK_NULL_RETURN_MEMERR(reg->exact);
6807 xmemcpy(reg->exact, e->s, e->len);
6808 reg->exact_end = reg->exact + e->len;
6809
6810 allow_reverse =
6811 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
6812
6813 if (e->len >= 2 || (e->len >= 1 && allow_reverse)) {
6814 r = set_sunday_quick_search_or_bmh_skip_table(reg, 0,
6815 reg->exact, reg->exact_end,
6816 reg->map, &(reg->map_offset));
6817 if (r != 0) return r;
6818
6819 reg->optimize = (allow_reverse != 0
6820 ? OPTIMIZE_STR_FAST
6821 : OPTIMIZE_STR_FAST_STEP_FORWARD);
6822 }
6823 else {
6824 reg->optimize = OPTIMIZE_STR;
6825 }
6826
6827 reg->dist_min = e->mm.min;
6828 reg->dist_max = e->mm.max;
6829
6830 if (reg->dist_min != INFINITE_LEN) {
6831 int n = (int )(reg->exact_end - reg->exact);
6832 reg->threshold_len = reg->dist_min + n;
6833 }
6834
6835 return 0;
6836 }
6837
6838 static void
set_optimize_map(regex_t * reg,OptMap * m)6839 set_optimize_map(regex_t* reg, OptMap* m)
6840 {
6841 int i;
6842
6843 for (i = 0; i < CHAR_MAP_SIZE; i++)
6844 reg->map[i] = m->map[i];
6845
6846 reg->optimize = OPTIMIZE_MAP;
6847 reg->dist_min = m->mm.min;
6848 reg->dist_max = m->mm.max;
6849
6850 if (reg->dist_min != INFINITE_LEN) {
6851 reg->threshold_len = reg->dist_min + ONIGENC_MBC_MINLEN(reg->enc);
6852 }
6853 }
6854
6855 static void
set_sub_anchor(regex_t * reg,OptAnc * anc)6856 set_sub_anchor(regex_t* reg, OptAnc* anc)
6857 {
6858 reg->sub_anchor |= anc->left & ANCR_BEGIN_LINE;
6859 reg->sub_anchor |= anc->right & ANCR_END_LINE;
6860 }
6861
6862 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6863 static void print_optimize_info(FILE* f, regex_t* reg);
6864 #endif
6865
6866 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)6867 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
6868 {
6869 int r;
6870 OptNode opt;
6871 OptEnv env;
6872
6873 env.enc = reg->enc;
6874 env.case_fold_flag = reg->case_fold_flag;
6875 env.scan_env = scan_env;
6876 mml_clear(&env.mm);
6877
6878 r = optimize_nodes(node, &opt, &env);
6879 if (r != 0) return r;
6880
6881 reg->anchor = opt.anc.left & (ANCR_BEGIN_BUF |
6882 ANCR_BEGIN_POSITION | ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML |
6883 ANCR_LOOK_BEHIND);
6884
6885 if ((opt.anc.left & (ANCR_LOOK_BEHIND | ANCR_PREC_READ_NOT)) != 0)
6886 reg->anchor &= ~ANCR_ANYCHAR_INF_ML;
6887
6888 reg->anchor |= opt.anc.right & (ANCR_END_BUF | ANCR_SEMI_END_BUF |
6889 ANCR_PREC_READ_NOT);
6890
6891 if (reg->anchor & (ANCR_END_BUF | ANCR_SEMI_END_BUF)) {
6892 reg->anc_dist_min = opt.len.min;
6893 reg->anc_dist_max = opt.len.max;
6894 }
6895
6896 if (opt.sb.len > 0 || opt.sm.len > 0) {
6897 select_opt_exact(reg->enc, &opt.sb, &opt.sm);
6898 if (opt.map.value > 0 && comp_opt_exact_or_map(&opt.sb, &opt.map) > 0) {
6899 goto set_map;
6900 }
6901 else {
6902 r = set_optimize_exact(reg, &opt.sb);
6903 set_sub_anchor(reg, &opt.sb.anc);
6904 }
6905 }
6906 else if (opt.map.value > 0) {
6907 set_map:
6908 set_optimize_map(reg, &opt.map);
6909 set_sub_anchor(reg, &opt.map.anc);
6910 }
6911 else {
6912 reg->sub_anchor |= opt.anc.left & ANCR_BEGIN_LINE;
6913 if (opt.len.max == 0)
6914 reg->sub_anchor |= opt.anc.right & ANCR_END_LINE;
6915 }
6916
6917 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6918 print_optimize_info(DBGFP, reg);
6919 #endif
6920 return r;
6921 }
6922
6923 static void
clear_optimize_info(regex_t * reg)6924 clear_optimize_info(regex_t* reg)
6925 {
6926 reg->optimize = OPTIMIZE_NONE;
6927 reg->anchor = 0;
6928 reg->anc_dist_min = 0;
6929 reg->anc_dist_max = 0;
6930 reg->sub_anchor = 0;
6931 reg->exact_end = (UChar* )NULL;
6932 reg->map_offset = 0;
6933 reg->threshold_len = 0;
6934 if (IS_NOT_NULL(reg->exact)) {
6935 xfree(reg->exact);
6936 reg->exact = (UChar* )NULL;
6937 }
6938 }
6939
6940 #ifdef ONIG_DEBUG
6941
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)6942 static void print_enc_string(FILE* fp, OnigEncoding enc,
6943 const UChar *s, const UChar *end)
6944 {
6945 if (ONIGENC_MBC_MINLEN(enc) > 1) {
6946 const UChar *p;
6947 OnigCodePoint code;
6948
6949 p = s;
6950 while (p < end) {
6951 code = ONIGENC_MBC_TO_CODE(enc, p, end);
6952 if (code >= 0x80) {
6953 fprintf(fp, " 0x%04x ", (int )code);
6954 }
6955 else {
6956 fputc((int )code, fp);
6957 }
6958
6959 p += enclen(enc, p);
6960 }
6961 }
6962 else {
6963 while (s < end) {
6964 fputc((int )*s, fp);
6965 s++;
6966 }
6967 }
6968
6969 fprintf(fp, "/\n");
6970 }
6971
6972 #endif /* ONIG_DEBUG */
6973
6974 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6975
6976 static void
print_distance_range(FILE * f,OnigLen a,OnigLen b)6977 print_distance_range(FILE* f, OnigLen a, OnigLen b)
6978 {
6979 if (a == INFINITE_LEN)
6980 fputs("inf", f);
6981 else
6982 fprintf(f, "(%u)", a);
6983
6984 fputs("-", f);
6985
6986 if (b == INFINITE_LEN)
6987 fputs("inf", f);
6988 else
6989 fprintf(f, "(%u)", b);
6990 }
6991
6992 static void
print_anchor(FILE * f,int anchor)6993 print_anchor(FILE* f, int anchor)
6994 {
6995 int q = 0;
6996
6997 fprintf(f, "[");
6998
6999 if (anchor & ANCR_BEGIN_BUF) {
7000 fprintf(f, "begin-buf");
7001 q = 1;
7002 }
7003 if (anchor & ANCR_BEGIN_LINE) {
7004 if (q) fprintf(f, ", ");
7005 q = 1;
7006 fprintf(f, "begin-line");
7007 }
7008 if (anchor & ANCR_BEGIN_POSITION) {
7009 if (q) fprintf(f, ", ");
7010 q = 1;
7011 fprintf(f, "begin-pos");
7012 }
7013 if (anchor & ANCR_END_BUF) {
7014 if (q) fprintf(f, ", ");
7015 q = 1;
7016 fprintf(f, "end-buf");
7017 }
7018 if (anchor & ANCR_SEMI_END_BUF) {
7019 if (q) fprintf(f, ", ");
7020 q = 1;
7021 fprintf(f, "semi-end-buf");
7022 }
7023 if (anchor & ANCR_END_LINE) {
7024 if (q) fprintf(f, ", ");
7025 q = 1;
7026 fprintf(f, "end-line");
7027 }
7028 if (anchor & ANCR_ANYCHAR_INF) {
7029 if (q) fprintf(f, ", ");
7030 q = 1;
7031 fprintf(f, "anychar-inf");
7032 }
7033 if (anchor & ANCR_ANYCHAR_INF_ML) {
7034 if (q) fprintf(f, ", ");
7035 fprintf(f, "anychar-inf-ml");
7036 }
7037
7038 fprintf(f, "]");
7039 }
7040
7041 static void
print_optimize_info(FILE * f,regex_t * reg)7042 print_optimize_info(FILE* f, regex_t* reg)
7043 {
7044 static const char* on[] =
7045 { "NONE", "STR", "STR_FAST", "STR_FAST_STEP_FORWARD", "MAP" };
7046
7047 fprintf(f, "optimize: %s\n", on[reg->optimize]);
7048 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
7049 if ((reg->anchor & ANCR_END_BUF_MASK) != 0)
7050 print_distance_range(f, reg->anc_dist_min, reg->anc_dist_max);
7051 fprintf(f, "\n");
7052
7053 if (reg->optimize) {
7054 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
7055 fprintf(f, "\n");
7056 }
7057 fprintf(f, "\n");
7058
7059 if (reg->exact) {
7060 UChar *p;
7061 fprintf(f, "exact: [");
7062 for (p = reg->exact; p < reg->exact_end; p++) {
7063 fputc(*p, f);
7064 }
7065 fprintf(f, "]: length: %ld, dmin: %u, dmax: %u\n",
7066 (reg->exact_end - reg->exact), reg->dist_min, reg->dist_max);
7067 }
7068 else if (reg->optimize & OPTIMIZE_MAP) {
7069 int c, i, n = 0;
7070
7071 for (i = 0; i < CHAR_MAP_SIZE; i++)
7072 if (reg->map[i]) n++;
7073
7074 fprintf(f, "map: n=%d, dmin: %u, dmax: %u\n",
7075 n, reg->dist_min, reg->dist_max);
7076 if (n > 0) {
7077 c = 0;
7078 fputc('[', f);
7079 for (i = 0; i < CHAR_MAP_SIZE; i++) {
7080 if (reg->map[i] != 0) {
7081 if (c > 0) fputs(", ", f);
7082 c++;
7083 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
7084 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
7085 fputc(i, f);
7086 else
7087 fprintf(f, "%d", i);
7088 }
7089 }
7090 fprintf(f, "]\n");
7091 }
7092 }
7093 }
7094 #endif
7095
7096
7097 extern RegexExt*
onig_get_regex_ext(regex_t * reg)7098 onig_get_regex_ext(regex_t* reg)
7099 {
7100 if (IS_NULL(reg->extp)) {
7101 RegexExt* ext = (RegexExt* )xmalloc(sizeof(*ext));
7102 if (IS_NULL(ext)) return 0;
7103
7104 ext->pattern = 0;
7105 ext->pattern_end = 0;
7106 #ifdef USE_CALLOUT
7107 ext->tag_table = 0;
7108 ext->callout_num = 0;
7109 ext->callout_list_alloc = 0;
7110 ext->callout_list = 0;
7111 #endif
7112
7113 reg->extp = ext;
7114 }
7115
7116 return reg->extp;
7117 }
7118
7119 static void
free_regex_ext(RegexExt * ext)7120 free_regex_ext(RegexExt* ext)
7121 {
7122 if (IS_NOT_NULL(ext)) {
7123 if (IS_NOT_NULL(ext->pattern))
7124 xfree((void* )ext->pattern);
7125
7126 #ifdef USE_CALLOUT
7127 if (IS_NOT_NULL(ext->tag_table))
7128 onig_callout_tag_table_free(ext->tag_table);
7129
7130 if (IS_NOT_NULL(ext->callout_list))
7131 onig_free_reg_callout_list(ext->callout_num, ext->callout_list);
7132 #endif
7133
7134 xfree(ext);
7135 }
7136 }
7137
7138 extern int
onig_ext_set_pattern(regex_t * reg,const UChar * pattern,const UChar * pattern_end)7139 onig_ext_set_pattern(regex_t* reg, const UChar* pattern, const UChar* pattern_end)
7140 {
7141 RegexExt* ext;
7142 UChar* s;
7143
7144 ext = onig_get_regex_ext(reg);
7145 CHECK_NULL_RETURN_MEMERR(ext);
7146
7147 s = onigenc_strdup(reg->enc, pattern, pattern_end);
7148 CHECK_NULL_RETURN_MEMERR(s);
7149
7150 ext->pattern = s;
7151 ext->pattern_end = s + (pattern_end - pattern);
7152
7153 return ONIG_NORMAL;
7154 }
7155
7156 extern void
onig_free_body(regex_t * reg)7157 onig_free_body(regex_t* reg)
7158 {
7159 if (IS_NOT_NULL(reg)) {
7160 ops_free(reg);
7161 if (IS_NOT_NULL(reg->string_pool)) {
7162 xfree(reg->string_pool);
7163 reg->string_pool_end = reg->string_pool = 0;
7164 }
7165 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
7166 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
7167 if (IS_NOT_NULL(reg->extp)) {
7168 free_regex_ext(reg->extp);
7169 reg->extp = 0;
7170 }
7171
7172 onig_names_free(reg);
7173 }
7174 }
7175
7176 extern void
onig_free(regex_t * reg)7177 onig_free(regex_t* reg)
7178 {
7179 if (IS_NOT_NULL(reg)) {
7180 onig_free_body(reg);
7181 xfree(reg);
7182 }
7183 }
7184
7185
7186 #ifdef ONIG_DEBUG_PARSE
7187 static void print_tree P_((FILE* f, Node* node));
7188 #endif
7189
7190 extern int onig_init_for_match_at(regex_t* reg);
7191
7192 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)7193 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
7194 OnigErrorInfo* einfo)
7195 {
7196 int r;
7197 Node* root;
7198 ScanEnv scan_env;
7199 #ifdef USE_CALL
7200 UnsetAddrList uslist = {0};
7201 #endif
7202
7203 root = 0;
7204 if (IS_NOT_NULL(einfo)) {
7205 einfo->enc = reg->enc;
7206 einfo->par = (UChar* )NULL;
7207 }
7208
7209 #ifdef ONIG_DEBUG
7210 fprintf(DBGFP, "\nPATTERN: /");
7211 print_enc_string(DBGFP, reg->enc, pattern, pattern_end);
7212 #endif
7213
7214 if (reg->ops_alloc == 0) {
7215 r = ops_init(reg, OPS_INIT_SIZE);
7216 if (r != 0) goto end;
7217 }
7218 else
7219 reg->ops_used = 0;
7220
7221 reg->string_pool = 0;
7222 reg->string_pool_end = 0;
7223 reg->num_mem = 0;
7224 reg->num_repeat = 0;
7225 reg->num_empty_check = 0;
7226 reg->repeat_range_alloc = 0;
7227 reg->repeat_range = (RepeatRange* )NULL;
7228 reg->empty_status_mem = 0;
7229
7230 r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env);
7231 if (r != 0) goto err;
7232
7233 r = reduce_string_list(root);
7234 if (r != 0) goto err;
7235
7236 /* mixed use named group and no-named group */
7237 if (scan_env.num_named > 0 &&
7238 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
7239 ! OPTON_CAPTURE_GROUP(reg->options)) {
7240 if (scan_env.num_named != scan_env.num_mem)
7241 r = disable_noname_group_capture(&root, reg, &scan_env);
7242 else
7243 r = numbered_ref_check(root);
7244
7245 if (r != 0) goto err;
7246 }
7247
7248 r = check_backrefs(root, &scan_env);
7249 if (r != 0) goto err;
7250
7251 #ifdef USE_CALL
7252 if (scan_env.num_call > 0) {
7253 r = unset_addr_list_init(&uslist, scan_env.num_call);
7254 if (r != 0) goto err;
7255 scan_env.unset_addr_list = &uslist;
7256 r = tune_call(root, &scan_env, 0);
7257 if (r != 0) goto err_unset;
7258 r = tune_call2(root);
7259 if (r != 0) goto err_unset;
7260 r = recursive_call_check_trav(root, &scan_env, 0);
7261 if (r < 0) goto err_unset;
7262 r = infinite_recursive_call_check_trav(root, &scan_env);
7263 if (r != 0) goto err_unset;
7264
7265 tune_called_state(root, 0);
7266 }
7267
7268 reg->num_call = scan_env.num_call;
7269 #endif
7270
7271 #ifdef ONIG_DEBUG_PARSE
7272 fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth);
7273 fprintf(DBGFP, "TREE (parsed)\n");
7274 print_tree(DBGFP, root);
7275 fprintf(DBGFP, "\n");
7276 #endif
7277
7278 r = tune_tree(root, reg, 0, &scan_env);
7279 if (r != 0) goto err_unset;
7280
7281 if (scan_env.backref_num != 0) {
7282 set_parent_node_trav(root, NULL_NODE);
7283 r = set_empty_repeat_node_trav(root, NULL_NODE, &scan_env);
7284 if (r != 0) goto err_unset;
7285 set_empty_status_check_trav(root, &scan_env);
7286 }
7287
7288 #ifdef ONIG_DEBUG_PARSE
7289 fprintf(DBGFP, "TREE (after tune)\n");
7290 print_tree(DBGFP, root);
7291 fprintf(DBGFP, "\n");
7292 #endif
7293
7294 reg->capture_history = scan_env.cap_history;
7295 reg->push_mem_start = scan_env.backtrack_mem | scan_env.cap_history;
7296
7297 #ifdef USE_CALLOUT
7298 if (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) {
7299 reg->push_mem_end = reg->push_mem_start;
7300 }
7301 else {
7302 if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start))
7303 reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history;
7304 else
7305 reg->push_mem_end = reg->push_mem_start &
7306 (scan_env.backrefed_mem | scan_env.cap_history);
7307 }
7308 #else
7309 if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start))
7310 reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history;
7311 else
7312 reg->push_mem_end = reg->push_mem_start &
7313 (scan_env.backrefed_mem | scan_env.cap_history);
7314 #endif
7315
7316 clear_optimize_info(reg);
7317 #ifndef ONIG_DONT_OPTIMIZE
7318 r = set_optimize_info_from_tree(root, reg, &scan_env);
7319 if (r != 0) goto err_unset;
7320 #endif
7321
7322 if (IS_NOT_NULL(scan_env.mem_env_dynamic)) {
7323 xfree(scan_env.mem_env_dynamic);
7324 scan_env.mem_env_dynamic = (MemEnv* )NULL;
7325 }
7326
7327 r = compile_tree(root, reg, &scan_env);
7328 if (r == 0) {
7329 if (scan_env.keep_num > 0) {
7330 r = add_op(reg, OP_UPDATE_VAR);
7331 if (r != 0) goto err;
7332
7333 COP(reg)->update_var.type = UPDATE_VAR_KEEP_FROM_STACK_LAST;
7334 COP(reg)->update_var.id = 0; /* not used */
7335 COP(reg)->update_var.clear = FALSE;
7336 }
7337
7338 r = add_op(reg, OP_END);
7339 if (r != 0) goto err;
7340
7341 #ifdef USE_CALL
7342 if (scan_env.num_call > 0) {
7343 r = fix_unset_addr_list(&uslist, reg);
7344 unset_addr_list_end(&uslist);
7345 if (r != 0) goto err;
7346 }
7347 #endif
7348
7349 set_addr_in_repeat_range(reg);
7350
7351 if ((reg->push_mem_end != 0)
7352 #ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR
7353 || (reg->num_repeat != 0)
7354 || (reg->num_empty_check != 0)
7355 #endif
7356 #ifdef USE_CALLOUT
7357 || (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0)
7358 #endif
7359 #ifdef USE_CALL
7360 || scan_env.num_call > 0
7361 #endif
7362 )
7363 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
7364 else {
7365 if (reg->push_mem_start != 0)
7366 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
7367 else
7368 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
7369 }
7370
7371 r = ops_make_string_pool(reg);
7372 if (r != 0) goto err;
7373 }
7374 #ifdef USE_CALL
7375 else if (scan_env.num_call > 0) {
7376 unset_addr_list_end(&uslist);
7377 }
7378 #endif
7379 onig_node_free(root);
7380
7381 #ifdef ONIG_DEBUG_COMPILE
7382 onig_print_names(DBGFP, reg);
7383 onig_print_compiled_byte_code_list(DBGFP, reg);
7384 #endif
7385
7386 #ifdef USE_DIRECT_THREADED_CODE
7387 /* opcode -> opaddr */
7388 onig_init_for_match_at(reg);
7389 #endif
7390
7391 end:
7392 return r;
7393
7394 err_unset:
7395 #ifdef USE_CALL
7396 if (scan_env.num_call > 0) {
7397 unset_addr_list_end(&uslist);
7398 }
7399 #endif
7400 err:
7401 if (IS_NOT_NULL(scan_env.error)) {
7402 if (IS_NOT_NULL(einfo)) {
7403 einfo->par = scan_env.error;
7404 einfo->par_end = scan_env.error_end;
7405 }
7406 }
7407
7408 onig_node_free(root);
7409 if (IS_NOT_NULL(scan_env.mem_env_dynamic))
7410 xfree(scan_env.mem_env_dynamic);
7411 return r;
7412 }
7413
7414
7415 static int onig_inited = 0;
7416
7417 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)7418 onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_flag,
7419 OnigEncoding enc, OnigSyntaxType* syntax)
7420 {
7421 int r;
7422
7423 xmemset(reg, 0, sizeof(*reg));
7424
7425 if (onig_inited == 0) {
7426 #if 0
7427 return ONIGERR_LIBRARY_IS_NOT_INITIALIZED;
7428 #else
7429 r = onig_initialize(&enc, 1);
7430 if (r != 0)
7431 return ONIGERR_FAIL_TO_INITIALIZE;
7432
7433 onig_warning("You didn't call onig_initialize() explicitly");
7434 #endif
7435 }
7436
7437 if (IS_NULL(reg))
7438 return ONIGERR_INVALID_ARGUMENT;
7439
7440 if (ONIGENC_IS_UNDEF(enc))
7441 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
7442
7443 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
7444 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
7445 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
7446 }
7447
7448 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
7449 option |= syntax->options;
7450 option &= ~ONIG_OPTION_SINGLELINE;
7451 }
7452 else
7453 option |= syntax->options;
7454
7455 (reg)->enc = enc;
7456 (reg)->options = option;
7457 (reg)->syntax = syntax;
7458 (reg)->optimize = 0;
7459 (reg)->exact = (UChar* )NULL;
7460 (reg)->extp = (RegexExt* )NULL;
7461 (reg)->ops = (Operation* )NULL;
7462 (reg)->ops_curr = (Operation* )NULL;
7463 (reg)->ops_used = 0;
7464 (reg)->ops_alloc = 0;
7465 (reg)->name_table = (void* )NULL;
7466 (reg)->case_fold_flag = case_fold_flag;
7467 return 0;
7468 }
7469
7470 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)7471 onig_new_without_alloc(regex_t* reg,
7472 const UChar* pattern, const UChar* pattern_end,
7473 OnigOptionType option, OnigEncoding enc,
7474 OnigSyntaxType* syntax, OnigErrorInfo* einfo)
7475 {
7476 int r;
7477
7478 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
7479 if (r != 0) return r;
7480
7481 r = onig_compile(reg, pattern, pattern_end, einfo);
7482 return r;
7483 }
7484
7485 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)7486 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
7487 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
7488 OnigErrorInfo* einfo)
7489 {
7490 int r;
7491
7492 *reg = (regex_t* )xmalloc(sizeof(regex_t));
7493 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
7494
7495 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
7496 if (r != 0) {
7497 xfree(*reg);
7498 *reg = NULL;
7499 return r;
7500 }
7501
7502 r = onig_compile(*reg, pattern, pattern_end, einfo);
7503 if (r != 0) {
7504 onig_free(*reg);
7505 *reg = NULL;
7506 }
7507 return r;
7508 }
7509
7510 extern int
onig_initialize(OnigEncoding encodings[],int n)7511 onig_initialize(OnigEncoding encodings[], int n)
7512 {
7513 int i;
7514 int r;
7515
7516 if (onig_inited != 0)
7517 return 0;
7518
7519 onigenc_init();
7520
7521 onig_inited = 1;
7522
7523 for (i = 0; i < n; i++) {
7524 OnigEncoding enc = encodings[i];
7525 r = onig_initialize_encoding(enc);
7526 if (r != 0)
7527 return r;
7528 }
7529
7530 return ONIG_NORMAL;
7531 }
7532
7533 typedef struct EndCallListItem {
7534 struct EndCallListItem* next;
7535 void (*func)(void);
7536 } EndCallListItemType;
7537
7538 static EndCallListItemType* EndCallTop;
7539
onig_add_end_call(void (* func)(void))7540 extern void onig_add_end_call(void (*func)(void))
7541 {
7542 EndCallListItemType* item;
7543
7544 item = (EndCallListItemType* )xmalloc(sizeof(*item));
7545 if (item == 0) return ;
7546
7547 item->next = EndCallTop;
7548 item->func = func;
7549
7550 EndCallTop = item;
7551 }
7552
7553 static void
exec_end_call_list(void)7554 exec_end_call_list(void)
7555 {
7556 EndCallListItemType* prev;
7557 void (*func)(void);
7558
7559 while (EndCallTop != 0) {
7560 func = EndCallTop->func;
7561 (*func)();
7562
7563 prev = EndCallTop;
7564 EndCallTop = EndCallTop->next;
7565 xfree(prev);
7566 }
7567 }
7568
7569 extern int
onig_end(void)7570 onig_end(void)
7571 {
7572 exec_end_call_list();
7573
7574 #ifdef USE_CALLOUT
7575 onig_global_callout_names_free();
7576 #endif
7577
7578 onigenc_end();
7579
7580 onig_inited = 0;
7581
7582 return 0;
7583 }
7584
7585 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)7586 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
7587 {
7588 OnigCodePoint n, *data;
7589 OnigCodePoint low, high, x;
7590
7591 GET_CODE_POINT(n, p);
7592 data = (OnigCodePoint* )p;
7593 data++;
7594
7595 for (low = 0, high = n; low < high; ) {
7596 x = (low + high) >> 1;
7597 if (code > data[x * 2 + 1])
7598 low = x + 1;
7599 else
7600 high = x;
7601 }
7602
7603 return ((low < n && code >= data[low * 2]) ? 1 : 0);
7604 }
7605
7606 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,void * cc_arg)7607 onig_is_code_in_cc_len(int elen, OnigCodePoint code, /* CClassNode* */ void* cc_arg)
7608 {
7609 int found;
7610 CClassNode* cc = (CClassNode* )cc_arg;
7611
7612 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
7613 if (IS_NULL(cc->mbuf)) {
7614 found = 0;
7615 }
7616 else {
7617 found = onig_is_in_code_range(cc->mbuf->p, code) != 0;
7618 }
7619 }
7620 else {
7621 found = BITSET_AT(cc->bs, code) != 0;
7622 }
7623
7624 if (IS_NCCLASS_NOT(cc))
7625 return !found;
7626 else
7627 return found;
7628 }
7629
7630 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)7631 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
7632 {
7633 int len;
7634
7635 if (ONIGENC_MBC_MINLEN(enc) > 1) {
7636 len = 2;
7637 }
7638 else {
7639 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
7640 if (len < 0) return 0;
7641 }
7642 return onig_is_code_in_cc_len(len, code, cc);
7643 }
7644
7645
7646 #ifdef ONIG_DEBUG_PARSE
7647
7648 #ifdef USE_CALL
7649 static void
p_string(FILE * f,int len,UChar * s)7650 p_string(FILE* f, int len, UChar* s)
7651 {
7652 fputs(":", f);
7653 while (len-- > 0) { fputc(*s++, f); }
7654 }
7655 #endif
7656
7657 static void
Indent(FILE * f,int indent)7658 Indent(FILE* f, int indent)
7659 {
7660 int i;
7661 for (i = 0; i < indent; i++) putc(' ', f);
7662 }
7663
7664 static void
print_indent_tree(FILE * f,Node * node,int indent)7665 print_indent_tree(FILE* f, Node* node, int indent)
7666 {
7667 int i;
7668 NodeType type;
7669 UChar* p;
7670 int add = 3;
7671
7672 Indent(f, indent);
7673 if (IS_NULL(node)) {
7674 fprintf(f, "ERROR: null node!!!\n");
7675 exit(0);
7676 }
7677
7678 type = NODE_TYPE(node);
7679 switch (type) {
7680 case NODE_LIST:
7681 case NODE_ALT:
7682 if (type == NODE_LIST)
7683 fprintf(f, "<list:%p>\n", node);
7684 else
7685 fprintf(f, "<alt:%p>\n", node);
7686
7687 print_indent_tree(f, NODE_CAR(node), indent + add);
7688 while (IS_NOT_NULL(node = NODE_CDR(node))) {
7689 if (NODE_TYPE(node) != type) {
7690 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NODE_TYPE(node));
7691 exit(0);
7692 }
7693 print_indent_tree(f, NODE_CAR(node), indent + add);
7694 }
7695 break;
7696
7697 case NODE_STRING:
7698 {
7699 char* str;
7700 char* mode;
7701
7702 if (NODE_STRING_IS_CRUDE(node))
7703 mode = "-crude";
7704 else if (NODE_IS_IGNORECASE(node))
7705 mode = "-ignorecase";
7706 else
7707 mode = "";
7708
7709 if (STR_(node)->s == STR_(node)->end)
7710 str = "empty-string";
7711 else
7712 str = "string";
7713
7714 fprintf(f, "<%s%s:%p>", str, mode, node);
7715 for (p = STR_(node)->s; p < STR_(node)->end; p++) {
7716 if (*p >= 0x20 && *p < 0x7f)
7717 fputc(*p, f);
7718 else {
7719 fprintf(f, " 0x%02x", *p);
7720 }
7721 }
7722 }
7723 break;
7724
7725 case NODE_CCLASS:
7726 fprintf(f, "<cclass:%p>", node);
7727 if (IS_NCCLASS_NOT(CCLASS_(node))) fputs(" not", f);
7728 if (CCLASS_(node)->mbuf) {
7729 BBuf* bbuf = CCLASS_(node)->mbuf;
7730 for (i = 0; i < bbuf->used; i++) {
7731 if (i > 0) fprintf(f, ",");
7732 fprintf(f, "%0x", bbuf->p[i]);
7733 }
7734 }
7735 break;
7736
7737 case NODE_CTYPE:
7738 fprintf(f, "<ctype:%p> ", node);
7739 switch (CTYPE_(node)->ctype) {
7740 case CTYPE_ANYCHAR:
7741 fprintf(f, "anychar");
7742 break;
7743
7744 case ONIGENC_CTYPE_WORD:
7745 if (CTYPE_(node)->not != 0)
7746 fputs("not word", f);
7747 else
7748 fputs("word", f);
7749
7750 if (CTYPE_(node)->ascii_mode != 0)
7751 fputs(" (ascii)", f);
7752
7753 break;
7754
7755 default:
7756 fprintf(f, "ERROR: undefined ctype.\n");
7757 exit(0);
7758 }
7759 break;
7760
7761 case NODE_ANCHOR:
7762 fprintf(f, "<anchor:%p> ", node);
7763 switch (ANCHOR_(node)->type) {
7764 case ANCR_BEGIN_BUF: fputs("begin buf", f); break;
7765 case ANCR_END_BUF: fputs("end buf", f); break;
7766 case ANCR_BEGIN_LINE: fputs("begin line", f); break;
7767 case ANCR_END_LINE: fputs("end line", f); break;
7768 case ANCR_SEMI_END_BUF: fputs("semi end buf", f); break;
7769 case ANCR_BEGIN_POSITION: fputs("begin position", f); break;
7770
7771 case ANCR_WORD_BOUNDARY: fputs("word boundary", f); break;
7772 case ANCR_NO_WORD_BOUNDARY: fputs("not word boundary", f); break;
7773 #ifdef USE_WORD_BEGIN_END
7774 case ANCR_WORD_BEGIN: fputs("word begin", f); break;
7775 case ANCR_WORD_END: fputs("word end", f); break;
7776 #endif
7777 case ANCR_TEXT_SEGMENT_BOUNDARY:
7778 fputs("text-segment boundary", f); break;
7779 case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
7780 fputs("no text-segment boundary", f); break;
7781 case ANCR_PREC_READ:
7782 fprintf(f, "prec read\n");
7783 print_indent_tree(f, NODE_BODY(node), indent + add);
7784 break;
7785 case ANCR_PREC_READ_NOT:
7786 fprintf(f, "prec read not\n");
7787 print_indent_tree(f, NODE_BODY(node), indent + add);
7788 break;
7789 case ANCR_LOOK_BEHIND:
7790 fprintf(f, "look behind\n");
7791 print_indent_tree(f, NODE_BODY(node), indent + add);
7792 break;
7793 case ANCR_LOOK_BEHIND_NOT:
7794 fprintf(f, "look behind not\n");
7795 print_indent_tree(f, NODE_BODY(node), indent + add);
7796 break;
7797
7798 default:
7799 fprintf(f, "ERROR: undefined anchor type.\n");
7800 break;
7801 }
7802 break;
7803
7804 case NODE_BACKREF:
7805 {
7806 int* p;
7807 BackRefNode* br = BACKREF_(node);
7808 p = BACKREFS_P(br);
7809 fprintf(f, "<backref%s:%p>", NODE_IS_CHECKER(node) ? "-checker" : "", node);
7810 for (i = 0; i < br->back_num; i++) {
7811 if (i > 0) fputs(", ", f);
7812 fprintf(f, "%d", p[i]);
7813 }
7814 }
7815 break;
7816
7817 #ifdef USE_CALL
7818 case NODE_CALL:
7819 {
7820 CallNode* cn = CALL_(node);
7821 fprintf(f, "<call:%p>", node);
7822 p_string(f, cn->name_end - cn->name, cn->name);
7823 }
7824 break;
7825 #endif
7826
7827 case NODE_QUANT:
7828 fprintf(f, "<quantifier:%p>{%d,%d}%s%s\n", node,
7829 QUANT_(node)->lower, QUANT_(node)->upper,
7830 (QUANT_(node)->greedy ? "" : "?"),
7831 QUANT_(node)->include_referred == 0 ? "" : " referred");
7832 print_indent_tree(f, NODE_BODY(node), indent + add);
7833 break;
7834
7835 case NODE_BAG:
7836 fprintf(f, "<bag:%p> ", node);
7837 if (BAG_(node)->type == BAG_IF_ELSE) {
7838 Node* Then;
7839 Node* Else;
7840 BagNode* bn;
7841
7842 bn = BAG_(node);
7843 fprintf(f, "if-else\n");
7844 print_indent_tree(f, NODE_BODY(node), indent + add);
7845
7846 Then = bn->te.Then;
7847 Else = bn->te.Else;
7848 if (IS_NULL(Then)) {
7849 Indent(f, indent + add);
7850 fprintf(f, "THEN empty\n");
7851 }
7852 else
7853 print_indent_tree(f, Then, indent + add);
7854
7855 if (IS_NULL(Else)) {
7856 Indent(f, indent + add);
7857 fprintf(f, "ELSE empty\n");
7858 }
7859 else
7860 print_indent_tree(f, Else, indent + add);
7861
7862 break;
7863 }
7864
7865 switch (BAG_(node)->type) {
7866 case BAG_OPTION:
7867 fprintf(f, "option:%d", BAG_(node)->o.options);
7868 break;
7869 case BAG_MEMORY:
7870 fprintf(f, "memory:%d", BAG_(node)->m.regnum);
7871 if (NODE_IS_CALLED(node))
7872 fprintf(f, ", called");
7873 if (NODE_IS_FIXED_ADDR(node))
7874 fprintf(f, ", fixed-addr");
7875 break;
7876 case BAG_STOP_BACKTRACK:
7877 fprintf(f, "stop-bt");
7878 break;
7879 default:
7880 break;
7881 }
7882 fprintf(f, "\n");
7883 print_indent_tree(f, NODE_BODY(node), indent + add);
7884 break;
7885
7886 case NODE_GIMMICK:
7887 fprintf(f, "<gimmick:%p> ", node);
7888 switch (GIMMICK_(node)->type) {
7889 case GIMMICK_FAIL:
7890 fprintf(f, "fail");
7891 break;
7892 case GIMMICK_SAVE:
7893 fprintf(f, "save:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
7894 break;
7895 case GIMMICK_UPDATE_VAR:
7896 fprintf(f, "update_var:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
7897 break;
7898 #ifdef USE_CALLOUT
7899 case GIMMICK_CALLOUT:
7900 switch (GIMMICK_(node)->detail_type) {
7901 case ONIG_CALLOUT_OF_CONTENTS:
7902 fprintf(f, "callout:contents:%d", GIMMICK_(node)->num);
7903 break;
7904 case ONIG_CALLOUT_OF_NAME:
7905 fprintf(f, "callout:name:%d:%d", GIMMICK_(node)->id, GIMMICK_(node)->num);
7906 break;
7907 }
7908 #endif
7909 }
7910 break;
7911
7912 default:
7913 fprintf(f, "print_indent_tree: undefined node type %d\n", NODE_TYPE(node));
7914 break;
7915 }
7916
7917 if (type != NODE_LIST && type != NODE_ALT && type != NODE_QUANT &&
7918 type != NODE_BAG)
7919 fprintf(f, "\n");
7920 fflush(f);
7921 }
7922
7923 static void
print_tree(FILE * f,Node * node)7924 print_tree(FILE* f, Node* node)
7925 {
7926 print_indent_tree(f, node, 0);
7927 }
7928 #endif
7929