1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31
32 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
33
34 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)35 onig_get_default_case_fold_flag(void)
36 {
37 return OnigDefaultCaseFoldFlag;
38 }
39
40 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)41 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
42 {
43 OnigDefaultCaseFoldFlag = case_fold_flag;
44 return 0;
45 }
46
47
48 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
49 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
50 #endif
51
52 static UChar*
str_dup(UChar * s,UChar * end)53 str_dup(UChar* s, UChar* end)
54 {
55 int len = end - s;
56
57 if (len > 0) {
58 UChar* r = (UChar* )xmalloc(len + 1);
59 CHECK_NULL_RETURN(r);
60 xmemcpy(r, s, len);
61 r[len] = (UChar )0;
62 return r;
63 }
64 else return NULL;
65 }
66
67 static void
swap_node(Node * a,Node * b)68 swap_node(Node* a, Node* b)
69 {
70 Node c;
71 c = *a; *a = *b; *b = c;
72
73 if (NTYPE(a) == NT_STR) {
74 StrNode* sn = NSTR(a);
75 if (sn->capa == 0) {
76 int len = sn->end - sn->s;
77 sn->s = sn->buf;
78 sn->end = sn->s + len;
79 }
80 }
81
82 if (NTYPE(b) == NT_STR) {
83 StrNode* sn = NSTR(b);
84 if (sn->capa == 0) {
85 int len = sn->end - sn->s;
86 sn->s = sn->buf;
87 sn->end = sn->s + len;
88 }
89 }
90 }
91
92 static OnigDistance
distance_add(OnigDistance d1,OnigDistance d2)93 distance_add(OnigDistance d1, OnigDistance d2)
94 {
95 if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
96 return ONIG_INFINITE_DISTANCE;
97 else {
98 if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
99 else return ONIG_INFINITE_DISTANCE;
100 }
101 }
102
103 static OnigDistance
distance_multiply(OnigDistance d,int m)104 distance_multiply(OnigDistance d, int m)
105 {
106 if (m == 0) return 0;
107
108 if (d < ONIG_INFINITE_DISTANCE / m)
109 return d * m;
110 else
111 return ONIG_INFINITE_DISTANCE;
112 }
113
114 static int
bitset_is_empty(BitSetRef bs)115 bitset_is_empty(BitSetRef bs)
116 {
117 int i;
118 for (i = 0; i < (int )BITSET_SIZE; i++) {
119 if (bs[i] != 0) return 0;
120 }
121 return 1;
122 }
123
124 #ifdef ONIG_DEBUG
125 static int
bitset_on_num(BitSetRef bs)126 bitset_on_num(BitSetRef bs)
127 {
128 int i, n;
129
130 n = 0;
131 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
132 if (BITSET_AT(bs, i)) n++;
133 }
134 return n;
135 }
136 #endif
137
138 extern int
onig_bbuf_init(BBuf * buf,int size)139 onig_bbuf_init(BBuf* buf, int size)
140 {
141 if (size <= 0) {
142 size = 0;
143 buf->p = NULL;
144 }
145 else {
146 buf->p = (UChar* )xmalloc(size);
147 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
148 }
149
150 buf->alloc = size;
151 buf->used = 0;
152 return 0;
153 }
154
155
156 #ifdef USE_SUBEXP_CALL
157
158 static int
unset_addr_list_init(UnsetAddrList * uslist,int size)159 unset_addr_list_init(UnsetAddrList* uslist, int size)
160 {
161 UnsetAddr* p;
162
163 p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
164 CHECK_NULL_RETURN_MEMERR(p);
165 uslist->num = 0;
166 uslist->alloc = size;
167 uslist->us = p;
168 return 0;
169 }
170
171 static void
unset_addr_list_end(UnsetAddrList * uslist)172 unset_addr_list_end(UnsetAddrList* uslist)
173 {
174 if (IS_NOT_NULL(uslist->us))
175 xfree(uslist->us);
176 }
177
178 static int
unset_addr_list_add(UnsetAddrList * uslist,int offset,struct _Node * node)179 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
180 {
181 UnsetAddr* p;
182 int size;
183
184 if (uslist->num >= uslist->alloc) {
185 size = uslist->alloc * 2;
186 p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
187 CHECK_NULL_RETURN_MEMERR(p);
188 uslist->alloc = size;
189 uslist->us = p;
190 }
191
192 uslist->us[uslist->num].offset = offset;
193 uslist->us[uslist->num].target = node;
194 uslist->num++;
195 return 0;
196 }
197 #endif /* USE_SUBEXP_CALL */
198
199
200 static int
add_opcode(regex_t * reg,int opcode)201 add_opcode(regex_t* reg, int opcode)
202 {
203 BBUF_ADD1(reg, opcode);
204 return 0;
205 }
206
207 #ifdef USE_COMBINATION_EXPLOSION_CHECK
208 static int
add_state_check_num(regex_t * reg,int num)209 add_state_check_num(regex_t* reg, int num)
210 {
211 StateCheckNumType n = (StateCheckNumType )num;
212
213 BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
214 return 0;
215 }
216 #endif
217
218 static int
add_rel_addr(regex_t * reg,int addr)219 add_rel_addr(regex_t* reg, int addr)
220 {
221 RelAddrType ra = (RelAddrType )addr;
222
223 BBUF_ADD(reg, &ra, SIZE_RELADDR);
224 return 0;
225 }
226
227 static int
add_abs_addr(regex_t * reg,int addr)228 add_abs_addr(regex_t* reg, int addr)
229 {
230 AbsAddrType ra = (AbsAddrType )addr;
231
232 BBUF_ADD(reg, &ra, SIZE_ABSADDR);
233 return 0;
234 }
235
236 static int
add_length(regex_t * reg,int len)237 add_length(regex_t* reg, int len)
238 {
239 LengthType l = (LengthType )len;
240
241 BBUF_ADD(reg, &l, SIZE_LENGTH);
242 return 0;
243 }
244
245 static int
add_mem_num(regex_t * reg,int num)246 add_mem_num(regex_t* reg, int num)
247 {
248 MemNumType n = (MemNumType )num;
249
250 BBUF_ADD(reg, &n, SIZE_MEMNUM);
251 return 0;
252 }
253
254 static int
add_pointer(regex_t * reg,void * addr)255 add_pointer(regex_t* reg, void* addr)
256 {
257 PointerType ptr = (PointerType )addr;
258
259 BBUF_ADD(reg, &ptr, SIZE_POINTER);
260 return 0;
261 }
262
263 static int
add_option(regex_t * reg,OnigOptionType option)264 add_option(regex_t* reg, OnigOptionType option)
265 {
266 BBUF_ADD(reg, &option, SIZE_OPTION);
267 return 0;
268 }
269
270 static int
add_opcode_rel_addr(regex_t * reg,int opcode,int addr)271 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
272 {
273 int r;
274
275 r = add_opcode(reg, opcode);
276 if (r) return r;
277 r = add_rel_addr(reg, addr);
278 return r;
279 }
280
281 static int
add_bytes(regex_t * reg,UChar * bytes,int len)282 add_bytes(regex_t* reg, UChar* bytes, int len)
283 {
284 BBUF_ADD(reg, bytes, len);
285 return 0;
286 }
287
288 static int
add_bitset(regex_t * reg,BitSetRef bs)289 add_bitset(regex_t* reg, BitSetRef bs)
290 {
291 BBUF_ADD(reg, bs, SIZE_BITSET);
292 return 0;
293 }
294
295 static int
add_opcode_option(regex_t * reg,int opcode,OnigOptionType option)296 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
297 {
298 int r;
299
300 r = add_opcode(reg, opcode);
301 if (r) return r;
302 r = add_option(reg, option);
303 return r;
304 }
305
306 static int compile_length_tree(Node* node, regex_t* reg);
307 static int compile_tree(Node* node, regex_t* reg);
308
309
310 #define IS_NEED_STR_LEN_OP_EXACT(op) \
311 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
312 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
313
314 static int
select_str_opcode(int mb_len,int str_len,int ignore_case)315 select_str_opcode(int mb_len, int str_len, int ignore_case)
316 {
317 int op;
318
319 if (ignore_case) {
320 switch (str_len) {
321 case 1: op = OP_EXACT1_IC; break;
322 default: op = OP_EXACTN_IC; break;
323 }
324 }
325 else {
326 switch (mb_len) {
327 case 1:
328 switch (str_len) {
329 case 1: op = OP_EXACT1; break;
330 case 2: op = OP_EXACT2; break;
331 case 3: op = OP_EXACT3; break;
332 case 4: op = OP_EXACT4; break;
333 case 5: op = OP_EXACT5; break;
334 default: op = OP_EXACTN; break;
335 }
336 break;
337
338 case 2:
339 switch (str_len) {
340 case 1: op = OP_EXACTMB2N1; break;
341 case 2: op = OP_EXACTMB2N2; break;
342 case 3: op = OP_EXACTMB2N3; break;
343 default: op = OP_EXACTMB2N; break;
344 }
345 break;
346
347 case 3:
348 op = OP_EXACTMB3N;
349 break;
350
351 default:
352 op = OP_EXACTMBN;
353 break;
354 }
355 }
356 return op;
357 }
358
359 static int
compile_tree_empty_check(Node * node,regex_t * reg,int empty_info)360 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
361 {
362 int r;
363 int saved_num_null_check = reg->num_null_check;
364
365 if (empty_info != 0) {
366 r = add_opcode(reg, OP_NULL_CHECK_START);
367 if (r) return r;
368 r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
369 if (r) return r;
370 reg->num_null_check++;
371 }
372
373 r = compile_tree(node, reg);
374 if (r) return r;
375
376 if (empty_info != 0) {
377 if (empty_info == NQ_TARGET_IS_EMPTY)
378 r = add_opcode(reg, OP_NULL_CHECK_END);
379 else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
380 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
381 else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
382 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
383
384 if (r) return r;
385 r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
386 }
387 return r;
388 }
389
390 #ifdef USE_SUBEXP_CALL
391 static int
compile_call(CallNode * node,regex_t * reg)392 compile_call(CallNode* node, regex_t* reg)
393 {
394 int r;
395
396 r = add_opcode(reg, OP_CALL);
397 if (r) return r;
398 r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
399 node->target);
400 if (r) return r;
401 r = add_abs_addr(reg, 0 /*dummy addr.*/);
402 return r;
403 }
404 #endif
405
406 static int
compile_tree_n_times(Node * node,int n,regex_t * reg)407 compile_tree_n_times(Node* node, int n, regex_t* reg)
408 {
409 int i, r;
410
411 for (i = 0; i < n; i++) {
412 r = compile_tree(node, reg);
413 if (r) return r;
414 }
415 return 0;
416 }
417
418 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED,int ignore_case)419 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
420 regex_t* reg ARG_UNUSED, int ignore_case)
421 {
422 int len;
423 int op = select_str_opcode(mb_len, str_len, ignore_case);
424
425 len = SIZE_OPCODE;
426
427 if (op == OP_EXACTMBN) len += SIZE_LENGTH;
428 if (IS_NEED_STR_LEN_OP_EXACT(op))
429 len += SIZE_LENGTH;
430
431 len += mb_len * str_len;
432 return len;
433 }
434
435 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg,int ignore_case)436 add_compile_string(UChar* s, int mb_len, int str_len,
437 regex_t* reg, int ignore_case)
438 {
439 int op = select_str_opcode(mb_len, str_len, ignore_case);
440 add_opcode(reg, op);
441
442 if (op == OP_EXACTMBN)
443 add_length(reg, mb_len);
444
445 if (IS_NEED_STR_LEN_OP_EXACT(op)) {
446 if (op == OP_EXACTN_IC)
447 add_length(reg, mb_len * str_len);
448 else
449 add_length(reg, str_len);
450 }
451
452 add_bytes(reg, s, mb_len * str_len);
453 return 0;
454 }
455
456
457 static int
compile_length_string_node(Node * node,regex_t * reg)458 compile_length_string_node(Node* node, regex_t* reg)
459 {
460 int rlen, r, len, prev_len, slen, ambig;
461 OnigEncoding enc = reg->enc;
462 UChar *p, *prev;
463 StrNode* sn;
464
465 sn = NSTR(node);
466 if (sn->end <= sn->s)
467 return 0;
468
469 ambig = NSTRING_IS_AMBIG(node);
470
471 p = prev = sn->s;
472 prev_len = enclen(enc, p);
473 p += prev_len;
474 slen = 1;
475 rlen = 0;
476
477 for (; p < sn->end; ) {
478 len = enclen(enc, p);
479 if (len == prev_len) {
480 slen++;
481 }
482 else {
483 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
484 rlen += r;
485 prev = p;
486 slen = 1;
487 prev_len = len;
488 }
489 p += len;
490 }
491 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
492 rlen += r;
493 return rlen;
494 }
495
496 static int
compile_length_string_raw_node(StrNode * sn,regex_t * reg)497 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
498 {
499 if (sn->end <= sn->s)
500 return 0;
501
502 return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
503 }
504
505 static int
compile_string_node(Node * node,regex_t * reg)506 compile_string_node(Node* node, regex_t* reg)
507 {
508 int r, len, prev_len, slen, ambig;
509 OnigEncoding enc = reg->enc;
510 UChar *p, *prev, *end;
511 StrNode* sn;
512
513 sn = NSTR(node);
514 if (sn->end <= sn->s)
515 return 0;
516
517 end = sn->end;
518 ambig = NSTRING_IS_AMBIG(node);
519
520 p = prev = sn->s;
521 prev_len = enclen(enc, p);
522 p += prev_len;
523 slen = 1;
524
525 for (; p < end; ) {
526 len = enclen(enc, p);
527 if (len == prev_len) {
528 slen++;
529 }
530 else {
531 r = add_compile_string(prev, prev_len, slen, reg, ambig);
532 if (r) return r;
533
534 prev = p;
535 slen = 1;
536 prev_len = len;
537 }
538
539 p += len;
540 }
541 return add_compile_string(prev, prev_len, slen, reg, ambig);
542 }
543
544 static int
compile_string_raw_node(StrNode * sn,regex_t * reg)545 compile_string_raw_node(StrNode* sn, regex_t* reg)
546 {
547 if (sn->end <= sn->s)
548 return 0;
549
550 return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
551 }
552
553 static int
add_multi_byte_cclass(BBuf * mbuf,regex_t * reg)554 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
555 {
556 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
557 add_length(reg, mbuf->used);
558 return add_bytes(reg, mbuf->p, mbuf->used);
559 #else
560 int r, pad_size;
561 UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
562
563 GET_ALIGNMENT_PAD_SIZE(p, pad_size);
564 add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
565 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
566
567 r = add_bytes(reg, mbuf->p, mbuf->used);
568
569 /* padding for return value from compile_length_cclass_node() to be fix. */
570 pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
571 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
572 return r;
573 #endif
574 }
575
576 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)577 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
578 {
579 int len;
580
581 if (IS_NCCLASS_SHARE(cc)) {
582 len = SIZE_OPCODE + SIZE_POINTER;
583 return len;
584 }
585
586 if (IS_NULL(cc->mbuf)) {
587 len = SIZE_OPCODE + SIZE_BITSET;
588 }
589 else {
590 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
591 len = SIZE_OPCODE;
592 }
593 else {
594 len = SIZE_OPCODE + SIZE_BITSET;
595 }
596 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
597 len += SIZE_LENGTH + cc->mbuf->used;
598 #else
599 len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
600 #endif
601 }
602
603 return len;
604 }
605
606 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)607 compile_cclass_node(CClassNode* cc, regex_t* reg)
608 {
609 int r;
610
611 if (IS_NCCLASS_SHARE(cc)) {
612 add_opcode(reg, OP_CCLASS_NODE);
613 r = add_pointer(reg, cc);
614 return r;
615 }
616
617 if (IS_NULL(cc->mbuf)) {
618 if (IS_NCCLASS_NOT(cc))
619 add_opcode(reg, OP_CCLASS_NOT);
620 else
621 add_opcode(reg, OP_CCLASS);
622
623 r = add_bitset(reg, cc->bs);
624 }
625 else {
626 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
627 if (IS_NCCLASS_NOT(cc))
628 add_opcode(reg, OP_CCLASS_MB_NOT);
629 else
630 add_opcode(reg, OP_CCLASS_MB);
631
632 r = add_multi_byte_cclass(cc->mbuf, reg);
633 }
634 else {
635 if (IS_NCCLASS_NOT(cc))
636 add_opcode(reg, OP_CCLASS_MIX_NOT);
637 else
638 add_opcode(reg, OP_CCLASS_MIX);
639
640 r = add_bitset(reg, cc->bs);
641 if (r) return r;
642 r = add_multi_byte_cclass(cc->mbuf, reg);
643 }
644 }
645
646 return r;
647 }
648
649 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper)650 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
651 {
652 #define REPEAT_RANGE_ALLOC 4
653
654 OnigRepeatRange* p;
655
656 if (reg->repeat_range_alloc == 0) {
657 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
658 CHECK_NULL_RETURN_MEMERR(p);
659 reg->repeat_range = p;
660 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
661 }
662 else if (reg->repeat_range_alloc <= id) {
663 int n;
664 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
665 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
666 sizeof(OnigRepeatRange) * n);
667 CHECK_NULL_RETURN_MEMERR(p);
668 reg->repeat_range = p;
669 reg->repeat_range_alloc = n;
670 }
671 else {
672 p = reg->repeat_range;
673 }
674
675 p[id].lower = lower;
676 p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
677 return 0;
678 }
679
680 static int
compile_range_repeat_node(QtfrNode * qn,int target_len,int empty_info,regex_t * reg)681 compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
682 regex_t* reg)
683 {
684 int r;
685 int num_repeat = reg->num_repeat;
686
687 r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
688 if (r) return r;
689 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
690 reg->num_repeat++;
691 if (r) return r;
692 r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
693 if (r) return r;
694
695 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
696 if (r) return r;
697
698 r = compile_tree_empty_check(qn->target, reg, empty_info);
699 if (r) return r;
700
701 if (
702 #ifdef USE_SUBEXP_CALL
703 reg->num_call > 0 ||
704 #endif
705 IS_QUANTIFIER_IN_REPEAT(qn)) {
706 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
707 }
708 else {
709 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
710 }
711 if (r) return r;
712 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
713 return r;
714 }
715
716 static int
is_anychar_star_quantifier(QtfrNode * qn)717 is_anychar_star_quantifier(QtfrNode* qn)
718 {
719 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
720 NTYPE(qn->target) == NT_CANY)
721 return 1;
722 else
723 return 0;
724 }
725
726 #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
727 #define CKN_ON (ckn > 0)
728
729 #ifdef USE_COMBINATION_EXPLOSION_CHECK
730
731 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)732 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
733 {
734 int len, mod_tlen, cklen;
735 int ckn;
736 int infinite = IS_REPEAT_INFINITE(qn->upper);
737 int empty_info = qn->target_empty_info;
738 int tlen = compile_length_tree(qn->target, reg);
739
740 if (tlen < 0) return tlen;
741
742 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
743
744 cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
745
746 /* anychar repeat */
747 if (NTYPE(qn->target) == NT_CANY) {
748 if (qn->greedy && infinite) {
749 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
750 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
751 else
752 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
753 }
754 }
755
756 if (empty_info != 0)
757 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
758 else
759 mod_tlen = tlen;
760
761 if (infinite && qn->lower <= 1) {
762 if (qn->greedy) {
763 if (qn->lower == 1)
764 len = SIZE_OP_JUMP;
765 else
766 len = 0;
767
768 len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
769 }
770 else {
771 if (qn->lower == 0)
772 len = SIZE_OP_JUMP;
773 else
774 len = 0;
775
776 len += mod_tlen + SIZE_OP_PUSH + cklen;
777 }
778 }
779 else if (qn->upper == 0) {
780 if (qn->is_refered != 0) /* /(?<n>..){0}/ */
781 len = SIZE_OP_JUMP + tlen;
782 else
783 len = 0;
784 }
785 else if (qn->upper == 1 && qn->greedy) {
786 if (qn->lower == 0) {
787 if (CKN_ON) {
788 len = SIZE_OP_STATE_CHECK_PUSH + tlen;
789 }
790 else {
791 len = SIZE_OP_PUSH + tlen;
792 }
793 }
794 else {
795 len = tlen;
796 }
797 }
798 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
799 len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
800 }
801 else {
802 len = SIZE_OP_REPEAT_INC
803 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
804 if (CKN_ON)
805 len += SIZE_OP_STATE_CHECK;
806 }
807
808 return len;
809 }
810
811 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)812 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
813 {
814 int r, mod_tlen;
815 int ckn;
816 int infinite = IS_REPEAT_INFINITE(qn->upper);
817 int empty_info = qn->target_empty_info;
818 int tlen = compile_length_tree(qn->target, reg);
819
820 if (tlen < 0) return tlen;
821
822 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
823
824 if (is_anychar_star_quantifier(qn)) {
825 r = compile_tree_n_times(qn->target, qn->lower, reg);
826 if (r) return r;
827 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
828 if (IS_MULTILINE(reg->options))
829 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
830 else
831 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
832 if (r) return r;
833 if (CKN_ON) {
834 r = add_state_check_num(reg, ckn);
835 if (r) return r;
836 }
837
838 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
839 }
840 else {
841 if (IS_MULTILINE(reg->options)) {
842 r = add_opcode(reg, (CKN_ON ?
843 OP_STATE_CHECK_ANYCHAR_ML_STAR
844 : OP_ANYCHAR_ML_STAR));
845 }
846 else {
847 r = add_opcode(reg, (CKN_ON ?
848 OP_STATE_CHECK_ANYCHAR_STAR
849 : OP_ANYCHAR_STAR));
850 }
851 if (r) return r;
852 if (CKN_ON)
853 r = add_state_check_num(reg, ckn);
854
855 return r;
856 }
857 }
858
859 if (empty_info != 0)
860 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
861 else
862 mod_tlen = tlen;
863
864 if (infinite && qn->lower <= 1) {
865 if (qn->greedy) {
866 if (qn->lower == 1) {
867 r = add_opcode_rel_addr(reg, OP_JUMP,
868 (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
869 if (r) return r;
870 }
871
872 if (CKN_ON) {
873 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
874 if (r) return r;
875 r = add_state_check_num(reg, ckn);
876 if (r) return r;
877 r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
878 }
879 else {
880 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
881 }
882 if (r) return r;
883 r = compile_tree_empty_check(qn->target, reg, empty_info);
884 if (r) return r;
885 r = add_opcode_rel_addr(reg, OP_JUMP,
886 -(mod_tlen + (int )SIZE_OP_JUMP
887 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
888 }
889 else {
890 if (qn->lower == 0) {
891 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
892 if (r) return r;
893 }
894 r = compile_tree_empty_check(qn->target, reg, empty_info);
895 if (r) return r;
896 if (CKN_ON) {
897 r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
898 if (r) return r;
899 r = add_state_check_num(reg, ckn);
900 if (r) return r;
901 r = add_rel_addr(reg,
902 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
903 }
904 else
905 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
906 }
907 }
908 else if (qn->upper == 0) {
909 if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
910 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
911 if (r) return r;
912 r = compile_tree(qn->target, reg);
913 }
914 else
915 r = 0;
916 }
917 else if (qn->upper == 1 && qn->greedy) {
918 if (qn->lower == 0) {
919 if (CKN_ON) {
920 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
921 if (r) return r;
922 r = add_state_check_num(reg, ckn);
923 if (r) return r;
924 r = add_rel_addr(reg, tlen);
925 }
926 else {
927 r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
928 }
929 if (r) return r;
930 }
931
932 r = compile_tree(qn->target, reg);
933 }
934 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
935 if (CKN_ON) {
936 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
937 if (r) return r;
938 r = add_state_check_num(reg, ckn);
939 if (r) return r;
940 r = add_rel_addr(reg, SIZE_OP_JUMP);
941 }
942 else {
943 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
944 }
945
946 if (r) return r;
947 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
948 if (r) return r;
949 r = compile_tree(qn->target, reg);
950 }
951 else {
952 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
953 if (CKN_ON) {
954 if (r) return r;
955 r = add_opcode(reg, OP_STATE_CHECK);
956 if (r) return r;
957 r = add_state_check_num(reg, ckn);
958 }
959 }
960 return r;
961 }
962
963 #else /* USE_COMBINATION_EXPLOSION_CHECK */
964
965 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)966 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
967 {
968 int len, mod_tlen;
969 int infinite = IS_REPEAT_INFINITE(qn->upper);
970 int empty_info = qn->target_empty_info;
971 int tlen = compile_length_tree(qn->target, reg);
972
973 if (tlen < 0) return tlen;
974
975 /* anychar repeat */
976 if (NTYPE(qn->target) == NT_CANY) {
977 if (qn->greedy && infinite) {
978 if (IS_NOT_NULL(qn->next_head_exact))
979 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
980 else
981 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
982 }
983 }
984
985 if (empty_info != 0)
986 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
987 else
988 mod_tlen = tlen;
989
990 if (infinite &&
991 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
992 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
993 len = SIZE_OP_JUMP;
994 }
995 else {
996 len = tlen * qn->lower;
997 }
998
999 if (qn->greedy) {
1000 if (IS_NOT_NULL(qn->head_exact))
1001 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1002 else if (IS_NOT_NULL(qn->next_head_exact))
1003 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1004 else
1005 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1006 }
1007 else
1008 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1009 }
1010 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1011 len = SIZE_OP_JUMP + tlen;
1012 }
1013 else if (!infinite && qn->greedy &&
1014 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1015 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1016 len = tlen * qn->lower;
1017 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1018 }
1019 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1020 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1021 }
1022 else {
1023 len = SIZE_OP_REPEAT_INC
1024 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1025 }
1026
1027 return len;
1028 }
1029
1030 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)1031 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1032 {
1033 int i, r, mod_tlen;
1034 int infinite = IS_REPEAT_INFINITE(qn->upper);
1035 int empty_info = qn->target_empty_info;
1036 int tlen = compile_length_tree(qn->target, reg);
1037
1038 if (tlen < 0) return tlen;
1039
1040 if (is_anychar_star_quantifier(qn)) {
1041 r = compile_tree_n_times(qn->target, qn->lower, reg);
1042 if (r) return r;
1043 if (IS_NOT_NULL(qn->next_head_exact)) {
1044 if (IS_MULTILINE(reg->options))
1045 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1046 else
1047 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1048 if (r) return r;
1049 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1050 }
1051 else {
1052 if (IS_MULTILINE(reg->options))
1053 return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1054 else
1055 return add_opcode(reg, OP_ANYCHAR_STAR);
1056 }
1057 }
1058
1059 if (empty_info != 0)
1060 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1061 else
1062 mod_tlen = tlen;
1063
1064 if (infinite &&
1065 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1066 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1067 if (qn->greedy) {
1068 if (IS_NOT_NULL(qn->head_exact))
1069 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1070 else if (IS_NOT_NULL(qn->next_head_exact))
1071 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1072 else
1073 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1074 }
1075 else {
1076 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1077 }
1078 if (r) return r;
1079 }
1080 else {
1081 r = compile_tree_n_times(qn->target, qn->lower, reg);
1082 if (r) return r;
1083 }
1084
1085 if (qn->greedy) {
1086 if (IS_NOT_NULL(qn->head_exact)) {
1087 r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1088 mod_tlen + SIZE_OP_JUMP);
1089 if (r) return r;
1090 add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1091 r = compile_tree_empty_check(qn->target, reg, empty_info);
1092 if (r) return r;
1093 r = add_opcode_rel_addr(reg, OP_JUMP,
1094 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1095 }
1096 else if (IS_NOT_NULL(qn->next_head_exact)) {
1097 r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1098 mod_tlen + SIZE_OP_JUMP);
1099 if (r) return r;
1100 add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1101 r = compile_tree_empty_check(qn->target, reg, empty_info);
1102 if (r) return r;
1103 r = add_opcode_rel_addr(reg, OP_JUMP,
1104 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1105 }
1106 else {
1107 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1108 if (r) return r;
1109 r = compile_tree_empty_check(qn->target, reg, empty_info);
1110 if (r) return r;
1111 r = add_opcode_rel_addr(reg, OP_JUMP,
1112 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1113 }
1114 }
1115 else {
1116 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1117 if (r) return r;
1118 r = compile_tree_empty_check(qn->target, reg, empty_info);
1119 if (r) return r;
1120 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1121 }
1122 }
1123 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1124 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1125 if (r) return r;
1126 r = compile_tree(qn->target, reg);
1127 }
1128 else if (!infinite && qn->greedy &&
1129 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1130 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1131 int n = qn->upper - qn->lower;
1132
1133 r = compile_tree_n_times(qn->target, qn->lower, reg);
1134 if (r) return r;
1135
1136 for (i = 0; i < n; i++) {
1137 r = add_opcode_rel_addr(reg, OP_PUSH,
1138 (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1139 if (r) return r;
1140 r = compile_tree(qn->target, reg);
1141 if (r) return r;
1142 }
1143 }
1144 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1145 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1146 if (r) return r;
1147 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1148 if (r) return r;
1149 r = compile_tree(qn->target, reg);
1150 }
1151 else {
1152 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1153 }
1154 return r;
1155 }
1156 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1157
1158 static int
compile_length_option_node(EncloseNode * node,regex_t * reg)1159 compile_length_option_node(EncloseNode* node, regex_t* reg)
1160 {
1161 int tlen;
1162 OnigOptionType prev = reg->options;
1163
1164 reg->options = node->option;
1165 tlen = compile_length_tree(node->target, reg);
1166 reg->options = prev;
1167
1168 if (tlen < 0) return tlen;
1169
1170 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1171 return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1172 + tlen + SIZE_OP_SET_OPTION;
1173 }
1174 else
1175 return tlen;
1176 }
1177
1178 static int
compile_option_node(EncloseNode * node,regex_t * reg)1179 compile_option_node(EncloseNode* node, regex_t* reg)
1180 {
1181 int r;
1182 OnigOptionType prev = reg->options;
1183
1184 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1185 r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1186 if (r) return r;
1187 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1188 if (r) return r;
1189 r = add_opcode(reg, OP_FAIL);
1190 if (r) return r;
1191 }
1192
1193 reg->options = node->option;
1194 r = compile_tree(node->target, reg);
1195 reg->options = prev;
1196
1197 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1198 if (r) return r;
1199 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1200 }
1201 return r;
1202 }
1203
1204 static int
compile_length_enclose_node(EncloseNode * node,regex_t * reg)1205 compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1206 {
1207 int len;
1208 int tlen;
1209
1210 if (node->type == ENCLOSE_OPTION)
1211 return compile_length_option_node(node, reg);
1212
1213 if (node->target) {
1214 tlen = compile_length_tree(node->target, reg);
1215 if (tlen < 0) return tlen;
1216 }
1217 else
1218 tlen = 0;
1219
1220 switch (node->type) {
1221 case ENCLOSE_MEMORY:
1222 #ifdef USE_SUBEXP_CALL
1223 if (IS_ENCLOSE_CALLED(node)) {
1224 len = SIZE_OP_MEMORY_START_PUSH + tlen
1225 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1226 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1227 len += (IS_ENCLOSE_RECURSION(node)
1228 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1229 else
1230 len += (IS_ENCLOSE_RECURSION(node)
1231 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1232 }
1233 else
1234 #endif
1235 {
1236 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1237 len = SIZE_OP_MEMORY_START_PUSH;
1238 else
1239 len = SIZE_OP_MEMORY_START;
1240
1241 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1242 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1243 }
1244 break;
1245
1246 case ENCLOSE_STOP_BACKTRACK:
1247 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1248 QtfrNode* qn = NQTFR(node->target);
1249 tlen = compile_length_tree(qn->target, reg);
1250 if (tlen < 0) return tlen;
1251
1252 len = tlen * qn->lower
1253 + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1254 }
1255 else {
1256 len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1257 }
1258 break;
1259
1260 default:
1261 return ONIGERR_TYPE_BUG;
1262 break;
1263 }
1264
1265 return len;
1266 }
1267
1268 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1269
1270 static int
compile_enclose_node(EncloseNode * node,regex_t * reg)1271 compile_enclose_node(EncloseNode* node, regex_t* reg)
1272 {
1273 int r, len;
1274
1275 if (node->type == ENCLOSE_OPTION)
1276 return compile_option_node(node, reg);
1277
1278 switch (node->type) {
1279 case ENCLOSE_MEMORY:
1280 #ifdef USE_SUBEXP_CALL
1281 if (IS_ENCLOSE_CALLED(node)) {
1282 r = add_opcode(reg, OP_CALL);
1283 if (r) return r;
1284 node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1285 node->state |= NST_ADDR_FIXED;
1286 r = add_abs_addr(reg, (int )node->call_addr);
1287 if (r) return r;
1288 len = compile_length_tree(node->target, reg);
1289 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1290 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1291 len += (IS_ENCLOSE_RECURSION(node)
1292 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1293 else
1294 len += (IS_ENCLOSE_RECURSION(node)
1295 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1296
1297 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1298 if (r) return r;
1299 }
1300 #endif
1301 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1302 r = add_opcode(reg, OP_MEMORY_START_PUSH);
1303 else
1304 r = add_opcode(reg, OP_MEMORY_START);
1305 if (r) return r;
1306 r = add_mem_num(reg, node->regnum);
1307 if (r) return r;
1308 r = compile_tree(node->target, reg);
1309 if (r) return r;
1310 #ifdef USE_SUBEXP_CALL
1311 if (IS_ENCLOSE_CALLED(node)) {
1312 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1313 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1314 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1315 else
1316 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1317 ? OP_MEMORY_END_REC : OP_MEMORY_END));
1318
1319 if (r) return r;
1320 r = add_mem_num(reg, node->regnum);
1321 if (r) return r;
1322 r = add_opcode(reg, OP_RETURN);
1323 }
1324 else
1325 #endif
1326 {
1327 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1328 r = add_opcode(reg, OP_MEMORY_END_PUSH);
1329 else
1330 r = add_opcode(reg, OP_MEMORY_END);
1331 if (r) return r;
1332 r = add_mem_num(reg, node->regnum);
1333 }
1334 break;
1335
1336 case ENCLOSE_STOP_BACKTRACK:
1337 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1338 QtfrNode* qn = NQTFR(node->target);
1339 r = compile_tree_n_times(qn->target, qn->lower, reg);
1340 if (r) return r;
1341
1342 len = compile_length_tree(qn->target, reg);
1343 if (len < 0) return len;
1344
1345 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1346 if (r) return r;
1347 r = compile_tree(qn->target, reg);
1348 if (r) return r;
1349 r = add_opcode(reg, OP_POP);
1350 if (r) return r;
1351 r = add_opcode_rel_addr(reg, OP_JUMP,
1352 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1353 }
1354 else {
1355 r = add_opcode(reg, OP_PUSH_STOP_BT);
1356 if (r) return r;
1357 r = compile_tree(node->target, reg);
1358 if (r) return r;
1359 r = add_opcode(reg, OP_POP_STOP_BT);
1360 }
1361 break;
1362
1363 default:
1364 return ONIGERR_TYPE_BUG;
1365 break;
1366 }
1367
1368 return r;
1369 }
1370
1371 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1372 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1373 {
1374 int len;
1375 int tlen = 0;
1376
1377 if (node->target) {
1378 tlen = compile_length_tree(node->target, reg);
1379 if (tlen < 0) return tlen;
1380 }
1381
1382 switch (node->type) {
1383 case ANCHOR_PREC_READ:
1384 len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1385 break;
1386 case ANCHOR_PREC_READ_NOT:
1387 len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1388 break;
1389 case ANCHOR_LOOK_BEHIND:
1390 len = SIZE_OP_LOOK_BEHIND + tlen;
1391 break;
1392 case ANCHOR_LOOK_BEHIND_NOT:
1393 len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1394 break;
1395
1396 default:
1397 len = SIZE_OPCODE;
1398 break;
1399 }
1400
1401 return len;
1402 }
1403
1404 static int
compile_anchor_node(AnchorNode * node,regex_t * reg)1405 compile_anchor_node(AnchorNode* node, regex_t* reg)
1406 {
1407 int r, len;
1408
1409 switch (node->type) {
1410 case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1411 case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1412 case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1413 case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1414 case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1415 case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1416
1417 case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break;
1418 case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
1419 #ifdef USE_WORD_BEGIN_END
1420 case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break;
1421 case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break;
1422 #endif
1423
1424 case ANCHOR_PREC_READ:
1425 r = add_opcode(reg, OP_PUSH_POS);
1426 if (r) return r;
1427 r = compile_tree(node->target, reg);
1428 if (r) return r;
1429 r = add_opcode(reg, OP_POP_POS);
1430 break;
1431
1432 case ANCHOR_PREC_READ_NOT:
1433 len = compile_length_tree(node->target, reg);
1434 if (len < 0) return len;
1435 r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1436 if (r) return r;
1437 r = compile_tree(node->target, reg);
1438 if (r) return r;
1439 r = add_opcode(reg, OP_FAIL_POS);
1440 break;
1441
1442 case ANCHOR_LOOK_BEHIND:
1443 {
1444 int n;
1445 r = add_opcode(reg, OP_LOOK_BEHIND);
1446 if (r) return r;
1447 if (node->char_len < 0) {
1448 r = get_char_length_tree(node->target, reg, &n);
1449 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1450 }
1451 else
1452 n = node->char_len;
1453 r = add_length(reg, n);
1454 if (r) return r;
1455 r = compile_tree(node->target, reg);
1456 }
1457 break;
1458
1459 case ANCHOR_LOOK_BEHIND_NOT:
1460 {
1461 int n;
1462 len = compile_length_tree(node->target, reg);
1463 r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1464 len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1465 if (r) return r;
1466 if (node->char_len < 0) {
1467 r = get_char_length_tree(node->target, reg, &n);
1468 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1469 }
1470 else
1471 n = node->char_len;
1472 r = add_length(reg, n);
1473 if (r) return r;
1474 r = compile_tree(node->target, reg);
1475 if (r) return r;
1476 r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1477 }
1478 break;
1479
1480 default:
1481 return ONIGERR_TYPE_BUG;
1482 break;
1483 }
1484
1485 return r;
1486 }
1487
1488 static int
compile_length_tree(Node * node,regex_t * reg)1489 compile_length_tree(Node* node, regex_t* reg)
1490 {
1491 int len, type, r;
1492
1493 type = NTYPE(node);
1494 switch (type) {
1495 case NT_LIST:
1496 len = 0;
1497 do {
1498 r = compile_length_tree(NCAR(node), reg);
1499 if (r < 0) return r;
1500 len += r;
1501 } while (IS_NOT_NULL(node = NCDR(node)));
1502 r = len;
1503 break;
1504
1505 case NT_ALT:
1506 {
1507 int n;
1508
1509 n = r = 0;
1510 do {
1511 r += compile_length_tree(NCAR(node), reg);
1512 n++;
1513 } while (IS_NOT_NULL(node = NCDR(node)));
1514 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1515 }
1516 break;
1517
1518 case NT_STR:
1519 if (NSTRING_IS_RAW(node))
1520 r = compile_length_string_raw_node(NSTR(node), reg);
1521 else
1522 r = compile_length_string_node(node, reg);
1523 break;
1524
1525 case NT_CCLASS:
1526 r = compile_length_cclass_node(NCCLASS(node), reg);
1527 break;
1528
1529 case NT_CTYPE:
1530 case NT_CANY:
1531 r = SIZE_OPCODE;
1532 break;
1533
1534 case NT_BREF:
1535 {
1536 BRefNode* br = NBREF(node);
1537
1538 #ifdef USE_BACKREF_WITH_LEVEL
1539 if (IS_BACKREF_NEST_LEVEL(br)) {
1540 r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1541 SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1542 }
1543 else
1544 #endif
1545 if (br->back_num == 1) {
1546 r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1547 ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1548 }
1549 else {
1550 r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1551 }
1552 }
1553 break;
1554
1555 #ifdef USE_SUBEXP_CALL
1556 case NT_CALL:
1557 r = SIZE_OP_CALL;
1558 break;
1559 #endif
1560
1561 case NT_QTFR:
1562 r = compile_length_quantifier_node(NQTFR(node), reg);
1563 break;
1564
1565 case NT_ENCLOSE:
1566 r = compile_length_enclose_node(NENCLOSE(node), reg);
1567 break;
1568
1569 case NT_ANCHOR:
1570 r = compile_length_anchor_node(NANCHOR(node), reg);
1571 break;
1572
1573 default:
1574 return ONIGERR_TYPE_BUG;
1575 break;
1576 }
1577
1578 return r;
1579 }
1580
1581 static int
compile_tree(Node * node,regex_t * reg)1582 compile_tree(Node* node, regex_t* reg)
1583 {
1584 int n, type, len, pos, r = 0;
1585
1586 type = NTYPE(node);
1587 switch (type) {
1588 case NT_LIST:
1589 do {
1590 r = compile_tree(NCAR(node), reg);
1591 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1592 break;
1593
1594 case NT_ALT:
1595 {
1596 Node* x = node;
1597 len = 0;
1598 do {
1599 len += compile_length_tree(NCAR(x), reg);
1600 if (NCDR(x) != NULL) {
1601 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1602 }
1603 } while (IS_NOT_NULL(x = NCDR(x)));
1604 pos = reg->used + len; /* goal position */
1605
1606 do {
1607 len = compile_length_tree(NCAR(node), reg);
1608 if (IS_NOT_NULL(NCDR(node))) {
1609 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1610 if (r) break;
1611 }
1612 r = compile_tree(NCAR(node), reg);
1613 if (r) break;
1614 if (IS_NOT_NULL(NCDR(node))) {
1615 len = pos - (reg->used + SIZE_OP_JUMP);
1616 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1617 if (r) break;
1618 }
1619 } while (IS_NOT_NULL(node = NCDR(node)));
1620 }
1621 break;
1622
1623 case NT_STR:
1624 if (NSTRING_IS_RAW(node))
1625 r = compile_string_raw_node(NSTR(node), reg);
1626 else
1627 r = compile_string_node(node, reg);
1628 break;
1629
1630 case NT_CCLASS:
1631 r = compile_cclass_node(NCCLASS(node), reg);
1632 break;
1633
1634 case NT_CTYPE:
1635 {
1636 int op;
1637
1638 switch (NCTYPE(node)->ctype) {
1639 case ONIGENC_CTYPE_WORD:
1640 if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
1641 else op = OP_WORD;
1642 break;
1643 default:
1644 return ONIGERR_TYPE_BUG;
1645 break;
1646 }
1647 r = add_opcode(reg, op);
1648 }
1649 break;
1650
1651 case NT_CANY:
1652 if (IS_MULTILINE(reg->options))
1653 r = add_opcode(reg, OP_ANYCHAR_ML);
1654 else
1655 r = add_opcode(reg, OP_ANYCHAR);
1656 break;
1657
1658 case NT_BREF:
1659 {
1660 BRefNode* br = NBREF(node);
1661
1662 #ifdef USE_BACKREF_WITH_LEVEL
1663 if (IS_BACKREF_NEST_LEVEL(br)) {
1664 r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1665 if (r) return r;
1666 r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1667 if (r) return r;
1668 r = add_length(reg, br->nest_level);
1669 if (r) return r;
1670
1671 goto add_bacref_mems;
1672 }
1673 else
1674 #endif
1675 if (br->back_num == 1) {
1676 n = br->back_static[0];
1677 if (IS_IGNORECASE(reg->options)) {
1678 r = add_opcode(reg, OP_BACKREFN_IC);
1679 if (r) return r;
1680 r = add_mem_num(reg, n);
1681 }
1682 else {
1683 switch (n) {
1684 case 1: r = add_opcode(reg, OP_BACKREF1); break;
1685 case 2: r = add_opcode(reg, OP_BACKREF2); break;
1686 default:
1687 r = add_opcode(reg, OP_BACKREFN);
1688 if (r) return r;
1689 r = add_mem_num(reg, n);
1690 break;
1691 }
1692 }
1693 }
1694 else {
1695 int i;
1696 int* p;
1697
1698 if (IS_IGNORECASE(reg->options)) {
1699 r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1700 }
1701 else {
1702 r = add_opcode(reg, OP_BACKREF_MULTI);
1703 }
1704 if (r) return r;
1705
1706 #ifdef USE_BACKREF_WITH_LEVEL
1707 add_bacref_mems:
1708 #endif
1709 r = add_length(reg, br->back_num);
1710 if (r) return r;
1711 p = BACKREFS_P(br);
1712 for (i = br->back_num - 1; i >= 0; i--) {
1713 r = add_mem_num(reg, p[i]);
1714 if (r) return r;
1715 }
1716 }
1717 }
1718 break;
1719
1720 #ifdef USE_SUBEXP_CALL
1721 case NT_CALL:
1722 r = compile_call(NCALL(node), reg);
1723 break;
1724 #endif
1725
1726 case NT_QTFR:
1727 r = compile_quantifier_node(NQTFR(node), reg);
1728 break;
1729
1730 case NT_ENCLOSE:
1731 r = compile_enclose_node(NENCLOSE(node), reg);
1732 break;
1733
1734 case NT_ANCHOR:
1735 r = compile_anchor_node(NANCHOR(node), reg);
1736 break;
1737
1738 default:
1739 #ifdef ONIG_DEBUG
1740 fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1741 #endif
1742 break;
1743 }
1744
1745 return r;
1746 }
1747
1748 #ifdef USE_NAMED_GROUP
1749
1750 static int
noname_disable_map(Node ** plink,GroupNumRemap * map,int * counter)1751 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1752 {
1753 int r = 0;
1754 Node* node = *plink;
1755
1756 switch (NTYPE(node)) {
1757 case NT_LIST:
1758 case NT_ALT:
1759 do {
1760 r = noname_disable_map(&(NCAR(node)), map, counter);
1761 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1762 break;
1763
1764 case NT_QTFR:
1765 {
1766 Node** ptarget = &(NQTFR(node)->target);
1767 Node* old = *ptarget;
1768 r = noname_disable_map(ptarget, map, counter);
1769 if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1770 onig_reduce_nested_quantifier(node, *ptarget);
1771 }
1772 }
1773 break;
1774
1775 case NT_ENCLOSE:
1776 {
1777 EncloseNode* en = NENCLOSE(node);
1778 if (en->type == ENCLOSE_MEMORY) {
1779 if (IS_ENCLOSE_NAMED_GROUP(en)) {
1780 (*counter)++;
1781 map[en->regnum].new_val = *counter;
1782 en->regnum = *counter;
1783 r = noname_disable_map(&(en->target), map, counter);
1784 }
1785 else {
1786 *plink = en->target;
1787 en->target = NULL_NODE;
1788 onig_node_free(node);
1789 r = noname_disable_map(plink, map, counter);
1790 }
1791 }
1792 else
1793 r = noname_disable_map(&(en->target), map, counter);
1794 }
1795 break;
1796
1797 case NT_ANCHOR:
1798 {
1799 AnchorNode* an = NANCHOR(node);
1800 switch (an->type) {
1801 case ANCHOR_PREC_READ:
1802 case ANCHOR_PREC_READ_NOT:
1803 case ANCHOR_LOOK_BEHIND:
1804 case ANCHOR_LOOK_BEHIND_NOT:
1805 r = noname_disable_map(&(an->target), map, counter);
1806 break;
1807 }
1808 }
1809 break;
1810
1811 default:
1812 break;
1813 }
1814
1815 return r;
1816 }
1817
1818 static int
renumber_node_backref(Node * node,GroupNumRemap * map)1819 renumber_node_backref(Node* node, GroupNumRemap* map)
1820 {
1821 int i, pos, n, old_num;
1822 int *backs;
1823 BRefNode* bn = NBREF(node);
1824
1825 if (! IS_BACKREF_NAME_REF(bn))
1826 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1827
1828 old_num = bn->back_num;
1829 if (IS_NULL(bn->back_dynamic))
1830 backs = bn->back_static;
1831 else
1832 backs = bn->back_dynamic;
1833
1834 for (i = 0, pos = 0; i < old_num; i++) {
1835 n = map[backs[i]].new_val;
1836 if (n > 0) {
1837 backs[pos] = n;
1838 pos++;
1839 }
1840 }
1841
1842 bn->back_num = pos;
1843 return 0;
1844 }
1845
1846 static int
renumber_by_map(Node * node,GroupNumRemap * map)1847 renumber_by_map(Node* node, GroupNumRemap* map)
1848 {
1849 int r = 0;
1850
1851 switch (NTYPE(node)) {
1852 case NT_LIST:
1853 case NT_ALT:
1854 do {
1855 r = renumber_by_map(NCAR(node), map);
1856 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1857 break;
1858 case NT_QTFR:
1859 r = renumber_by_map(NQTFR(node)->target, map);
1860 break;
1861 case NT_ENCLOSE:
1862 r = renumber_by_map(NENCLOSE(node)->target, map);
1863 break;
1864
1865 case NT_BREF:
1866 r = renumber_node_backref(node, map);
1867 break;
1868
1869 case NT_ANCHOR:
1870 {
1871 AnchorNode* an = NANCHOR(node);
1872 switch (an->type) {
1873 case ANCHOR_PREC_READ:
1874 case ANCHOR_PREC_READ_NOT:
1875 case ANCHOR_LOOK_BEHIND:
1876 case ANCHOR_LOOK_BEHIND_NOT:
1877 r = renumber_by_map(an->target, map);
1878 break;
1879 }
1880 }
1881 break;
1882
1883 default:
1884 break;
1885 }
1886
1887 return r;
1888 }
1889
1890 static int
numbered_ref_check(Node * node)1891 numbered_ref_check(Node* node)
1892 {
1893 int r = 0;
1894
1895 switch (NTYPE(node)) {
1896 case NT_LIST:
1897 case NT_ALT:
1898 do {
1899 r = numbered_ref_check(NCAR(node));
1900 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1901 break;
1902 case NT_QTFR:
1903 r = numbered_ref_check(NQTFR(node)->target);
1904 break;
1905 case NT_ENCLOSE:
1906 r = numbered_ref_check(NENCLOSE(node)->target);
1907 break;
1908
1909 case NT_BREF:
1910 if (! IS_BACKREF_NAME_REF(NBREF(node)))
1911 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1912 break;
1913
1914 default:
1915 break;
1916 }
1917
1918 return r;
1919 }
1920
1921 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)1922 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
1923 {
1924 int r, i, pos, counter;
1925 BitStatusType loc;
1926 GroupNumRemap* map;
1927
1928 map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
1929 CHECK_NULL_RETURN_MEMERR(map);
1930 for (i = 1; i <= env->num_mem; i++) {
1931 map[i].new_val = 0;
1932 }
1933 counter = 0;
1934 r = noname_disable_map(root, map, &counter);
1935 if (r != 0) return r;
1936
1937 r = renumber_by_map(*root, map);
1938 if (r != 0) return r;
1939
1940 for (i = 1, pos = 1; i <= env->num_mem; i++) {
1941 if (map[i].new_val > 0) {
1942 SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
1943 pos++;
1944 }
1945 }
1946
1947 loc = env->capture_history;
1948 BIT_STATUS_CLEAR(env->capture_history);
1949 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
1950 if (BIT_STATUS_AT(loc, i)) {
1951 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
1952 }
1953 }
1954
1955 env->num_mem = env->num_named;
1956 reg->num_mem = env->num_named;
1957
1958 return onig_renumber_name_table(reg, map);
1959 }
1960 #endif /* USE_NAMED_GROUP */
1961
1962 #ifdef USE_SUBEXP_CALL
1963 static int
unset_addr_list_fix(UnsetAddrList * uslist,regex_t * reg)1964 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
1965 {
1966 int i, offset;
1967 EncloseNode* en;
1968 AbsAddrType addr;
1969
1970 for (i = 0; i < uslist->num; i++) {
1971 en = NENCLOSE(uslist->us[i].target);
1972 if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
1973 addr = en->call_addr;
1974 offset = uslist->us[i].offset;
1975
1976 BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
1977 }
1978 return 0;
1979 }
1980 #endif
1981
1982 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
1983 static int
quantifiers_memory_node_info(Node * node)1984 quantifiers_memory_node_info(Node* node)
1985 {
1986 int r = 0;
1987
1988 switch (NTYPE(node)) {
1989 case NT_LIST:
1990 case NT_ALT:
1991 {
1992 int v;
1993 do {
1994 v = quantifiers_memory_node_info(NCAR(node));
1995 if (v > r) r = v;
1996 } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
1997 }
1998 break;
1999
2000 #ifdef USE_SUBEXP_CALL
2001 case NT_CALL:
2002 if (IS_CALL_RECURSION(NCALL(node))) {
2003 return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
2004 }
2005 else
2006 r = quantifiers_memory_node_info(NCALL(node)->target);
2007 break;
2008 #endif
2009
2010 case NT_QTFR:
2011 {
2012 QtfrNode* qn = NQTFR(node);
2013 if (qn->upper != 0) {
2014 r = quantifiers_memory_node_info(qn->target);
2015 }
2016 }
2017 break;
2018
2019 case NT_ENCLOSE:
2020 {
2021 EncloseNode* en = NENCLOSE(node);
2022 switch (en->type) {
2023 case ENCLOSE_MEMORY:
2024 return NQ_TARGET_IS_EMPTY_MEM;
2025 break;
2026
2027 case ENCLOSE_OPTION:
2028 case ENCLOSE_STOP_BACKTRACK:
2029 r = quantifiers_memory_node_info(en->target);
2030 break;
2031 default:
2032 break;
2033 }
2034 }
2035 break;
2036
2037 case NT_BREF:
2038 case NT_STR:
2039 case NT_CTYPE:
2040 case NT_CCLASS:
2041 case NT_CANY:
2042 case NT_ANCHOR:
2043 default:
2044 break;
2045 }
2046
2047 return r;
2048 }
2049 #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2050
2051 static int
get_min_match_length(Node * node,OnigDistance * min,ScanEnv * env)2052 get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2053 {
2054 OnigDistance tmin;
2055 int r = 0;
2056
2057 *min = 0;
2058 switch (NTYPE(node)) {
2059 case NT_BREF:
2060 {
2061 int i;
2062 int* backs;
2063 Node** nodes = SCANENV_MEM_NODES(env);
2064 BRefNode* br = NBREF(node);
2065 if (br->state & NST_RECURSION) break;
2066
2067 backs = BACKREFS_P(br);
2068 if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2069 r = get_min_match_length(nodes[backs[0]], min, env);
2070 if (r != 0) break;
2071 for (i = 1; i < br->back_num; i++) {
2072 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2073 r = get_min_match_length(nodes[backs[i]], &tmin, env);
2074 if (r != 0) break;
2075 if (*min > tmin) *min = tmin;
2076 }
2077 }
2078 break;
2079
2080 #ifdef USE_SUBEXP_CALL
2081 case NT_CALL:
2082 if (IS_CALL_RECURSION(NCALL(node))) {
2083 EncloseNode* en = NENCLOSE(NCALL(node)->target);
2084 if (IS_ENCLOSE_MIN_FIXED(en))
2085 *min = en->min_len;
2086 }
2087 else
2088 r = get_min_match_length(NCALL(node)->target, min, env);
2089 break;
2090 #endif
2091
2092 case NT_LIST:
2093 do {
2094 r = get_min_match_length(NCAR(node), &tmin, env);
2095 if (r == 0) *min += tmin;
2096 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2097 break;
2098
2099 case NT_ALT:
2100 {
2101 Node *x, *y;
2102 y = node;
2103 do {
2104 x = NCAR(y);
2105 r = get_min_match_length(x, &tmin, env);
2106 if (r != 0) break;
2107 if (y == node) *min = tmin;
2108 else if (*min > tmin) *min = tmin;
2109 } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2110 }
2111 break;
2112
2113 case NT_STR:
2114 {
2115 StrNode* sn = NSTR(node);
2116 *min = sn->end - sn->s;
2117 }
2118 break;
2119
2120 case NT_CTYPE:
2121 *min = 1;
2122 break;
2123
2124 case NT_CCLASS:
2125 case NT_CANY:
2126 *min = 1;
2127 break;
2128
2129 case NT_QTFR:
2130 {
2131 QtfrNode* qn = NQTFR(node);
2132
2133 if (qn->lower > 0) {
2134 r = get_min_match_length(qn->target, min, env);
2135 if (r == 0)
2136 *min = distance_multiply(*min, qn->lower);
2137 }
2138 }
2139 break;
2140
2141 case NT_ENCLOSE:
2142 {
2143 EncloseNode* en = NENCLOSE(node);
2144 switch (en->type) {
2145 case ENCLOSE_MEMORY:
2146 #ifdef USE_SUBEXP_CALL
2147 if (IS_ENCLOSE_MIN_FIXED(en))
2148 *min = en->min_len;
2149 else {
2150 r = get_min_match_length(en->target, min, env);
2151 if (r == 0) {
2152 en->min_len = *min;
2153 SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2154 }
2155 }
2156 break;
2157 #endif
2158 case ENCLOSE_OPTION:
2159 case ENCLOSE_STOP_BACKTRACK:
2160 r = get_min_match_length(en->target, min, env);
2161 break;
2162 }
2163 }
2164 break;
2165
2166 case NT_ANCHOR:
2167 default:
2168 break;
2169 }
2170
2171 return r;
2172 }
2173
2174 static int
get_max_match_length(Node * node,OnigDistance * max,ScanEnv * env)2175 get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2176 {
2177 OnigDistance tmax;
2178 int r = 0;
2179
2180 *max = 0;
2181 switch (NTYPE(node)) {
2182 case NT_LIST:
2183 do {
2184 r = get_max_match_length(NCAR(node), &tmax, env);
2185 if (r == 0)
2186 *max = distance_add(*max, tmax);
2187 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2188 break;
2189
2190 case NT_ALT:
2191 do {
2192 r = get_max_match_length(NCAR(node), &tmax, env);
2193 if (r == 0 && *max < tmax) *max = tmax;
2194 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2195 break;
2196
2197 case NT_STR:
2198 {
2199 StrNode* sn = NSTR(node);
2200 *max = sn->end - sn->s;
2201 }
2202 break;
2203
2204 case NT_CTYPE:
2205 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2206 break;
2207
2208 case NT_CCLASS:
2209 case NT_CANY:
2210 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2211 break;
2212
2213 case NT_BREF:
2214 {
2215 int i;
2216 int* backs;
2217 Node** nodes = SCANENV_MEM_NODES(env);
2218 BRefNode* br = NBREF(node);
2219 if (br->state & NST_RECURSION) {
2220 *max = ONIG_INFINITE_DISTANCE;
2221 break;
2222 }
2223 backs = BACKREFS_P(br);
2224 for (i = 0; i < br->back_num; i++) {
2225 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2226 r = get_max_match_length(nodes[backs[i]], &tmax, env);
2227 if (r != 0) break;
2228 if (*max < tmax) *max = tmax;
2229 }
2230 }
2231 break;
2232
2233 #ifdef USE_SUBEXP_CALL
2234 case NT_CALL:
2235 if (! IS_CALL_RECURSION(NCALL(node)))
2236 r = get_max_match_length(NCALL(node)->target, max, env);
2237 else
2238 *max = ONIG_INFINITE_DISTANCE;
2239 break;
2240 #endif
2241
2242 case NT_QTFR:
2243 {
2244 QtfrNode* qn = NQTFR(node);
2245
2246 if (qn->upper != 0) {
2247 r = get_max_match_length(qn->target, max, env);
2248 if (r == 0 && *max != 0) {
2249 if (! IS_REPEAT_INFINITE(qn->upper))
2250 *max = distance_multiply(*max, qn->upper);
2251 else
2252 *max = ONIG_INFINITE_DISTANCE;
2253 }
2254 }
2255 }
2256 break;
2257
2258 case NT_ENCLOSE:
2259 {
2260 EncloseNode* en = NENCLOSE(node);
2261 switch (en->type) {
2262 case ENCLOSE_MEMORY:
2263 #ifdef USE_SUBEXP_CALL
2264 if (IS_ENCLOSE_MAX_FIXED(en))
2265 *max = en->max_len;
2266 else {
2267 r = get_max_match_length(en->target, max, env);
2268 if (r == 0) {
2269 en->max_len = *max;
2270 SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2271 }
2272 }
2273 break;
2274 #endif
2275 case ENCLOSE_OPTION:
2276 case ENCLOSE_STOP_BACKTRACK:
2277 r = get_max_match_length(en->target, max, env);
2278 break;
2279 }
2280 }
2281 break;
2282
2283 case NT_ANCHOR:
2284 default:
2285 break;
2286 }
2287
2288 return r;
2289 }
2290
2291 #define GET_CHAR_LEN_VARLEN -1
2292 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2293
2294 /* fixed size pattern node only */
2295 static int
get_char_length_tree1(Node * node,regex_t * reg,int * len,int level)2296 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2297 {
2298 int tlen;
2299 int r = 0;
2300
2301 level++;
2302 *len = 0;
2303 switch (NTYPE(node)) {
2304 case NT_LIST:
2305 do {
2306 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2307 if (r == 0)
2308 *len = distance_add(*len, tlen);
2309 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2310 break;
2311
2312 case NT_ALT:
2313 {
2314 int tlen2;
2315 int varlen = 0;
2316
2317 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2318 while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2319 r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2320 if (r == 0) {
2321 if (tlen != tlen2)
2322 varlen = 1;
2323 }
2324 }
2325 if (r == 0) {
2326 if (varlen != 0) {
2327 if (level == 1)
2328 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2329 else
2330 r = GET_CHAR_LEN_VARLEN;
2331 }
2332 else
2333 *len = tlen;
2334 }
2335 }
2336 break;
2337
2338 case NT_STR:
2339 {
2340 StrNode* sn = NSTR(node);
2341 UChar *s = sn->s;
2342 while (s < sn->end) {
2343 s += enclen(reg->enc, s);
2344 (*len)++;
2345 }
2346 }
2347 break;
2348
2349 case NT_QTFR:
2350 {
2351 QtfrNode* qn = NQTFR(node);
2352 if (qn->lower == qn->upper) {
2353 r = get_char_length_tree1(qn->target, reg, &tlen, level);
2354 if (r == 0)
2355 *len = distance_multiply(tlen, qn->lower);
2356 }
2357 else
2358 r = GET_CHAR_LEN_VARLEN;
2359 }
2360 break;
2361
2362 #ifdef USE_SUBEXP_CALL
2363 case NT_CALL:
2364 if (! IS_CALL_RECURSION(NCALL(node)))
2365 r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2366 else
2367 r = GET_CHAR_LEN_VARLEN;
2368 break;
2369 #endif
2370
2371 case NT_CTYPE:
2372 *len = 1;
2373 break;
2374
2375 case NT_CCLASS:
2376 case NT_CANY:
2377 *len = 1;
2378 break;
2379
2380 case NT_ENCLOSE:
2381 {
2382 EncloseNode* en = NENCLOSE(node);
2383 switch (en->type) {
2384 case ENCLOSE_MEMORY:
2385 #ifdef USE_SUBEXP_CALL
2386 if (IS_ENCLOSE_CLEN_FIXED(en))
2387 *len = en->char_len;
2388 else {
2389 r = get_char_length_tree1(en->target, reg, len, level);
2390 if (r == 0) {
2391 en->char_len = *len;
2392 SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2393 }
2394 }
2395 break;
2396 #endif
2397 case ENCLOSE_OPTION:
2398 case ENCLOSE_STOP_BACKTRACK:
2399 r = get_char_length_tree1(en->target, reg, len, level);
2400 break;
2401 default:
2402 break;
2403 }
2404 }
2405 break;
2406
2407 case NT_ANCHOR:
2408 break;
2409
2410 default:
2411 r = GET_CHAR_LEN_VARLEN;
2412 break;
2413 }
2414
2415 return r;
2416 }
2417
2418 static int
get_char_length_tree(Node * node,regex_t * reg,int * len)2419 get_char_length_tree(Node* node, regex_t* reg, int* len)
2420 {
2421 return get_char_length_tree1(node, reg, len, 0);
2422 }
2423
2424 /* x is not included y ==> 1 : 0 */
2425 static int
is_not_included(Node * x,Node * y,regex_t * reg)2426 is_not_included(Node* x, Node* y, regex_t* reg)
2427 {
2428 int i, len;
2429 OnigCodePoint code;
2430 UChar *p, c;
2431 int ytype;
2432
2433 retry:
2434 ytype = NTYPE(y);
2435 switch (NTYPE(x)) {
2436 case NT_CTYPE:
2437 {
2438 switch (ytype) {
2439 case NT_CTYPE:
2440 if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2441 NCTYPE(y)->not != NCTYPE(x)->not)
2442 return 1;
2443 else
2444 return 0;
2445 break;
2446
2447 case NT_CCLASS:
2448 swap:
2449 {
2450 Node* tmp;
2451 tmp = x; x = y; y = tmp;
2452 goto retry;
2453 }
2454 break;
2455
2456 case NT_STR:
2457 goto swap;
2458 break;
2459
2460 default:
2461 break;
2462 }
2463 }
2464 break;
2465
2466 case NT_CCLASS:
2467 {
2468 CClassNode* xc = NCCLASS(x);
2469 switch (ytype) {
2470 case NT_CTYPE:
2471 switch (NCTYPE(y)->ctype) {
2472 case ONIGENC_CTYPE_WORD:
2473 if (NCTYPE(y)->not == 0) {
2474 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2475 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2476 if (BITSET_AT(xc->bs, i)) {
2477 if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2478 }
2479 }
2480 return 1;
2481 }
2482 return 0;
2483 }
2484 else {
2485 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2486 if (! IS_CODE_SB_WORD(reg->enc, i)) {
2487 if (!IS_NCCLASS_NOT(xc)) {
2488 if (BITSET_AT(xc->bs, i))
2489 return 0;
2490 }
2491 else {
2492 if (! BITSET_AT(xc->bs, i))
2493 return 0;
2494 }
2495 }
2496 }
2497 return 1;
2498 }
2499 break;
2500
2501 default:
2502 break;
2503 }
2504 break;
2505
2506 case NT_CCLASS:
2507 {
2508 int v;
2509 CClassNode* yc = NCCLASS(y);
2510
2511 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2512 v = BITSET_AT(xc->bs, i);
2513 if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2514 (v == 0 && IS_NCCLASS_NOT(xc))) {
2515 v = BITSET_AT(yc->bs, i);
2516 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2517 (v == 0 && IS_NCCLASS_NOT(yc)))
2518 return 0;
2519 }
2520 }
2521 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2522 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2523 return 1;
2524 return 0;
2525 }
2526 break;
2527
2528 case NT_STR:
2529 goto swap;
2530 break;
2531
2532 default:
2533 break;
2534 }
2535 }
2536 break;
2537
2538 case NT_STR:
2539 {
2540 StrNode* xs = NSTR(x);
2541 if (NSTRING_LEN(x) == 0)
2542 break;
2543
2544 c = *(xs->s);
2545 switch (ytype) {
2546 case NT_CTYPE:
2547 switch (NCTYPE(y)->ctype) {
2548 case ONIGENC_CTYPE_WORD:
2549 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2550 return NCTYPE(y)->not;
2551 else
2552 return !(NCTYPE(y)->not);
2553 break;
2554 default:
2555 break;
2556 }
2557 break;
2558
2559 case NT_CCLASS:
2560 {
2561 CClassNode* cc = NCCLASS(y);
2562
2563 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2564 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2565 return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2566 }
2567 break;
2568
2569 case NT_STR:
2570 {
2571 UChar *q;
2572 StrNode* ys = NSTR(y);
2573 len = NSTRING_LEN(x);
2574 if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2575 if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2576 /* tiny version */
2577 return 0;
2578 }
2579 else {
2580 for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2581 if (*p != *q) return 1;
2582 }
2583 }
2584 }
2585 break;
2586
2587 default:
2588 break;
2589 }
2590 }
2591 break;
2592
2593 default:
2594 break;
2595 }
2596
2597 return 0;
2598 }
2599
2600 static Node*
get_head_value_node(Node * node,int exact,regex_t * reg)2601 get_head_value_node(Node* node, int exact, regex_t* reg)
2602 {
2603 Node* n = NULL_NODE;
2604
2605 switch (NTYPE(node)) {
2606 case NT_BREF:
2607 case NT_ALT:
2608 case NT_CANY:
2609 #ifdef USE_SUBEXP_CALL
2610 case NT_CALL:
2611 #endif
2612 break;
2613
2614 case NT_CTYPE:
2615 case NT_CCLASS:
2616 if (exact == 0) {
2617 n = node;
2618 }
2619 break;
2620
2621 case NT_LIST:
2622 n = get_head_value_node(NCAR(node), exact, reg);
2623 break;
2624
2625 case NT_STR:
2626 {
2627 StrNode* sn = NSTR(node);
2628
2629 if (sn->end <= sn->s)
2630 break;
2631
2632 if (exact != 0 &&
2633 !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2634 }
2635 else {
2636 n = node;
2637 }
2638 }
2639 break;
2640
2641 case NT_QTFR:
2642 {
2643 QtfrNode* qn = NQTFR(node);
2644 if (qn->lower > 0) {
2645 if (IS_NOT_NULL(qn->head_exact))
2646 n = qn->head_exact;
2647 else
2648 n = get_head_value_node(qn->target, exact, reg);
2649 }
2650 }
2651 break;
2652
2653 case NT_ENCLOSE:
2654 {
2655 EncloseNode* en = NENCLOSE(node);
2656 switch (en->type) {
2657 case ENCLOSE_OPTION:
2658 {
2659 OnigOptionType options = reg->options;
2660
2661 reg->options = NENCLOSE(node)->option;
2662 n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2663 reg->options = options;
2664 }
2665 break;
2666
2667 case ENCLOSE_MEMORY:
2668 case ENCLOSE_STOP_BACKTRACK:
2669 n = get_head_value_node(en->target, exact, reg);
2670 break;
2671 }
2672 }
2673 break;
2674
2675 case NT_ANCHOR:
2676 if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2677 n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2678 break;
2679
2680 default:
2681 break;
2682 }
2683
2684 return n;
2685 }
2686
2687 static int
check_type_tree(Node * node,int type_mask,int enclose_mask,int anchor_mask)2688 check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2689 {
2690 int type, r = 0;
2691
2692 type = NTYPE(node);
2693 if ((NTYPE2BIT(type) & type_mask) == 0)
2694 return 1;
2695
2696 switch (type) {
2697 case NT_LIST:
2698 case NT_ALT:
2699 do {
2700 r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2701 anchor_mask);
2702 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2703 break;
2704
2705 case NT_QTFR:
2706 r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2707 anchor_mask);
2708 break;
2709
2710 case NT_ENCLOSE:
2711 {
2712 EncloseNode* en = NENCLOSE(node);
2713 if ((en->type & enclose_mask) == 0)
2714 return 1;
2715
2716 r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2717 }
2718 break;
2719
2720 case NT_ANCHOR:
2721 type = NANCHOR(node)->type;
2722 if ((type & anchor_mask) == 0)
2723 return 1;
2724
2725 if (NANCHOR(node)->target)
2726 r = check_type_tree(NANCHOR(node)->target,
2727 type_mask, enclose_mask, anchor_mask);
2728 break;
2729
2730 default:
2731 break;
2732 }
2733 return r;
2734 }
2735
2736 #ifdef USE_SUBEXP_CALL
2737
2738 #define RECURSION_EXIST 1
2739 #define RECURSION_INFINITE 2
2740
2741 static int
subexp_inf_recursive_check(Node * node,ScanEnv * env,int head)2742 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2743 {
2744 int type;
2745 int r = 0;
2746
2747 type = NTYPE(node);
2748 switch (type) {
2749 case NT_LIST:
2750 {
2751 Node *x;
2752 OnigDistance min;
2753 int ret;
2754
2755 x = node;
2756 do {
2757 ret = subexp_inf_recursive_check(NCAR(x), env, head);
2758 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2759 r |= ret;
2760 if (head) {
2761 ret = get_min_match_length(NCAR(x), &min, env);
2762 if (ret != 0) return ret;
2763 if (min != 0) head = 0;
2764 }
2765 } while (IS_NOT_NULL(x = NCDR(x)));
2766 }
2767 break;
2768
2769 case NT_ALT:
2770 {
2771 int ret;
2772 r = RECURSION_EXIST;
2773 do {
2774 ret = subexp_inf_recursive_check(NCAR(node), env, head);
2775 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2776 r &= ret;
2777 } while (IS_NOT_NULL(node = NCDR(node)));
2778 }
2779 break;
2780
2781 case NT_QTFR:
2782 r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2783 if (r == RECURSION_EXIST) {
2784 if (NQTFR(node)->lower == 0) r = 0;
2785 }
2786 break;
2787
2788 case NT_ANCHOR:
2789 {
2790 AnchorNode* an = NANCHOR(node);
2791 switch (an->type) {
2792 case ANCHOR_PREC_READ:
2793 case ANCHOR_PREC_READ_NOT:
2794 case ANCHOR_LOOK_BEHIND:
2795 case ANCHOR_LOOK_BEHIND_NOT:
2796 r = subexp_inf_recursive_check(an->target, env, head);
2797 break;
2798 }
2799 }
2800 break;
2801
2802 case NT_CALL:
2803 r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2804 break;
2805
2806 case NT_ENCLOSE:
2807 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2808 return 0;
2809 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2810 return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2811 else {
2812 SET_ENCLOSE_STATUS(node, NST_MARK2);
2813 r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2814 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2815 }
2816 break;
2817
2818 default:
2819 break;
2820 }
2821
2822 return r;
2823 }
2824
2825 static int
subexp_inf_recursive_check_trav(Node * node,ScanEnv * env)2826 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2827 {
2828 int type;
2829 int r = 0;
2830
2831 type = NTYPE(node);
2832 switch (type) {
2833 case NT_LIST:
2834 case NT_ALT:
2835 do {
2836 r = subexp_inf_recursive_check_trav(NCAR(node), env);
2837 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2838 break;
2839
2840 case NT_QTFR:
2841 r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
2842 break;
2843
2844 case NT_ANCHOR:
2845 {
2846 AnchorNode* an = NANCHOR(node);
2847 switch (an->type) {
2848 case ANCHOR_PREC_READ:
2849 case ANCHOR_PREC_READ_NOT:
2850 case ANCHOR_LOOK_BEHIND:
2851 case ANCHOR_LOOK_BEHIND_NOT:
2852 r = subexp_inf_recursive_check_trav(an->target, env);
2853 break;
2854 }
2855 }
2856 break;
2857
2858 case NT_ENCLOSE:
2859 {
2860 EncloseNode* en = NENCLOSE(node);
2861
2862 if (IS_ENCLOSE_RECURSION(en)) {
2863 SET_ENCLOSE_STATUS(node, NST_MARK1);
2864 r = subexp_inf_recursive_check(en->target, env, 1);
2865 if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
2866 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2867 }
2868 r = subexp_inf_recursive_check_trav(en->target, env);
2869 }
2870
2871 break;
2872
2873 default:
2874 break;
2875 }
2876
2877 return r;
2878 }
2879
2880 static int
subexp_recursive_check(Node * node)2881 subexp_recursive_check(Node* node)
2882 {
2883 int r = 0;
2884
2885 switch (NTYPE(node)) {
2886 case NT_LIST:
2887 case NT_ALT:
2888 do {
2889 r |= subexp_recursive_check(NCAR(node));
2890 } while (IS_NOT_NULL(node = NCDR(node)));
2891 break;
2892
2893 case NT_QTFR:
2894 r = subexp_recursive_check(NQTFR(node)->target);
2895 break;
2896
2897 case NT_ANCHOR:
2898 {
2899 AnchorNode* an = NANCHOR(node);
2900 switch (an->type) {
2901 case ANCHOR_PREC_READ:
2902 case ANCHOR_PREC_READ_NOT:
2903 case ANCHOR_LOOK_BEHIND:
2904 case ANCHOR_LOOK_BEHIND_NOT:
2905 r = subexp_recursive_check(an->target);
2906 break;
2907 }
2908 }
2909 break;
2910
2911 case NT_CALL:
2912 r = subexp_recursive_check(NCALL(node)->target);
2913 if (r != 0) SET_CALL_RECURSION(node);
2914 break;
2915
2916 case NT_ENCLOSE:
2917 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2918 return 0;
2919 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2920 return 1; /* recursion */
2921 else {
2922 SET_ENCLOSE_STATUS(node, NST_MARK2);
2923 r = subexp_recursive_check(NENCLOSE(node)->target);
2924 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2925 }
2926 break;
2927
2928 default:
2929 break;
2930 }
2931
2932 return r;
2933 }
2934
2935
2936 static int
subexp_recursive_check_trav(Node * node,ScanEnv * env)2937 subexp_recursive_check_trav(Node* node, ScanEnv* env)
2938 {
2939 #define FOUND_CALLED_NODE 1
2940
2941 int type;
2942 int r = 0;
2943
2944 type = NTYPE(node);
2945 switch (type) {
2946 case NT_LIST:
2947 case NT_ALT:
2948 {
2949 int ret;
2950 do {
2951 ret = subexp_recursive_check_trav(NCAR(node), env);
2952 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
2953 else if (ret < 0) return ret;
2954 } while (IS_NOT_NULL(node = NCDR(node)));
2955 }
2956 break;
2957
2958 case NT_QTFR:
2959 r = subexp_recursive_check_trav(NQTFR(node)->target, env);
2960 if (NQTFR(node)->upper == 0) {
2961 if (r == FOUND_CALLED_NODE)
2962 NQTFR(node)->is_refered = 1;
2963 }
2964 break;
2965
2966 case NT_ANCHOR:
2967 {
2968 AnchorNode* an = NANCHOR(node);
2969 switch (an->type) {
2970 case ANCHOR_PREC_READ:
2971 case ANCHOR_PREC_READ_NOT:
2972 case ANCHOR_LOOK_BEHIND:
2973 case ANCHOR_LOOK_BEHIND_NOT:
2974 r = subexp_recursive_check_trav(an->target, env);
2975 break;
2976 }
2977 }
2978 break;
2979
2980 case NT_ENCLOSE:
2981 {
2982 EncloseNode* en = NENCLOSE(node);
2983
2984 if (! IS_ENCLOSE_RECURSION(en)) {
2985 if (IS_ENCLOSE_CALLED(en)) {
2986 SET_ENCLOSE_STATUS(node, NST_MARK1);
2987 r = subexp_recursive_check(en->target);
2988 if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
2989 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2990 }
2991 }
2992 r = subexp_recursive_check_trav(en->target, env);
2993 if (IS_ENCLOSE_CALLED(en))
2994 r |= FOUND_CALLED_NODE;
2995 }
2996 break;
2997
2998 default:
2999 break;
3000 }
3001
3002 return r;
3003 }
3004
3005 static int
setup_subexp_call(Node * node,ScanEnv * env)3006 setup_subexp_call(Node* node, ScanEnv* env)
3007 {
3008 int type;
3009 int r = 0;
3010
3011 type = NTYPE(node);
3012 switch (type) {
3013 case NT_LIST:
3014 do {
3015 r = setup_subexp_call(NCAR(node), env);
3016 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3017 break;
3018
3019 case NT_ALT:
3020 do {
3021 r = setup_subexp_call(NCAR(node), env);
3022 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3023 break;
3024
3025 case NT_QTFR:
3026 r = setup_subexp_call(NQTFR(node)->target, env);
3027 break;
3028 case NT_ENCLOSE:
3029 r = setup_subexp_call(NENCLOSE(node)->target, env);
3030 break;
3031
3032 case NT_CALL:
3033 {
3034 CallNode* cn = NCALL(node);
3035 Node** nodes = SCANENV_MEM_NODES(env);
3036
3037 if (cn->group_num != 0) {
3038 int gnum = cn->group_num;
3039
3040 #ifdef USE_NAMED_GROUP
3041 if (env->num_named > 0 &&
3042 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3043 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3044 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3045 }
3046 #endif
3047 if (gnum > env->num_mem) {
3048 onig_scan_env_set_error_string(env,
3049 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3050 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3051 }
3052
3053 #ifdef USE_NAMED_GROUP
3054 set_call_attr:
3055 #endif
3056 cn->target = nodes[cn->group_num];
3057 if (IS_NULL(cn->target)) {
3058 onig_scan_env_set_error_string(env,
3059 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3060 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3061 }
3062 SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3063 BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3064 cn->unset_addr_list = env->unset_addr_list;
3065 }
3066 #ifdef USE_NAMED_GROUP
3067 else {
3068 int *refs;
3069
3070 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3071 &refs);
3072 if (n <= 0) {
3073 onig_scan_env_set_error_string(env,
3074 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3075 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3076 }
3077 else if (n > 1) {
3078 onig_scan_env_set_error_string(env,
3079 ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3080 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3081 }
3082 else {
3083 cn->group_num = refs[0];
3084 goto set_call_attr;
3085 }
3086 }
3087 #endif
3088 }
3089 break;
3090
3091 case NT_ANCHOR:
3092 {
3093 AnchorNode* an = NANCHOR(node);
3094
3095 switch (an->type) {
3096 case ANCHOR_PREC_READ:
3097 case ANCHOR_PREC_READ_NOT:
3098 case ANCHOR_LOOK_BEHIND:
3099 case ANCHOR_LOOK_BEHIND_NOT:
3100 r = setup_subexp_call(an->target, env);
3101 break;
3102 }
3103 }
3104 break;
3105
3106 default:
3107 break;
3108 }
3109
3110 return r;
3111 }
3112 #endif
3113
3114 /* divide different length alternatives in look-behind.
3115 (?<=A|B) ==> (?<=A)|(?<=B)
3116 (?<!A|B) ==> (?<!A)(?<!B)
3117 */
3118 static int
divide_look_behind_alternatives(Node * node)3119 divide_look_behind_alternatives(Node* node)
3120 {
3121 Node *head, *np, *insert_node;
3122 AnchorNode* an = NANCHOR(node);
3123 int anc_type = an->type;
3124
3125 head = an->target;
3126 np = NCAR(head);
3127 swap_node(node, head);
3128 NCAR(node) = head;
3129 NANCHOR(head)->target = np;
3130
3131 np = node;
3132 while ((np = NCDR(np)) != NULL_NODE) {
3133 insert_node = onig_node_new_anchor(anc_type);
3134 CHECK_NULL_RETURN_MEMERR(insert_node);
3135 NANCHOR(insert_node)->target = NCAR(np);
3136 NCAR(np) = insert_node;
3137 }
3138
3139 if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3140 np = node;
3141 do {
3142 SET_NTYPE(np, NT_LIST); /* alt -> list */
3143 } while ((np = NCDR(np)) != NULL_NODE);
3144 }
3145 return 0;
3146 }
3147
3148 static int
setup_look_behind(Node * node,regex_t * reg,ScanEnv * env)3149 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3150 {
3151 int r, len;
3152 AnchorNode* an = NANCHOR(node);
3153
3154 r = get_char_length_tree(an->target, reg, &len);
3155 if (r == 0)
3156 an->char_len = len;
3157 else if (r == GET_CHAR_LEN_VARLEN)
3158 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3159 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3160 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3161 r = divide_look_behind_alternatives(node);
3162 else
3163 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3164 }
3165
3166 return r;
3167 }
3168
3169 static int
next_setup(Node * node,Node * next_node,regex_t * reg)3170 next_setup(Node* node, Node* next_node, regex_t* reg)
3171 {
3172 int type;
3173
3174 retry:
3175 type = NTYPE(node);
3176 if (type == NT_QTFR) {
3177 QtfrNode* qn = NQTFR(node);
3178 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3179 #ifdef USE_QTFR_PEEK_NEXT
3180 Node* n = get_head_value_node(next_node, 1, reg);
3181 /* '\0': for UTF-16BE etc... */
3182 if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3183 qn->next_head_exact = n;
3184 }
3185 #endif
3186 /* automatic posseivation a*b ==> (?>a*)b */
3187 if (qn->lower <= 1) {
3188 int ttype = NTYPE(qn->target);
3189 if (IS_NODE_TYPE_SIMPLE(ttype)) {
3190 Node *x, *y;
3191 x = get_head_value_node(qn->target, 0, reg);
3192 if (IS_NOT_NULL(x)) {
3193 y = get_head_value_node(next_node, 0, reg);
3194 if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3195 Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3196 CHECK_NULL_RETURN_MEMERR(en);
3197 SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3198 swap_node(node, en);
3199 NENCLOSE(node)->target = en;
3200 }
3201 }
3202 }
3203 }
3204 }
3205 }
3206 else if (type == NT_ENCLOSE) {
3207 EncloseNode* en = NENCLOSE(node);
3208 if (en->type == ENCLOSE_MEMORY) {
3209 node = en->target;
3210 goto retry;
3211 }
3212 }
3213 return 0;
3214 }
3215
3216
3217 static int
update_string_node_case_fold(regex_t * reg,Node * node)3218 update_string_node_case_fold(regex_t* reg, Node *node)
3219 {
3220 UChar *p, *q, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3221 UChar *sbuf, *ebuf, *sp;
3222 int r, i, len, sbuf_size;
3223 StrNode* sn = NSTR(node);
3224
3225 end = sn->end;
3226 sbuf_size = (end - sn->s) * 2;
3227 sbuf = (UChar* )xmalloc(sbuf_size);
3228 CHECK_NULL_RETURN_MEMERR(sbuf);
3229 ebuf = sbuf + sbuf_size;
3230
3231 sp = sbuf;
3232 p = sn->s;
3233 while (p < end) {
3234 len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3235 q = buf;
3236 for (i = 0; i < len; i++) {
3237 if (sp >= ebuf) {
3238 sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3239 CHECK_NULL_RETURN_MEMERR(sbuf);
3240 sp = sbuf + sbuf_size;
3241 sbuf_size *= 2;
3242 ebuf = sbuf + sbuf_size;
3243 }
3244
3245 *sp++ = buf[i];
3246 }
3247 }
3248
3249 r = onig_node_str_set(node, sbuf, sp);
3250 if (r != 0) {
3251 xfree(sbuf);
3252 return r;
3253 }
3254
3255 xfree(sbuf);
3256 return 0;
3257 }
3258
3259 static int
expand_case_fold_make_rem_string(Node ** rnode,UChar * s,UChar * end,regex_t * reg)3260 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3261 regex_t* reg)
3262 {
3263 int r;
3264 Node *node;
3265
3266 node = onig_node_new_str(s, end);
3267 if (IS_NULL(node)) return ONIGERR_MEMORY;
3268
3269 r = update_string_node_case_fold(reg, node);
3270 if (r != 0) {
3271 onig_node_free(node);
3272 return r;
3273 }
3274
3275 NSTRING_SET_AMBIG(node);
3276 NSTRING_SET_DONT_GET_OPT_INFO(node);
3277 *rnode = node;
3278 return 0;
3279 }
3280
3281 static int
expand_case_fold_string_alt(int item_num,OnigCaseFoldCodeItem items[],UChar * p,int slen,UChar * end,regex_t * reg,Node ** rnode)3282 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3283 UChar *p, int slen, UChar *end,
3284 regex_t* reg, Node **rnode)
3285 {
3286 int r, i, j, len, varlen;
3287 Node *anode, *var_anode, *snode, *xnode, *an;
3288 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3289
3290 *rnode = var_anode = NULL_NODE;
3291
3292 varlen = 0;
3293 for (i = 0; i < item_num; i++) {
3294 if (items[i].byte_len != slen) {
3295 varlen = 1;
3296 break;
3297 }
3298 }
3299
3300 if (varlen != 0) {
3301 *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3302 if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3303
3304 xnode = onig_node_new_list(NULL, NULL);
3305 if (IS_NULL(xnode)) goto mem_err;
3306 NCAR(var_anode) = xnode;
3307
3308 anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3309 if (IS_NULL(anode)) goto mem_err;
3310 NCAR(xnode) = anode;
3311 }
3312 else {
3313 *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3314 if (IS_NULL(anode)) return ONIGERR_MEMORY;
3315 }
3316
3317 snode = onig_node_new_str(p, p + slen);
3318 if (IS_NULL(snode)) goto mem_err;
3319
3320 NCAR(anode) = snode;
3321
3322 for (i = 0; i < item_num; i++) {
3323 snode = onig_node_new_str(NULL, NULL);
3324 if (IS_NULL(snode)) goto mem_err;
3325
3326 for (j = 0; j < items[i].code_len; j++) {
3327 len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3328 if (len < 0) {
3329 r = len;
3330 goto mem_err2;
3331 }
3332
3333 r = onig_node_str_cat(snode, buf, buf + len);
3334 if (r != 0) goto mem_err2;
3335 }
3336
3337 an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3338 if (IS_NULL(an)) {
3339 goto mem_err2;
3340 }
3341
3342 if (items[i].byte_len != slen) {
3343 Node *rem;
3344 UChar *q = p + items[i].byte_len;
3345
3346 if (q < end) {
3347 r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3348 if (r != 0) {
3349 onig_node_free(an);
3350 goto mem_err2;
3351 }
3352
3353 xnode = onig_node_list_add(NULL_NODE, snode);
3354 if (IS_NULL(xnode)) {
3355 onig_node_free(an);
3356 onig_node_free(rem);
3357 goto mem_err2;
3358 }
3359 if (IS_NULL(onig_node_list_add(xnode, rem))) {
3360 onig_node_free(an);
3361 onig_node_free(xnode);
3362 onig_node_free(rem);
3363 goto mem_err;
3364 }
3365
3366 NCAR(an) = xnode;
3367 }
3368 else {
3369 NCAR(an) = snode;
3370 }
3371
3372 NCDR(var_anode) = an;
3373 var_anode = an;
3374 }
3375 else {
3376 NCAR(an) = snode;
3377 NCDR(anode) = an;
3378 anode = an;
3379 }
3380 }
3381
3382 return varlen;
3383
3384 mem_err2:
3385 onig_node_free(snode);
3386
3387 mem_err:
3388 onig_node_free(*rnode);
3389
3390 return ONIGERR_MEMORY;
3391 }
3392
3393 static int
expand_case_fold_string(Node * node,regex_t * reg)3394 expand_case_fold_string(Node* node, regex_t* reg)
3395 {
3396 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3397
3398 int r, n, len, alt_num;
3399 UChar *start, *end, *p;
3400 Node *top_root, *root, *snode, *prev_node;
3401 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3402 StrNode* sn = NSTR(node);
3403
3404 if (NSTRING_IS_AMBIG(node)) return 0;
3405
3406 start = sn->s;
3407 end = sn->end;
3408 if (start >= end) return 0;
3409
3410 r = 0;
3411 top_root = root = prev_node = snode = NULL_NODE;
3412 alt_num = 1;
3413 p = start;
3414 while (p < end) {
3415 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3416 p, end, items);
3417 if (n < 0) {
3418 r = n;
3419 goto err;
3420 }
3421
3422 len = enclen(reg->enc, p);
3423
3424 if (n == 0) {
3425 if (IS_NULL(snode)) {
3426 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3427 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3428 if (IS_NULL(root)) {
3429 onig_node_free(prev_node);
3430 goto mem_err;
3431 }
3432 }
3433
3434 prev_node = snode = onig_node_new_str(NULL, NULL);
3435 if (IS_NULL(snode)) goto mem_err;
3436 if (IS_NOT_NULL(root)) {
3437 if (IS_NULL(onig_node_list_add(root, snode))) {
3438 onig_node_free(snode);
3439 goto mem_err;
3440 }
3441 }
3442 }
3443
3444 r = onig_node_str_cat(snode, p, p + len);
3445 if (r != 0) goto err;
3446 }
3447 else {
3448 alt_num *= (n + 1);
3449 if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3450
3451 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3452 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3453 if (IS_NULL(root)) {
3454 onig_node_free(prev_node);
3455 goto mem_err;
3456 }
3457 }
3458
3459 r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3460 if (r < 0) goto mem_err;
3461 if (r == 1) {
3462 if (IS_NULL(root)) {
3463 top_root = prev_node;
3464 }
3465 else {
3466 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3467 onig_node_free(prev_node);
3468 goto mem_err;
3469 }
3470 }
3471
3472 root = NCAR(prev_node);
3473 }
3474 else { /* r == 0 */
3475 if (IS_NOT_NULL(root)) {
3476 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3477 onig_node_free(prev_node);
3478 goto mem_err;
3479 }
3480 }
3481 }
3482
3483 snode = NULL_NODE;
3484 }
3485
3486 p += len;
3487 }
3488
3489 if (p < end) {
3490 Node *srem;
3491
3492 r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3493 if (r != 0) goto mem_err;
3494
3495 if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3496 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3497 if (IS_NULL(root)) {
3498 onig_node_free(srem);
3499 onig_node_free(prev_node);
3500 goto mem_err;
3501 }
3502 }
3503
3504 if (IS_NULL(root)) {
3505 prev_node = srem;
3506 }
3507 else {
3508 if (IS_NULL(onig_node_list_add(root, srem))) {
3509 onig_node_free(srem);
3510 goto mem_err;
3511 }
3512 }
3513 }
3514
3515 /* ending */
3516 top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3517 swap_node(node, top_root);
3518 onig_node_free(top_root);
3519 return 0;
3520
3521 mem_err:
3522 r = ONIGERR_MEMORY;
3523
3524 err:
3525 onig_node_free(top_root);
3526 return r;
3527 }
3528
3529
3530 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3531
3532 #define CEC_THRES_NUM_BIG_REPEAT 512
3533 #define CEC_INFINITE_NUM 0x7fffffff
3534
3535 #define CEC_IN_INFINITE_REPEAT (1<<0)
3536 #define CEC_IN_FINITE_REPEAT (1<<1)
3537 #define CEC_CONT_BIG_REPEAT (1<<2)
3538
3539 static int
setup_comb_exp_check(Node * node,int state,ScanEnv * env)3540 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3541 {
3542 int type;
3543 int r = state;
3544
3545 type = NTYPE(node);
3546 switch (type) {
3547 case NT_LIST:
3548 {
3549 Node* prev = NULL_NODE;
3550 do {
3551 r = setup_comb_exp_check(NCAR(node), r, env);
3552 prev = NCAR(node);
3553 } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3554 }
3555 break;
3556
3557 case NT_ALT:
3558 {
3559 int ret;
3560 do {
3561 ret = setup_comb_exp_check(NCAR(node), state, env);
3562 r |= ret;
3563 } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3564 }
3565 break;
3566
3567 case NT_QTFR:
3568 {
3569 int child_state = state;
3570 int add_state = 0;
3571 QtfrNode* qn = NQTFR(node);
3572 Node* target = qn->target;
3573 int var_num;
3574
3575 if (! IS_REPEAT_INFINITE(qn->upper)) {
3576 if (qn->upper > 1) {
3577 /* {0,1}, {1,1} are allowed */
3578 child_state |= CEC_IN_FINITE_REPEAT;
3579
3580 /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3581 if (env->backrefed_mem == 0) {
3582 if (NTYPE(qn->target) == NT_ENCLOSE) {
3583 EncloseNode* en = NENCLOSE(qn->target);
3584 if (en->type == ENCLOSE_MEMORY) {
3585 if (NTYPE(en->target) == NT_QTFR) {
3586 QtfrNode* q = NQTFR(en->target);
3587 if (IS_REPEAT_INFINITE(q->upper)
3588 && q->greedy == qn->greedy) {
3589 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3590 if (qn->upper == 1)
3591 child_state = state;
3592 }
3593 }
3594 }
3595 }
3596 }
3597 }
3598 }
3599
3600 if (state & CEC_IN_FINITE_REPEAT) {
3601 qn->comb_exp_check_num = -1;
3602 }
3603 else {
3604 if (IS_REPEAT_INFINITE(qn->upper)) {
3605 var_num = CEC_INFINITE_NUM;
3606 child_state |= CEC_IN_INFINITE_REPEAT;
3607 }
3608 else {
3609 var_num = qn->upper - qn->lower;
3610 }
3611
3612 if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3613 add_state |= CEC_CONT_BIG_REPEAT;
3614
3615 if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3616 ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3617 var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3618 if (qn->comb_exp_check_num == 0) {
3619 env->num_comb_exp_check++;
3620 qn->comb_exp_check_num = env->num_comb_exp_check;
3621 if (env->curr_max_regnum > env->comb_exp_max_regnum)
3622 env->comb_exp_max_regnum = env->curr_max_regnum;
3623 }
3624 }
3625 }
3626
3627 r = setup_comb_exp_check(target, child_state, env);
3628 r |= add_state;
3629 }
3630 break;
3631
3632 case NT_ENCLOSE:
3633 {
3634 EncloseNode* en = NENCLOSE(node);
3635
3636 switch (en->type) {
3637 case ENCLOSE_MEMORY:
3638 {
3639 if (env->curr_max_regnum < en->regnum)
3640 env->curr_max_regnum = en->regnum;
3641
3642 r = setup_comb_exp_check(en->target, state, env);
3643 }
3644 break;
3645
3646 default:
3647 r = setup_comb_exp_check(en->target, state, env);
3648 break;
3649 }
3650 }
3651 break;
3652
3653 #ifdef USE_SUBEXP_CALL
3654 case NT_CALL:
3655 if (IS_CALL_RECURSION(NCALL(node)))
3656 env->has_recursion = 1;
3657 else
3658 r = setup_comb_exp_check(NCALL(node)->target, state, env);
3659 break;
3660 #endif
3661
3662 default:
3663 break;
3664 }
3665
3666 return r;
3667 }
3668 #endif
3669
3670 #define IN_ALT (1<<0)
3671 #define IN_NOT (1<<1)
3672 #define IN_REPEAT (1<<2)
3673 #define IN_VAR_REPEAT (1<<3)
3674
3675 /* setup_tree does the following work.
3676 1. check empty loop. (set qn->target_empty_info)
3677 2. expand ignore-case in char class.
3678 3. set memory status bit flags. (reg->mem_stats)
3679 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3680 5. find invalid patterns in look-behind.
3681 6. expand repeated string.
3682 */
3683 static int
setup_tree(Node * node,regex_t * reg,int state,ScanEnv * env)3684 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3685 {
3686 int type;
3687 int r = 0;
3688
3689 type = NTYPE(node);
3690 switch (type) {
3691 case NT_LIST:
3692 {
3693 Node* prev = NULL_NODE;
3694 do {
3695 r = setup_tree(NCAR(node), reg, state, env);
3696 if (IS_NOT_NULL(prev) && r == 0) {
3697 r = next_setup(prev, NCAR(node), reg);
3698 }
3699 prev = NCAR(node);
3700 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3701 }
3702 break;
3703
3704 case NT_ALT:
3705 do {
3706 r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3707 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3708 break;
3709
3710 case NT_CCLASS:
3711 break;
3712
3713 case NT_STR:
3714 if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3715 r = expand_case_fold_string(node, reg);
3716 }
3717 break;
3718
3719 case NT_CTYPE:
3720 case NT_CANY:
3721 break;
3722
3723 #ifdef USE_SUBEXP_CALL
3724 case NT_CALL:
3725 break;
3726 #endif
3727
3728 case NT_BREF:
3729 {
3730 int i;
3731 int* p;
3732 Node** nodes = SCANENV_MEM_NODES(env);
3733 BRefNode* br = NBREF(node);
3734 p = BACKREFS_P(br);
3735 for (i = 0; i < br->back_num; i++) {
3736 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3737 BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3738 BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3739 #ifdef USE_BACKREF_WITH_LEVEL
3740 if (IS_BACKREF_NEST_LEVEL(br)) {
3741 BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3742 }
3743 #endif
3744 SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3745 }
3746 }
3747 break;
3748
3749 case NT_QTFR:
3750 {
3751 OnigDistance d;
3752 QtfrNode* qn = NQTFR(node);
3753 Node* target = qn->target;
3754
3755 if ((state & IN_REPEAT) != 0) {
3756 qn->state |= NST_IN_REPEAT;
3757 }
3758
3759 if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3760 r = get_min_match_length(target, &d, env);
3761 if (r) break;
3762 if (d == 0) {
3763 qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3764 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3765 r = quantifiers_memory_node_info(target);
3766 if (r < 0) break;
3767 if (r > 0) {
3768 qn->target_empty_info = r;
3769 }
3770 #endif
3771 #if 0
3772 r = get_max_match_length(target, &d, env);
3773 if (r == 0 && d == 0) {
3774 /* ()* ==> ()?, ()+ ==> () */
3775 qn->upper = 1;
3776 if (qn->lower > 1) qn->lower = 1;
3777 if (NTYPE(target) == NT_STR) {
3778 qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
3779 }
3780 }
3781 #endif
3782 }
3783 }
3784
3785 state |= IN_REPEAT;
3786 if (qn->lower != qn->upper)
3787 state |= IN_VAR_REPEAT;
3788 r = setup_tree(target, reg, state, env);
3789 if (r) break;
3790
3791 /* expand string */
3792 #define EXPAND_STRING_MAX_LENGTH 100
3793 if (NTYPE(target) == NT_STR) {
3794 if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
3795 qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3796 int len = NSTRING_LEN(target);
3797 StrNode* sn = NSTR(target);
3798
3799 if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3800 int i, n = qn->lower;
3801 onig_node_conv_to_str_node(node, NSTR(target)->flag);
3802 for (i = 0; i < n; i++) {
3803 r = onig_node_str_cat(node, sn->s, sn->end);
3804 if (r) break;
3805 }
3806 onig_node_free(target);
3807 break; /* break case NT_QTFR: */
3808 }
3809 }
3810 }
3811
3812 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
3813 if (qn->greedy && (qn->target_empty_info != 0)) {
3814 if (NTYPE(target) == NT_QTFR) {
3815 QtfrNode* tqn = NQTFR(target);
3816 if (IS_NOT_NULL(tqn->head_exact)) {
3817 qn->head_exact = tqn->head_exact;
3818 tqn->head_exact = NULL;
3819 }
3820 }
3821 else {
3822 qn->head_exact = get_head_value_node(qn->target, 1, reg);
3823 }
3824 }
3825 #endif
3826 }
3827 break;
3828
3829 case NT_ENCLOSE:
3830 {
3831 EncloseNode* en = NENCLOSE(node);
3832
3833 switch (en->type) {
3834 case ENCLOSE_OPTION:
3835 {
3836 OnigOptionType options = reg->options;
3837 reg->options = NENCLOSE(node)->option;
3838 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
3839 reg->options = options;
3840 }
3841 break;
3842
3843 case ENCLOSE_MEMORY:
3844 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
3845 BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
3846 /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
3847 }
3848 r = setup_tree(en->target, reg, state, env);
3849 break;
3850
3851 case ENCLOSE_STOP_BACKTRACK:
3852 {
3853 Node* target = en->target;
3854 r = setup_tree(target, reg, state, env);
3855 if (NTYPE(target) == NT_QTFR) {
3856 QtfrNode* tqn = NQTFR(target);
3857 if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
3858 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
3859 int qtype = NTYPE(tqn->target);
3860 if (IS_NODE_TYPE_SIMPLE(qtype))
3861 SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
3862 }
3863 }
3864 }
3865 break;
3866 }
3867 }
3868 break;
3869
3870 case NT_ANCHOR:
3871 {
3872 AnchorNode* an = NANCHOR(node);
3873
3874 switch (an->type) {
3875 case ANCHOR_PREC_READ:
3876 r = setup_tree(an->target, reg, state, env);
3877 break;
3878 case ANCHOR_PREC_READ_NOT:
3879 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3880 break;
3881
3882 /* allowed node types in look-behind */
3883 #define ALLOWED_TYPE_IN_LB \
3884 ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
3885 BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
3886
3887 #define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY )
3888 #define ALLOWED_ENCLOSE_IN_LB_NOT 0
3889
3890 #define ALLOWED_ANCHOR_IN_LB \
3891 ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3892 #define ALLOWED_ANCHOR_IN_LB_NOT \
3893 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3894
3895 case ANCHOR_LOOK_BEHIND:
3896 {
3897 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3898 ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
3899 if (r < 0) return r;
3900 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3901 r = setup_look_behind(node, reg, env);
3902 if (r != 0) return r;
3903 r = setup_tree(an->target, reg, state, env);
3904 }
3905 break;
3906
3907 case ANCHOR_LOOK_BEHIND_NOT:
3908 {
3909 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3910 ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
3911 if (r < 0) return r;
3912 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3913 r = setup_look_behind(node, reg, env);
3914 if (r != 0) return r;
3915 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3916 }
3917 break;
3918 }
3919 }
3920 break;
3921
3922 default:
3923 break;
3924 }
3925
3926 return r;
3927 }
3928
3929 /* set skip map for Boyer-Moor search */
3930 static int
set_bm_skip(UChar * s,UChar * end,OnigEncoding enc ARG_UNUSED,UChar skip[],int ** int_skip)3931 set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
3932 UChar skip[], int** int_skip)
3933 {
3934 int i, len;
3935
3936 len = end - s;
3937 if (len < ONIG_CHAR_TABLE_SIZE) {
3938 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
3939
3940 for (i = 0; i < len - 1; i++)
3941 skip[s[i]] = len - 1 - i;
3942 }
3943 else {
3944 if (IS_NULL(*int_skip)) {
3945 *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
3946 if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
3947 }
3948 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
3949
3950 for (i = 0; i < len - 1; i++)
3951 (*int_skip)[s[i]] = len - 1 - i;
3952 }
3953 return 0;
3954 }
3955
3956 #define OPT_EXACT_MAXLEN 24
3957
3958 typedef struct {
3959 OnigDistance min; /* min byte length */
3960 OnigDistance max; /* max byte length */
3961 } MinMaxLen;
3962
3963 typedef struct {
3964 MinMaxLen mmd;
3965 OnigEncoding enc;
3966 OnigOptionType options;
3967 OnigCaseFoldType case_fold_flag;
3968 ScanEnv* scan_env;
3969 } OptEnv;
3970
3971 typedef struct {
3972 int left_anchor;
3973 int right_anchor;
3974 } OptAncInfo;
3975
3976 typedef struct {
3977 MinMaxLen mmd; /* info position */
3978 OptAncInfo anc;
3979
3980 int reach_end;
3981 int ignore_case;
3982 int len;
3983 UChar s[OPT_EXACT_MAXLEN];
3984 } OptExactInfo;
3985
3986 typedef struct {
3987 MinMaxLen mmd; /* info position */
3988 OptAncInfo anc;
3989
3990 int value; /* weighted value */
3991 UChar map[ONIG_CHAR_TABLE_SIZE];
3992 } OptMapInfo;
3993
3994 typedef struct {
3995 MinMaxLen len;
3996
3997 OptAncInfo anc;
3998 OptExactInfo exb; /* boundary */
3999 OptExactInfo exm; /* middle */
4000 OptExactInfo expr; /* prec read (?=...) */
4001
4002 OptMapInfo map; /* boundary */
4003 } NodeOptInfo;
4004
4005
4006 static int
map_position_value(OnigEncoding enc,int i)4007 map_position_value(OnigEncoding enc, int i)
4008 {
4009 static const short int ByteValTable[] = {
4010 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
4011 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4012 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
4013 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
4014 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4015 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
4016 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4017 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
4018 };
4019
4020 if (i < (int )(sizeof(ByteValTable)/sizeof(ByteValTable[0]))) {
4021 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
4022 return 20;
4023 else
4024 return (int )ByteValTable[i];
4025 }
4026 else
4027 return 4; /* Take it easy. */
4028 }
4029
4030 static int
distance_value(MinMaxLen * mm)4031 distance_value(MinMaxLen* mm)
4032 {
4033 /* 1000 / (min-max-dist + 1) */
4034 static const short int dist_vals[] = {
4035 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4036 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4037 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4038 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4039 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4040 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4041 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4042 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4043 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4044 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4045 };
4046
4047 int d;
4048
4049 if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4050
4051 d = mm->max - mm->min;
4052 if (d < (int )(sizeof(dist_vals)/sizeof(dist_vals[0])))
4053 /* return dist_vals[d] * 16 / (mm->min + 12); */
4054 return (int )dist_vals[d];
4055 else
4056 return 1;
4057 }
4058
4059 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)4060 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4061 {
4062 if (v2 <= 0) return -1;
4063 if (v1 <= 0) return 1;
4064
4065 v1 *= distance_value(d1);
4066 v2 *= distance_value(d2);
4067
4068 if (v2 > v1) return 1;
4069 if (v2 < v1) return -1;
4070
4071 if (d2->min < d1->min) return 1;
4072 if (d2->min > d1->min) return -1;
4073 return 0;
4074 }
4075
4076 static int
is_equal_mml(MinMaxLen * a,MinMaxLen * b)4077 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4078 {
4079 return (a->min == b->min && a->max == b->max) ? 1 : 0;
4080 }
4081
4082
4083 static void
set_mml(MinMaxLen * mml,OnigDistance min,OnigDistance max)4084 set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
4085 {
4086 mml->min = min;
4087 mml->max = max;
4088 }
4089
4090 static void
clear_mml(MinMaxLen * mml)4091 clear_mml(MinMaxLen* mml)
4092 {
4093 mml->min = mml->max = 0;
4094 }
4095
4096 static void
copy_mml(MinMaxLen * to,MinMaxLen * from)4097 copy_mml(MinMaxLen* to, MinMaxLen* from)
4098 {
4099 to->min = from->min;
4100 to->max = from->max;
4101 }
4102
4103 static void
add_mml(MinMaxLen * to,MinMaxLen * from)4104 add_mml(MinMaxLen* to, MinMaxLen* from)
4105 {
4106 to->min = distance_add(to->min, from->min);
4107 to->max = distance_add(to->max, from->max);
4108 }
4109
4110 #if 0
4111 static void
4112 add_len_mml(MinMaxLen* to, OnigDistance len)
4113 {
4114 to->min = distance_add(to->min, len);
4115 to->max = distance_add(to->max, len);
4116 }
4117 #endif
4118
4119 static void
alt_merge_mml(MinMaxLen * to,MinMaxLen * from)4120 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4121 {
4122 if (to->min > from->min) to->min = from->min;
4123 if (to->max < from->max) to->max = from->max;
4124 }
4125
4126 static void
copy_opt_env(OptEnv * to,OptEnv * from)4127 copy_opt_env(OptEnv* to, OptEnv* from)
4128 {
4129 *to = *from;
4130 }
4131
4132 static void
clear_opt_anc_info(OptAncInfo * anc)4133 clear_opt_anc_info(OptAncInfo* anc)
4134 {
4135 anc->left_anchor = 0;
4136 anc->right_anchor = 0;
4137 }
4138
4139 static void
copy_opt_anc_info(OptAncInfo * to,OptAncInfo * from)4140 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4141 {
4142 *to = *from;
4143 }
4144
4145 static void
concat_opt_anc_info(OptAncInfo * to,OptAncInfo * left,OptAncInfo * right,OnigDistance left_len,OnigDistance right_len)4146 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4147 OnigDistance left_len, OnigDistance right_len)
4148 {
4149 clear_opt_anc_info(to);
4150
4151 to->left_anchor = left->left_anchor;
4152 if (left_len == 0) {
4153 to->left_anchor |= right->left_anchor;
4154 }
4155
4156 to->right_anchor = right->right_anchor;
4157 if (right_len == 0) {
4158 to->right_anchor |= left->right_anchor;
4159 }
4160 }
4161
4162 static int
is_left_anchor(int anc)4163 is_left_anchor(int anc)
4164 {
4165 if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4166 anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4167 anc == ANCHOR_PREC_READ_NOT)
4168 return 0;
4169
4170 return 1;
4171 }
4172
4173 static int
is_set_opt_anc_info(OptAncInfo * to,int anc)4174 is_set_opt_anc_info(OptAncInfo* to, int anc)
4175 {
4176 if ((to->left_anchor & anc) != 0) return 1;
4177
4178 return ((to->right_anchor & anc) != 0 ? 1 : 0);
4179 }
4180
4181 static void
add_opt_anc_info(OptAncInfo * to,int anc)4182 add_opt_anc_info(OptAncInfo* to, int anc)
4183 {
4184 if (is_left_anchor(anc))
4185 to->left_anchor |= anc;
4186 else
4187 to->right_anchor |= anc;
4188 }
4189
4190 static void
remove_opt_anc_info(OptAncInfo * to,int anc)4191 remove_opt_anc_info(OptAncInfo* to, int anc)
4192 {
4193 if (is_left_anchor(anc))
4194 to->left_anchor &= ~anc;
4195 else
4196 to->right_anchor &= ~anc;
4197 }
4198
4199 static void
alt_merge_opt_anc_info(OptAncInfo * to,OptAncInfo * add)4200 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4201 {
4202 to->left_anchor &= add->left_anchor;
4203 to->right_anchor &= add->right_anchor;
4204 }
4205
4206 static int
is_full_opt_exact_info(OptExactInfo * ex)4207 is_full_opt_exact_info(OptExactInfo* ex)
4208 {
4209 return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4210 }
4211
4212 static void
clear_opt_exact_info(OptExactInfo * ex)4213 clear_opt_exact_info(OptExactInfo* ex)
4214 {
4215 clear_mml(&ex->mmd);
4216 clear_opt_anc_info(&ex->anc);
4217 ex->reach_end = 0;
4218 ex->ignore_case = 0;
4219 ex->len = 0;
4220 ex->s[0] = '\0';
4221 }
4222
4223 static void
copy_opt_exact_info(OptExactInfo * to,OptExactInfo * from)4224 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4225 {
4226 *to = *from;
4227 }
4228
4229 static void
concat_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OnigEncoding enc)4230 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4231 {
4232 int i, j, len;
4233 UChar *p, *end;
4234 OptAncInfo tanc;
4235
4236 if (! to->ignore_case && add->ignore_case) {
4237 if (to->len >= add->len) return ; /* avoid */
4238
4239 to->ignore_case = 1;
4240 }
4241
4242 p = add->s;
4243 end = p + add->len;
4244 for (i = to->len; p < end; ) {
4245 len = enclen(enc, p);
4246 if (i + len > OPT_EXACT_MAXLEN) break;
4247 for (j = 0; j < len && p < end; j++)
4248 to->s[i++] = *p++;
4249 }
4250
4251 to->len = i;
4252 to->reach_end = (p == end ? add->reach_end : 0);
4253
4254 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4255 if (! to->reach_end) tanc.right_anchor = 0;
4256 copy_opt_anc_info(&to->anc, &tanc);
4257 }
4258
4259 static void
concat_opt_exact_info_str(OptExactInfo * to,UChar * s,UChar * end,int raw ARG_UNUSED,OnigEncoding enc)4260 concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4261 int raw ARG_UNUSED, OnigEncoding enc)
4262 {
4263 int i, j, len;
4264 UChar *p;
4265
4266 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4267 len = enclen(enc, p);
4268 if (i + len > OPT_EXACT_MAXLEN) break;
4269 for (j = 0; j < len && p < end; j++)
4270 to->s[i++] = *p++;
4271 }
4272
4273 to->len = i;
4274 }
4275
4276 static void
alt_merge_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OptEnv * env)4277 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4278 {
4279 int i, j, len;
4280
4281 if (add->len == 0 || to->len == 0) {
4282 clear_opt_exact_info(to);
4283 return ;
4284 }
4285
4286 if (! is_equal_mml(&to->mmd, &add->mmd)) {
4287 clear_opt_exact_info(to);
4288 return ;
4289 }
4290
4291 for (i = 0; i < to->len && i < add->len; ) {
4292 if (to->s[i] != add->s[i]) break;
4293 len = enclen(env->enc, to->s + i);
4294
4295 for (j = 1; j < len; j++) {
4296 if (to->s[i+j] != add->s[i+j]) break;
4297 }
4298 if (j < len) break;
4299 i += len;
4300 }
4301
4302 if (! add->reach_end || i < add->len || i < to->len) {
4303 to->reach_end = 0;
4304 }
4305 to->len = i;
4306 to->ignore_case |= add->ignore_case;
4307
4308 alt_merge_opt_anc_info(&to->anc, &add->anc);
4309 if (! to->reach_end) to->anc.right_anchor = 0;
4310 }
4311
4312 static void
select_opt_exact_info(OnigEncoding enc,OptExactInfo * now,OptExactInfo * alt)4313 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4314 {
4315 int v1, v2;
4316
4317 v1 = now->len;
4318 v2 = alt->len;
4319
4320 if (v2 == 0) {
4321 return ;
4322 }
4323 else if (v1 == 0) {
4324 copy_opt_exact_info(now, alt);
4325 return ;
4326 }
4327 else if (v1 <= 2 && v2 <= 2) {
4328 /* ByteValTable[x] is big value --> low price */
4329 v2 = map_position_value(enc, now->s[0]);
4330 v1 = map_position_value(enc, alt->s[0]);
4331
4332 if (now->len > 1) v1 += 5;
4333 if (alt->len > 1) v2 += 5;
4334 }
4335
4336 if (now->ignore_case == 0) v1 *= 2;
4337 if (alt->ignore_case == 0) v2 *= 2;
4338
4339 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4340 copy_opt_exact_info(now, alt);
4341 }
4342
4343 static void
clear_opt_map_info(OptMapInfo * map)4344 clear_opt_map_info(OptMapInfo* map)
4345 {
4346 static const OptMapInfo clean_info = {
4347 {0, 0}, {0, 0}, 0,
4348 {
4349 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4350 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4351 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4352 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4353 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4354 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4355 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4356 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4357 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4358 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4359 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4360 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4361 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4362 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4363 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4364 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4365 }
4366 };
4367
4368 xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4369 }
4370
4371 static void
copy_opt_map_info(OptMapInfo * to,OptMapInfo * from)4372 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4373 {
4374 *to = *from;
4375 }
4376
4377 static void
add_char_opt_map_info(OptMapInfo * map,UChar c,OnigEncoding enc)4378 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4379 {
4380 if (map->map[c] == 0) {
4381 map->map[c] = 1;
4382 map->value += map_position_value(enc, c);
4383 }
4384 }
4385
4386 static int
add_char_amb_opt_map_info(OptMapInfo * map,UChar * p,UChar * end,OnigEncoding enc,OnigCaseFoldType case_fold_flag)4387 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4388 OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4389 {
4390 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4391 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4392 int i, n;
4393
4394 add_char_opt_map_info(map, p[0], enc);
4395
4396 case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4397 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4398 if (n < 0) return n;
4399
4400 for (i = 0; i < n; i++) {
4401 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4402 add_char_opt_map_info(map, buf[0], enc);
4403 }
4404
4405 return 0;
4406 }
4407
4408 static void
select_opt_map_info(OptMapInfo * now,OptMapInfo * alt)4409 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4410 {
4411 static int z = 1<<15; /* 32768: something big value */
4412
4413 int v1, v2;
4414
4415 if (alt->value == 0) return ;
4416 if (now->value == 0) {
4417 copy_opt_map_info(now, alt);
4418 return ;
4419 }
4420
4421 v1 = z / now->value;
4422 v2 = z / alt->value;
4423 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4424 copy_opt_map_info(now, alt);
4425 }
4426
4427 static int
comp_opt_exact_or_map_info(OptExactInfo * e,OptMapInfo * m)4428 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4429 {
4430 #define COMP_EM_BASE 20
4431 int ve, vm;
4432
4433 if (m->value <= 0) return -1;
4434
4435 ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
4436 vm = COMP_EM_BASE * 5 * 2 / m->value;
4437 return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4438 }
4439
4440 static void
alt_merge_opt_map_info(OnigEncoding enc,OptMapInfo * to,OptMapInfo * add)4441 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4442 {
4443 int i, val;
4444
4445 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4446 if (to->value == 0) return ;
4447 if (add->value == 0 || to->mmd.max < add->mmd.min) {
4448 clear_opt_map_info(to);
4449 return ;
4450 }
4451
4452 alt_merge_mml(&to->mmd, &add->mmd);
4453
4454 val = 0;
4455 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4456 if (add->map[i])
4457 to->map[i] = 1;
4458
4459 if (to->map[i])
4460 val += map_position_value(enc, i);
4461 }
4462 to->value = val;
4463
4464 alt_merge_opt_anc_info(&to->anc, &add->anc);
4465 }
4466
4467 static void
set_bound_node_opt_info(NodeOptInfo * opt,MinMaxLen * mmd)4468 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4469 {
4470 copy_mml(&(opt->exb.mmd), mmd);
4471 copy_mml(&(opt->expr.mmd), mmd);
4472 copy_mml(&(opt->map.mmd), mmd);
4473 }
4474
4475 static void
clear_node_opt_info(NodeOptInfo * opt)4476 clear_node_opt_info(NodeOptInfo* opt)
4477 {
4478 clear_mml(&opt->len);
4479 clear_opt_anc_info(&opt->anc);
4480 clear_opt_exact_info(&opt->exb);
4481 clear_opt_exact_info(&opt->exm);
4482 clear_opt_exact_info(&opt->expr);
4483 clear_opt_map_info(&opt->map);
4484 }
4485
4486 static void
copy_node_opt_info(NodeOptInfo * to,NodeOptInfo * from)4487 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4488 {
4489 *to = *from;
4490 }
4491
4492 static void
concat_left_node_opt_info(OnigEncoding enc,NodeOptInfo * to,NodeOptInfo * add)4493 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4494 {
4495 int exb_reach, exm_reach;
4496 OptAncInfo tanc;
4497
4498 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4499 copy_opt_anc_info(&to->anc, &tanc);
4500
4501 if (add->exb.len > 0 && to->len.max == 0) {
4502 concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4503 to->len.max, add->len.max);
4504 copy_opt_anc_info(&add->exb.anc, &tanc);
4505 }
4506
4507 if (add->map.value > 0 && to->len.max == 0) {
4508 if (add->map.mmd.max == 0)
4509 add->map.anc.left_anchor |= to->anc.left_anchor;
4510 }
4511
4512 exb_reach = to->exb.reach_end;
4513 exm_reach = to->exm.reach_end;
4514
4515 if (add->len.max != 0)
4516 to->exb.reach_end = to->exm.reach_end = 0;
4517
4518 if (add->exb.len > 0) {
4519 if (exb_reach) {
4520 concat_opt_exact_info(&to->exb, &add->exb, enc);
4521 clear_opt_exact_info(&add->exb);
4522 }
4523 else if (exm_reach) {
4524 concat_opt_exact_info(&to->exm, &add->exb, enc);
4525 clear_opt_exact_info(&add->exb);
4526 }
4527 }
4528 select_opt_exact_info(enc, &to->exm, &add->exb);
4529 select_opt_exact_info(enc, &to->exm, &add->exm);
4530
4531 if (to->expr.len > 0) {
4532 if (add->len.max > 0) {
4533 if (to->expr.len > (int )add->len.max)
4534 to->expr.len = add->len.max;
4535
4536 if (to->expr.mmd.max == 0)
4537 select_opt_exact_info(enc, &to->exb, &to->expr);
4538 else
4539 select_opt_exact_info(enc, &to->exm, &to->expr);
4540 }
4541 }
4542 else if (add->expr.len > 0) {
4543 copy_opt_exact_info(&to->expr, &add->expr);
4544 }
4545
4546 select_opt_map_info(&to->map, &add->map);
4547
4548 add_mml(&to->len, &add->len);
4549 }
4550
4551 static void
alt_merge_node_opt_info(NodeOptInfo * to,NodeOptInfo * add,OptEnv * env)4552 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4553 {
4554 alt_merge_opt_anc_info (&to->anc, &add->anc);
4555 alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4556 alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4557 alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4558 alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4559
4560 alt_merge_mml(&to->len, &add->len);
4561 }
4562
4563
4564 #define MAX_NODE_OPT_INFO_REF_COUNT 5
4565
4566 static int
optimize_node_left(Node * node,NodeOptInfo * opt,OptEnv * env)4567 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4568 {
4569 int type;
4570 int r = 0;
4571
4572 clear_node_opt_info(opt);
4573 set_bound_node_opt_info(opt, &env->mmd);
4574
4575 type = NTYPE(node);
4576 switch (type) {
4577 case NT_LIST:
4578 {
4579 OptEnv nenv;
4580 NodeOptInfo nopt;
4581 Node* nd = node;
4582
4583 copy_opt_env(&nenv, env);
4584 do {
4585 r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4586 if (r == 0) {
4587 add_mml(&nenv.mmd, &nopt.len);
4588 concat_left_node_opt_info(env->enc, opt, &nopt);
4589 }
4590 } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
4591 }
4592 break;
4593
4594 case NT_ALT:
4595 {
4596 NodeOptInfo nopt;
4597 Node* nd = node;
4598
4599 do {
4600 r = optimize_node_left(NCAR(nd), &nopt, env);
4601 if (r == 0) {
4602 if (nd == node) copy_node_opt_info(opt, &nopt);
4603 else alt_merge_node_opt_info(opt, &nopt, env);
4604 }
4605 } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
4606 }
4607 break;
4608
4609 case NT_STR:
4610 {
4611 StrNode* sn = NSTR(node);
4612 int slen = sn->end - sn->s;
4613 int is_raw = NSTRING_IS_RAW(node);
4614
4615 if (! NSTRING_IS_AMBIG(node)) {
4616 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4617 NSTRING_IS_RAW(node), env->enc);
4618 if (slen > 0) {
4619 add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4620 }
4621 set_mml(&opt->len, slen, slen);
4622 }
4623 else {
4624 int max;
4625
4626 if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
4627 int n = onigenc_strlen(env->enc, sn->s, sn->end);
4628 max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
4629 }
4630 else {
4631 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4632 is_raw, env->enc);
4633 opt->exb.ignore_case = 1;
4634
4635 if (slen > 0) {
4636 r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4637 env->enc, env->case_fold_flag);
4638 if (r != 0) break;
4639 }
4640
4641 max = slen;
4642 }
4643
4644 set_mml(&opt->len, slen, max);
4645 }
4646
4647 if (opt->exb.len == slen)
4648 opt->exb.reach_end = 1;
4649 }
4650 break;
4651
4652 case NT_CCLASS:
4653 {
4654 int i, z;
4655 CClassNode* cc = NCCLASS(node);
4656
4657 /* no need to check ignore case. (setted in setup_tree()) */
4658
4659 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
4660 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4661 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4662
4663 set_mml(&opt->len, min, max);
4664 }
4665 else {
4666 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4667 z = BITSET_AT(cc->bs, i);
4668 if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
4669 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4670 }
4671 }
4672 set_mml(&opt->len, 1, 1);
4673 }
4674 }
4675 break;
4676
4677 case NT_CTYPE:
4678 {
4679 int i, min, max;
4680
4681 max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4682
4683 if (max == 1) {
4684 min = 1;
4685
4686 switch (NCTYPE(node)->ctype) {
4687 case ONIGENC_CTYPE_WORD:
4688 if (NCTYPE(node)->not != 0) {
4689 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4690 if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
4691 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4692 }
4693 }
4694 }
4695 else {
4696 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4697 if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
4698 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4699 }
4700 }
4701 }
4702 break;
4703 }
4704 }
4705 else {
4706 min = ONIGENC_MBC_MINLEN(env->enc);
4707 }
4708 set_mml(&opt->len, min, max);
4709 }
4710 break;
4711
4712 case NT_CANY:
4713 {
4714 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4715 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4716 set_mml(&opt->len, min, max);
4717 }
4718 break;
4719
4720 case NT_ANCHOR:
4721 switch (NANCHOR(node)->type) {
4722 case ANCHOR_BEGIN_BUF:
4723 case ANCHOR_BEGIN_POSITION:
4724 case ANCHOR_BEGIN_LINE:
4725 case ANCHOR_END_BUF:
4726 case ANCHOR_SEMI_END_BUF:
4727 case ANCHOR_END_LINE:
4728 add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
4729 break;
4730
4731 case ANCHOR_PREC_READ:
4732 {
4733 NodeOptInfo nopt;
4734
4735 r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
4736 if (r == 0) {
4737 if (nopt.exb.len > 0)
4738 copy_opt_exact_info(&opt->expr, &nopt.exb);
4739 else if (nopt.exm.len > 0)
4740 copy_opt_exact_info(&opt->expr, &nopt.exm);
4741
4742 opt->expr.reach_end = 0;
4743
4744 if (nopt.map.value > 0)
4745 copy_opt_map_info(&opt->map, &nopt.map);
4746 }
4747 }
4748 break;
4749
4750 case ANCHOR_PREC_READ_NOT:
4751 case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */
4752 case ANCHOR_LOOK_BEHIND_NOT:
4753 break;
4754 }
4755 break;
4756
4757 case NT_BREF:
4758 {
4759 int i;
4760 int* backs;
4761 OnigDistance min, max, tmin, tmax;
4762 Node** nodes = SCANENV_MEM_NODES(env->scan_env);
4763 BRefNode* br = NBREF(node);
4764
4765 if (br->state & NST_RECURSION) {
4766 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4767 break;
4768 }
4769 backs = BACKREFS_P(br);
4770 r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
4771 if (r != 0) break;
4772 r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
4773 if (r != 0) break;
4774 for (i = 1; i < br->back_num; i++) {
4775 r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
4776 if (r != 0) break;
4777 r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
4778 if (r != 0) break;
4779 if (min > tmin) min = tmin;
4780 if (max < tmax) max = tmax;
4781 }
4782 if (r == 0) set_mml(&opt->len, min, max);
4783 }
4784 break;
4785
4786 #ifdef USE_SUBEXP_CALL
4787 case NT_CALL:
4788 if (IS_CALL_RECURSION(NCALL(node)))
4789 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4790 else {
4791 OnigOptionType save = env->options;
4792 env->options = NENCLOSE(NCALL(node)->target)->option;
4793 r = optimize_node_left(NCALL(node)->target, opt, env);
4794 env->options = save;
4795 }
4796 break;
4797 #endif
4798
4799 case NT_QTFR:
4800 {
4801 int i;
4802 OnigDistance min, max;
4803 NodeOptInfo nopt;
4804 QtfrNode* qn = NQTFR(node);
4805
4806 r = optimize_node_left(qn->target, &nopt, env);
4807 if (r) break;
4808
4809 if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
4810 if (env->mmd.max == 0 &&
4811 NTYPE(qn->target) == NT_CANY && qn->greedy) {
4812 if (IS_MULTILINE(env->options))
4813 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
4814 else
4815 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
4816 }
4817 }
4818 else {
4819 if (qn->lower > 0) {
4820 copy_node_opt_info(opt, &nopt);
4821 if (nopt.exb.len > 0) {
4822 if (nopt.exb.reach_end) {
4823 for (i = 2; i <= qn->lower &&
4824 ! is_full_opt_exact_info(&opt->exb); i++) {
4825 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
4826 }
4827 if (i < qn->lower) {
4828 opt->exb.reach_end = 0;
4829 }
4830 }
4831 }
4832
4833 if (qn->lower != qn->upper) {
4834 opt->exb.reach_end = 0;
4835 opt->exm.reach_end = 0;
4836 }
4837 if (qn->lower > 1)
4838 opt->exm.reach_end = 0;
4839 }
4840 }
4841
4842 min = distance_multiply(nopt.len.min, qn->lower);
4843 if (IS_REPEAT_INFINITE(qn->upper))
4844 max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
4845 else
4846 max = distance_multiply(nopt.len.max, qn->upper);
4847
4848 set_mml(&opt->len, min, max);
4849 }
4850 break;
4851
4852 case NT_ENCLOSE:
4853 {
4854 EncloseNode* en = NENCLOSE(node);
4855
4856 switch (en->type) {
4857 case ENCLOSE_OPTION:
4858 {
4859 OnigOptionType save = env->options;
4860
4861 env->options = en->option;
4862 r = optimize_node_left(en->target, opt, env);
4863 env->options = save;
4864 }
4865 break;
4866
4867 case ENCLOSE_MEMORY:
4868 #ifdef USE_SUBEXP_CALL
4869 en->opt_count++;
4870 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
4871 OnigDistance min, max;
4872
4873 min = 0;
4874 max = ONIG_INFINITE_DISTANCE;
4875 if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
4876 if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
4877 set_mml(&opt->len, min, max);
4878 }
4879 else
4880 #endif
4881 {
4882 r = optimize_node_left(en->target, opt, env);
4883
4884 if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
4885 if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
4886 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
4887 }
4888 }
4889 break;
4890
4891 case ENCLOSE_STOP_BACKTRACK:
4892 r = optimize_node_left(en->target, opt, env);
4893 break;
4894 }
4895 }
4896 break;
4897
4898 default:
4899 #ifdef ONIG_DEBUG
4900 fprintf(stderr, "optimize_node_left: undefined node type %d\n",
4901 NTYPE(node));
4902 #endif
4903 r = ONIGERR_TYPE_BUG;
4904 break;
4905 }
4906
4907 return r;
4908 }
4909
4910 static int
set_optimize_exact_info(regex_t * reg,OptExactInfo * e)4911 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
4912 {
4913 int r;
4914
4915 if (e->len == 0) return 0;
4916
4917 if (e->ignore_case) {
4918 reg->exact = (UChar* )xmalloc(e->len);
4919 CHECK_NULL_RETURN_MEMERR(reg->exact);
4920 xmemcpy(reg->exact, e->s, e->len);
4921 reg->exact_end = reg->exact + e->len;
4922 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
4923 }
4924 else {
4925 int allow_reverse;
4926
4927 reg->exact = str_dup(e->s, e->s + e->len);
4928 CHECK_NULL_RETURN_MEMERR(reg->exact);
4929 reg->exact_end = reg->exact + e->len;
4930
4931 allow_reverse =
4932 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
4933
4934 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
4935 r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
4936 reg->map, &(reg->int_map));
4937 if (r) return r;
4938
4939 reg->optimize = (allow_reverse != 0
4940 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
4941 }
4942 else {
4943 reg->optimize = ONIG_OPTIMIZE_EXACT;
4944 }
4945 }
4946
4947 reg->dmin = e->mmd.min;
4948 reg->dmax = e->mmd.max;
4949
4950 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4951 reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact);
4952 }
4953
4954 return 0;
4955 }
4956
4957 static void
set_optimize_map_info(regex_t * reg,OptMapInfo * m)4958 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
4959 {
4960 int i;
4961
4962 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4963 reg->map[i] = m->map[i];
4964
4965 reg->optimize = ONIG_OPTIMIZE_MAP;
4966 reg->dmin = m->mmd.min;
4967 reg->dmax = m->mmd.max;
4968
4969 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4970 reg->threshold_len = reg->dmin + 1;
4971 }
4972 }
4973
4974 static void
set_sub_anchor(regex_t * reg,OptAncInfo * anc)4975 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
4976 {
4977 reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
4978 reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
4979 }
4980
4981 #ifdef ONIG_DEBUG
4982 static void print_optimize_info(FILE* f, regex_t* reg);
4983 #endif
4984
4985 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)4986 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
4987 {
4988
4989 int r;
4990 NodeOptInfo opt;
4991 OptEnv env;
4992
4993 env.enc = reg->enc;
4994 env.options = reg->options;
4995 env.case_fold_flag = reg->case_fold_flag;
4996 env.scan_env = scan_env;
4997 clear_mml(&env.mmd);
4998
4999 r = optimize_node_left(node, &opt, &env);
5000 if (r) return r;
5001
5002 reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
5003 ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML);
5004
5005 reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF);
5006
5007 if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
5008 reg->anchor_dmin = opt.len.min;
5009 reg->anchor_dmax = opt.len.max;
5010 }
5011
5012 if (opt.exb.len > 0 || opt.exm.len > 0) {
5013 select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
5014 if (opt.map.value > 0 &&
5015 comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
5016 goto set_map;
5017 }
5018 else {
5019 r = set_optimize_exact_info(reg, &opt.exb);
5020 set_sub_anchor(reg, &opt.exb.anc);
5021 }
5022 }
5023 else if (opt.map.value > 0) {
5024 set_map:
5025 set_optimize_map_info(reg, &opt.map);
5026 set_sub_anchor(reg, &opt.map.anc);
5027 }
5028 else {
5029 reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5030 if (opt.len.max == 0)
5031 reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5032 }
5033
5034 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5035 print_optimize_info(stderr, reg);
5036 #endif
5037 return r;
5038 }
5039
5040 static void
clear_optimize_info(regex_t * reg)5041 clear_optimize_info(regex_t* reg)
5042 {
5043 reg->optimize = ONIG_OPTIMIZE_NONE;
5044 reg->anchor = 0;
5045 reg->anchor_dmin = 0;
5046 reg->anchor_dmax = 0;
5047 reg->sub_anchor = 0;
5048 reg->exact_end = (UChar* )NULL;
5049 reg->threshold_len = 0;
5050 if (IS_NOT_NULL(reg->exact)) {
5051 xfree(reg->exact);
5052 reg->exact = (UChar* )NULL;
5053 }
5054 }
5055
5056 #ifdef ONIG_DEBUG
5057
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)5058 static void print_enc_string(FILE* fp, OnigEncoding enc,
5059 const UChar *s, const UChar *end)
5060 {
5061 fprintf(fp, "\nPATTERN: /");
5062
5063 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5064 const UChar *p;
5065 OnigCodePoint code;
5066
5067 p = s;
5068 while (p < end) {
5069 code = ONIGENC_MBC_TO_CODE(enc, p, end);
5070 if (code >= 0x80) {
5071 fprintf(fp, " 0x%04x ", (int )code);
5072 }
5073 else {
5074 fputc((int )code, fp);
5075 }
5076
5077 p += enclen(enc, p);
5078 }
5079 }
5080 else {
5081 while (s < end) {
5082 fputc((int )*s, fp);
5083 s++;
5084 }
5085 }
5086
5087 fprintf(fp, "/\n");
5088 }
5089
5090 static void
print_distance_range(FILE * f,OnigDistance a,OnigDistance b)5091 print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
5092 {
5093 if (a == ONIG_INFINITE_DISTANCE)
5094 fputs("inf", f);
5095 else
5096 fprintf(f, "(%u)", a);
5097
5098 fputs("-", f);
5099
5100 if (b == ONIG_INFINITE_DISTANCE)
5101 fputs("inf", f);
5102 else
5103 fprintf(f, "(%u)", b);
5104 }
5105
5106 static void
print_anchor(FILE * f,int anchor)5107 print_anchor(FILE* f, int anchor)
5108 {
5109 int q = 0;
5110
5111 fprintf(f, "[");
5112
5113 if (anchor & ANCHOR_BEGIN_BUF) {
5114 fprintf(f, "begin-buf");
5115 q = 1;
5116 }
5117 if (anchor & ANCHOR_BEGIN_LINE) {
5118 if (q) fprintf(f, ", ");
5119 q = 1;
5120 fprintf(f, "begin-line");
5121 }
5122 if (anchor & ANCHOR_BEGIN_POSITION) {
5123 if (q) fprintf(f, ", ");
5124 q = 1;
5125 fprintf(f, "begin-pos");
5126 }
5127 if (anchor & ANCHOR_END_BUF) {
5128 if (q) fprintf(f, ", ");
5129 q = 1;
5130 fprintf(f, "end-buf");
5131 }
5132 if (anchor & ANCHOR_SEMI_END_BUF) {
5133 if (q) fprintf(f, ", ");
5134 q = 1;
5135 fprintf(f, "semi-end-buf");
5136 }
5137 if (anchor & ANCHOR_END_LINE) {
5138 if (q) fprintf(f, ", ");
5139 q = 1;
5140 fprintf(f, "end-line");
5141 }
5142 if (anchor & ANCHOR_ANYCHAR_STAR) {
5143 if (q) fprintf(f, ", ");
5144 q = 1;
5145 fprintf(f, "anychar-star");
5146 }
5147 if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5148 if (q) fprintf(f, ", ");
5149 fprintf(f, "anychar-star-pl");
5150 }
5151
5152 fprintf(f, "]");
5153 }
5154
5155 static void
print_optimize_info(FILE * f,regex_t * reg)5156 print_optimize_info(FILE* f, regex_t* reg)
5157 {
5158 static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5159 "EXACT_IC", "MAP" };
5160
5161 fprintf(f, "optimize: %s\n", on[reg->optimize]);
5162 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
5163 if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5164 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5165 fprintf(f, "\n");
5166
5167 if (reg->optimize) {
5168 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
5169 fprintf(f, "\n");
5170 }
5171 fprintf(f, "\n");
5172
5173 if (reg->exact) {
5174 UChar *p;
5175 fprintf(f, "exact: [");
5176 for (p = reg->exact; p < reg->exact_end; p++) {
5177 fputc(*p, f);
5178 }
5179 fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
5180 }
5181 else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5182 int c, i, n = 0;
5183
5184 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5185 if (reg->map[i]) n++;
5186
5187 fprintf(f, "map: n=%d\n", n);
5188 if (n > 0) {
5189 c = 0;
5190 fputc('[', f);
5191 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5192 if (reg->map[i] != 0) {
5193 if (c > 0) fputs(", ", f);
5194 c++;
5195 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5196 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5197 fputc(i, f);
5198 else
5199 fprintf(f, "%d", i);
5200 }
5201 }
5202 fprintf(f, "]\n");
5203 }
5204 }
5205 }
5206 #endif /* ONIG_DEBUG */
5207
5208
5209 extern void
onig_free_body(regex_t * reg)5210 onig_free_body(regex_t* reg)
5211 {
5212 if (IS_NOT_NULL(reg)) {
5213 if (IS_NOT_NULL(reg->p)) xfree(reg->p);
5214 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
5215 if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map);
5216 if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
5217 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
5218 if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain);
5219
5220 #ifdef USE_NAMED_GROUP
5221 onig_names_free(reg);
5222 #endif
5223 }
5224 }
5225
5226 extern void
onig_free(regex_t * reg)5227 onig_free(regex_t* reg)
5228 {
5229 if (IS_NOT_NULL(reg)) {
5230 onig_free_body(reg);
5231 xfree(reg);
5232 }
5233 }
5234
5235 #define REGEX_TRANSFER(to,from) do {\
5236 (to)->state = ONIG_STATE_MODIFY;\
5237 onig_free_body(to);\
5238 xmemcpy(to, from, sizeof(regex_t));\
5239 xfree(from);\
5240 } while (0)
5241
5242 extern void
onig_transfer(regex_t * to,regex_t * from)5243 onig_transfer(regex_t* to, regex_t* from)
5244 {
5245 THREAD_ATOMIC_START;
5246 REGEX_TRANSFER(to, from);
5247 THREAD_ATOMIC_END;
5248 }
5249
5250 #define REGEX_CHAIN_HEAD(reg) do {\
5251 while (IS_NOT_NULL((reg)->chain)) {\
5252 (reg) = (reg)->chain;\
5253 }\
5254 } while (0)
5255
5256 extern void
onig_chain_link_add(regex_t * to,regex_t * add)5257 onig_chain_link_add(regex_t* to, regex_t* add)
5258 {
5259 THREAD_ATOMIC_START;
5260 REGEX_CHAIN_HEAD(to);
5261 to->chain = add;
5262 THREAD_ATOMIC_END;
5263 }
5264
5265 extern void
onig_chain_reduce(regex_t * reg)5266 onig_chain_reduce(regex_t* reg)
5267 {
5268 regex_t *head, *prev;
5269
5270 prev = reg;
5271 head = prev->chain;
5272 if (IS_NOT_NULL(head)) {
5273 reg->state = ONIG_STATE_MODIFY;
5274 while (IS_NOT_NULL(head->chain)) {
5275 prev = head;
5276 head = head->chain;
5277 }
5278 prev->chain = (regex_t* )NULL;
5279 REGEX_TRANSFER(reg, head);
5280 }
5281 }
5282
5283 #ifdef ONIG_DEBUG
5284 static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
5285 #endif
5286 #ifdef ONIG_DEBUG_PARSE_TREE
5287 static void print_tree P_((FILE* f, Node* node));
5288 #endif
5289
5290 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)5291 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5292 OnigErrorInfo* einfo)
5293 {
5294 #define COMPILE_INIT_SIZE 20
5295
5296 int r, init_size;
5297 Node* root;
5298 ScanEnv scan_env;
5299 #ifdef USE_SUBEXP_CALL
5300 UnsetAddrList uslist;
5301 #endif
5302
5303 if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5304
5305 reg->state = ONIG_STATE_COMPILING;
5306
5307 #ifdef ONIG_DEBUG
5308 print_enc_string(stderr, reg->enc, pattern, pattern_end);
5309 #endif
5310
5311 if (reg->alloc == 0) {
5312 init_size = (pattern_end - pattern) * 2;
5313 if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5314 r = BBUF_INIT(reg, init_size);
5315 if (r != 0) goto end;
5316 }
5317 else
5318 reg->used = 0;
5319
5320 reg->num_mem = 0;
5321 reg->num_repeat = 0;
5322 reg->num_null_check = 0;
5323 reg->repeat_range_alloc = 0;
5324 reg->repeat_range = (OnigRepeatRange* )NULL;
5325 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5326 reg->num_comb_exp_check = 0;
5327 #endif
5328
5329 r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5330 if (r != 0) goto err;
5331
5332 #ifdef USE_NAMED_GROUP
5333 /* mixed use named group and no-named group */
5334 if (scan_env.num_named > 0 &&
5335 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5336 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5337 if (scan_env.num_named != scan_env.num_mem)
5338 r = disable_noname_group_capture(&root, reg, &scan_env);
5339 else
5340 r = numbered_ref_check(root);
5341
5342 if (r != 0) goto err;
5343 }
5344 #endif
5345
5346 #ifdef USE_SUBEXP_CALL
5347 if (scan_env.num_call > 0) {
5348 r = unset_addr_list_init(&uslist, scan_env.num_call);
5349 if (r != 0) goto err;
5350 scan_env.unset_addr_list = &uslist;
5351 r = setup_subexp_call(root, &scan_env);
5352 if (r != 0) goto err_unset;
5353 r = subexp_recursive_check_trav(root, &scan_env);
5354 if (r < 0) goto err_unset;
5355 r = subexp_inf_recursive_check_trav(root, &scan_env);
5356 if (r != 0) goto err_unset;
5357
5358 reg->num_call = scan_env.num_call;
5359 }
5360 else
5361 reg->num_call = 0;
5362 #endif
5363
5364 r = setup_tree(root, reg, 0, &scan_env);
5365 if (r != 0) goto err_unset;
5366
5367 #ifdef ONIG_DEBUG_PARSE_TREE
5368 print_tree(stderr, root);
5369 #endif
5370
5371 reg->capture_history = scan_env.capture_history;
5372 reg->bt_mem_start = scan_env.bt_mem_start;
5373 reg->bt_mem_start |= reg->capture_history;
5374 if (IS_FIND_CONDITION(reg->options))
5375 BIT_STATUS_ON_ALL(reg->bt_mem_end);
5376 else {
5377 reg->bt_mem_end = scan_env.bt_mem_end;
5378 reg->bt_mem_end |= reg->capture_history;
5379 }
5380
5381 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5382 if (scan_env.backrefed_mem == 0
5383 #ifdef USE_SUBEXP_CALL
5384 || scan_env.num_call == 0
5385 #endif
5386 ) {
5387 setup_comb_exp_check(root, 0, &scan_env);
5388 #ifdef USE_SUBEXP_CALL
5389 if (scan_env.has_recursion != 0) {
5390 scan_env.num_comb_exp_check = 0;
5391 }
5392 else
5393 #endif
5394 if (scan_env.comb_exp_max_regnum > 0) {
5395 int i;
5396 for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5397 if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5398 scan_env.num_comb_exp_check = 0;
5399 break;
5400 }
5401 }
5402 }
5403 }
5404
5405 reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5406 #endif
5407
5408 clear_optimize_info(reg);
5409 #ifndef ONIG_DONT_OPTIMIZE
5410 r = set_optimize_info_from_tree(root, reg, &scan_env);
5411 if (r != 0) goto err_unset;
5412 #endif
5413
5414 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5415 xfree(scan_env.mem_nodes_dynamic);
5416 scan_env.mem_nodes_dynamic = (Node** )NULL;
5417 }
5418
5419 r = compile_tree(root, reg);
5420 if (r == 0) {
5421 r = add_opcode(reg, OP_END);
5422 #ifdef USE_SUBEXP_CALL
5423 if (scan_env.num_call > 0) {
5424 r = unset_addr_list_fix(&uslist, reg);
5425 unset_addr_list_end(&uslist);
5426 if (r) goto err;
5427 }
5428 #endif
5429
5430 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5431 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5432 else {
5433 if (reg->bt_mem_start != 0)
5434 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5435 else
5436 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5437 }
5438 }
5439 #ifdef USE_SUBEXP_CALL
5440 else if (scan_env.num_call > 0) {
5441 unset_addr_list_end(&uslist);
5442 }
5443 #endif
5444 onig_node_free(root);
5445
5446 #ifdef ONIG_DEBUG_COMPILE
5447 #ifdef USE_NAMED_GROUP
5448 onig_print_names(stderr, reg);
5449 #endif
5450 print_compiled_byte_code_list(stderr, reg);
5451 #endif
5452
5453 end:
5454 reg->state = ONIG_STATE_NORMAL;
5455 return r;
5456
5457 err_unset:
5458 #ifdef USE_SUBEXP_CALL
5459 if (scan_env.num_call > 0) {
5460 unset_addr_list_end(&uslist);
5461 }
5462 #endif
5463 err:
5464 if (IS_NOT_NULL(scan_env.error)) {
5465 if (IS_NOT_NULL(einfo)) {
5466 einfo->enc = scan_env.enc;
5467 einfo->par = scan_env.error;
5468 einfo->par_end = scan_env.error_end;
5469 }
5470 }
5471
5472 onig_node_free(root);
5473 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5474 xfree(scan_env.mem_nodes_dynamic);
5475 return r;
5476 }
5477
5478 #ifdef USE_RECOMPILE_API
5479 extern int
onig_recompile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5480 onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5481 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5482 OnigErrorInfo* einfo)
5483 {
5484 int r;
5485 regex_t *new_reg;
5486
5487 r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo);
5488 if (r) return r;
5489 if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5490 onig_transfer(reg, new_reg);
5491 }
5492 else {
5493 onig_chain_link_add(reg, new_reg);
5494 }
5495 return 0;
5496 }
5497 #endif
5498
5499 static int onig_inited = 0;
5500
5501 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)5502 onig_reg_init(regex_t* reg, OnigOptionType option,
5503 OnigCaseFoldType case_fold_flag,
5504 OnigEncoding enc, OnigSyntaxType* syntax)
5505 {
5506 if (! onig_inited)
5507 onig_init();
5508
5509 if (IS_NULL(reg))
5510 return ONIGERR_INVALID_ARGUMENT;
5511
5512 if (ONIGENC_IS_UNDEF(enc))
5513 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
5514
5515 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5516 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5517 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5518 }
5519
5520 (reg)->state = ONIG_STATE_MODIFY;
5521
5522 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5523 option |= syntax->options;
5524 option &= ~ONIG_OPTION_SINGLELINE;
5525 }
5526 else
5527 option |= syntax->options;
5528
5529 (reg)->enc = enc;
5530 (reg)->options = option;
5531 (reg)->syntax = syntax;
5532 (reg)->optimize = 0;
5533 (reg)->exact = (UChar* )NULL;
5534 (reg)->int_map = (int* )NULL;
5535 (reg)->int_map_backward = (int* )NULL;
5536 (reg)->chain = (regex_t* )NULL;
5537
5538 (reg)->p = (UChar* )NULL;
5539 (reg)->alloc = 0;
5540 (reg)->used = 0;
5541 (reg)->name_table = (void* )NULL;
5542
5543 (reg)->case_fold_flag = case_fold_flag;
5544 return 0;
5545 }
5546
5547 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5548 onig_new_without_alloc(regex_t* reg, const UChar* pattern,
5549 const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
5550 OnigSyntaxType* syntax, OnigErrorInfo* einfo)
5551 {
5552 int r;
5553
5554 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5555 if (r) return r;
5556
5557 r = onig_compile(reg, pattern, pattern_end, einfo);
5558 return r;
5559 }
5560
5561 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5562 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5563 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5564 OnigErrorInfo* einfo)
5565 {
5566 int r;
5567
5568 *reg = (regex_t* )xmalloc(sizeof(regex_t));
5569 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5570
5571 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5572 if (r) goto err;
5573
5574 r = onig_compile(*reg, pattern, pattern_end, einfo);
5575 if (r) {
5576 err:
5577 onig_free(*reg);
5578 *reg = NULL;
5579 }
5580 return r;
5581 }
5582
5583
5584 extern int
onig_init(void)5585 onig_init(void)
5586 {
5587 if (onig_inited != 0)
5588 return 0;
5589
5590 THREAD_SYSTEM_INIT;
5591 THREAD_ATOMIC_START;
5592
5593 onig_inited = 1;
5594
5595 onigenc_init();
5596 /* onigenc_set_default_caseconv_table((UChar* )0); */
5597
5598 #ifdef ONIG_DEBUG_STATISTICS
5599 onig_statistics_init();
5600 #endif
5601
5602 THREAD_ATOMIC_END;
5603 return 0;
5604 }
5605
5606
5607 extern int
onig_end(void)5608 onig_end(void)
5609 {
5610 THREAD_ATOMIC_START;
5611
5612 #ifdef ONIG_DEBUG_STATISTICS
5613 onig_print_statistics(stderr);
5614 #endif
5615
5616 #ifdef USE_SHARED_CCLASS_TABLE
5617 onig_free_shared_cclass_table();
5618 #endif
5619
5620 #ifdef USE_PARSE_TREE_NODE_RECYCLE
5621 onig_free_node_list();
5622 #endif
5623
5624 onig_inited = 0;
5625
5626 THREAD_ATOMIC_END;
5627 THREAD_SYSTEM_END;
5628 return 0;
5629 }
5630
5631 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)5632 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
5633 {
5634 OnigCodePoint n, *data;
5635 OnigCodePoint low, high, x;
5636
5637 GET_CODE_POINT(n, p);
5638 data = (OnigCodePoint* )p;
5639 data++;
5640
5641 for (low = 0, high = n; low < high; ) {
5642 x = (low + high) >> 1;
5643 if (code > data[x * 2 + 1])
5644 low = x + 1;
5645 else
5646 high = x;
5647 }
5648
5649 return ((low < n && code >= data[low * 2]) ? 1 : 0);
5650 }
5651
5652 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,CClassNode * cc)5653 onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
5654 {
5655 int found;
5656
5657 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
5658 if (IS_NULL(cc->mbuf)) {
5659 found = 0;
5660 }
5661 else {
5662 found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
5663 }
5664 }
5665 else {
5666 found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
5667 }
5668
5669 if (IS_NCCLASS_NOT(cc))
5670 return !found;
5671 else
5672 return found;
5673 }
5674
5675 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)5676 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
5677 {
5678 int len;
5679
5680 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5681 len = 2;
5682 }
5683 else {
5684 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
5685 }
5686 return onig_is_code_in_cc_len(len, code, cc);
5687 }
5688
5689
5690 #ifdef ONIG_DEBUG
5691
5692 /* arguments type */
5693 #define ARG_SPECIAL -1
5694 #define ARG_NON 0
5695 #define ARG_RELADDR 1
5696 #define ARG_ABSADDR 2
5697 #define ARG_LENGTH 3
5698 #define ARG_MEMNUM 4
5699 #define ARG_OPTION 5
5700 #define ARG_STATE_CHECK 6
5701
5702 OnigOpInfoType OnigOpInfo[] = {
5703 { OP_FINISH, "finish", ARG_NON },
5704 { OP_END, "end", ARG_NON },
5705 { OP_EXACT1, "exact1", ARG_SPECIAL },
5706 { OP_EXACT2, "exact2", ARG_SPECIAL },
5707 { OP_EXACT3, "exact3", ARG_SPECIAL },
5708 { OP_EXACT4, "exact4", ARG_SPECIAL },
5709 { OP_EXACT5, "exact5", ARG_SPECIAL },
5710 { OP_EXACTN, "exactn", ARG_SPECIAL },
5711 { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
5712 { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
5713 { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
5714 { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
5715 { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
5716 { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
5717 { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
5718 { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
5719 { OP_CCLASS, "cclass", ARG_SPECIAL },
5720 { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
5721 { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
5722 { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
5723 { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
5724 { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
5725 { OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL },
5726 { OP_ANYCHAR, "anychar", ARG_NON },
5727 { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
5728 { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
5729 { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
5730 { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
5731 { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
5732 { OP_WORD, "word", ARG_NON },
5733 { OP_NOT_WORD, "not-word", ARG_NON },
5734 { OP_WORD_BOUND, "word-bound", ARG_NON },
5735 { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
5736 { OP_WORD_BEGIN, "word-begin", ARG_NON },
5737 { OP_WORD_END, "word-end", ARG_NON },
5738 { OP_BEGIN_BUF, "begin-buf", ARG_NON },
5739 { OP_END_BUF, "end-buf", ARG_NON },
5740 { OP_BEGIN_LINE, "begin-line", ARG_NON },
5741 { OP_END_LINE, "end-line", ARG_NON },
5742 { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
5743 { OP_BEGIN_POSITION, "begin-position", ARG_NON },
5744 { OP_BACKREF1, "backref1", ARG_NON },
5745 { OP_BACKREF2, "backref2", ARG_NON },
5746 { OP_BACKREFN, "backrefn", ARG_MEMNUM },
5747 { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
5748 { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
5749 { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
5750 { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
5751 { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
5752 { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
5753 { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
5754 { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
5755 { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
5756 { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
5757 { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
5758 { OP_SET_OPTION, "set-option", ARG_OPTION },
5759 { OP_FAIL, "fail", ARG_NON },
5760 { OP_JUMP, "jump", ARG_RELADDR },
5761 { OP_PUSH, "push", ARG_RELADDR },
5762 { OP_POP, "pop", ARG_NON },
5763 { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
5764 { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
5765 { OP_REPEAT, "repeat", ARG_SPECIAL },
5766 { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
5767 { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
5768 { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
5769 { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
5770 { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
5771 { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
5772 { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
5773 { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
5774 { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
5775 { OP_PUSH_POS, "push-pos", ARG_NON },
5776 { OP_POP_POS, "pop-pos", ARG_NON },
5777 { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
5778 { OP_FAIL_POS, "fail-pos", ARG_NON },
5779 { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
5780 { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
5781 { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
5782 { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
5783 { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
5784 { OP_CALL, "call", ARG_ABSADDR },
5785 { OP_RETURN, "return", ARG_NON },
5786 { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
5787 { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
5788 { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
5789 { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
5790 { OP_STATE_CHECK_ANYCHAR_ML_STAR,
5791 "state-check-anychar-ml*", ARG_STATE_CHECK },
5792 { -1, "", ARG_NON }
5793 };
5794
5795 static char*
op2name(int opcode)5796 op2name(int opcode)
5797 {
5798 int i;
5799
5800 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5801 if (opcode == OnigOpInfo[i].opcode)
5802 return OnigOpInfo[i].name;
5803 }
5804 return "";
5805 }
5806
5807 static int
op2arg_type(int opcode)5808 op2arg_type(int opcode)
5809 {
5810 int i;
5811
5812 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5813 if (opcode == OnigOpInfo[i].opcode)
5814 return OnigOpInfo[i].arg_type;
5815 }
5816 return ARG_SPECIAL;
5817 }
5818
5819 static void
Indent(FILE * f,int indent)5820 Indent(FILE* f, int indent)
5821 {
5822 int i;
5823 for (i = 0; i < indent; i++) putc(' ', f);
5824 }
5825
5826 static void
p_string(FILE * f,int len,UChar * s)5827 p_string(FILE* f, int len, UChar* s)
5828 {
5829 fputs(":", f);
5830 while (len-- > 0) { fputc(*s++, f); }
5831 }
5832
5833 static void
p_len_string(FILE * f,LengthType len,int mb_len,UChar * s)5834 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
5835 {
5836 int x = len * mb_len;
5837
5838 fprintf(f, ":%d:", len);
5839 while (x-- > 0) { fputc(*s++, f); }
5840 }
5841
5842 extern void
onig_print_compiled_byte_code(FILE * f,UChar * bp,UChar ** nextp,OnigEncoding enc)5843 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
5844 OnigEncoding enc)
5845 {
5846 int i, n, arg_type;
5847 RelAddrType addr;
5848 LengthType len;
5849 MemNumType mem;
5850 StateCheckNumType scn;
5851 OnigCodePoint code;
5852 UChar *q;
5853
5854 fprintf(f, "[%s", op2name(*bp));
5855 arg_type = op2arg_type(*bp);
5856 if (arg_type != ARG_SPECIAL) {
5857 bp++;
5858 switch (arg_type) {
5859 case ARG_NON:
5860 break;
5861 case ARG_RELADDR:
5862 GET_RELADDR_INC(addr, bp);
5863 fprintf(f, ":(%d)", addr);
5864 break;
5865 case ARG_ABSADDR:
5866 GET_ABSADDR_INC(addr, bp);
5867 fprintf(f, ":(%d)", addr);
5868 break;
5869 case ARG_LENGTH:
5870 GET_LENGTH_INC(len, bp);
5871 fprintf(f, ":%d", len);
5872 break;
5873 case ARG_MEMNUM:
5874 mem = *((MemNumType* )bp);
5875 bp += SIZE_MEMNUM;
5876 fprintf(f, ":%d", mem);
5877 break;
5878 case ARG_OPTION:
5879 {
5880 OnigOptionType option = *((OnigOptionType* )bp);
5881 bp += SIZE_OPTION;
5882 fprintf(f, ":%d", option);
5883 }
5884 break;
5885
5886 case ARG_STATE_CHECK:
5887 scn = *((StateCheckNumType* )bp);
5888 bp += SIZE_STATE_CHECK_NUM;
5889 fprintf(f, ":%d", scn);
5890 break;
5891 }
5892 }
5893 else {
5894 switch (*bp++) {
5895 case OP_EXACT1:
5896 case OP_ANYCHAR_STAR_PEEK_NEXT:
5897 case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
5898 p_string(f, 1, bp++); break;
5899 case OP_EXACT2:
5900 p_string(f, 2, bp); bp += 2; break;
5901 case OP_EXACT3:
5902 p_string(f, 3, bp); bp += 3; break;
5903 case OP_EXACT4:
5904 p_string(f, 4, bp); bp += 4; break;
5905 case OP_EXACT5:
5906 p_string(f, 5, bp); bp += 5; break;
5907 case OP_EXACTN:
5908 GET_LENGTH_INC(len, bp);
5909 p_len_string(f, len, 1, bp);
5910 bp += len;
5911 break;
5912
5913 case OP_EXACTMB2N1:
5914 p_string(f, 2, bp); bp += 2; break;
5915 case OP_EXACTMB2N2:
5916 p_string(f, 4, bp); bp += 4; break;
5917 case OP_EXACTMB2N3:
5918 p_string(f, 6, bp); bp += 6; break;
5919 case OP_EXACTMB2N:
5920 GET_LENGTH_INC(len, bp);
5921 p_len_string(f, len, 2, bp);
5922 bp += len * 2;
5923 break;
5924 case OP_EXACTMB3N:
5925 GET_LENGTH_INC(len, bp);
5926 p_len_string(f, len, 3, bp);
5927 bp += len * 3;
5928 break;
5929 case OP_EXACTMBN:
5930 {
5931 int mb_len;
5932
5933 GET_LENGTH_INC(mb_len, bp);
5934 GET_LENGTH_INC(len, bp);
5935 fprintf(f, ":%d:%d:", mb_len, len);
5936 n = len * mb_len;
5937 while (n-- > 0) { fputc(*bp++, f); }
5938 }
5939 break;
5940
5941 case OP_EXACT1_IC:
5942 len = enclen(enc, bp);
5943 p_string(f, len, bp);
5944 bp += len;
5945 break;
5946 case OP_EXACTN_IC:
5947 GET_LENGTH_INC(len, bp);
5948 p_len_string(f, len, 1, bp);
5949 bp += len;
5950 break;
5951
5952 case OP_CCLASS:
5953 n = bitset_on_num((BitSetRef )bp);
5954 bp += SIZE_BITSET;
5955 fprintf(f, ":%d", n);
5956 break;
5957
5958 case OP_CCLASS_NOT:
5959 n = bitset_on_num((BitSetRef )bp);
5960 bp += SIZE_BITSET;
5961 fprintf(f, ":%d", n);
5962 break;
5963
5964 case OP_CCLASS_MB:
5965 case OP_CCLASS_MB_NOT:
5966 GET_LENGTH_INC(len, bp);
5967 q = bp;
5968 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5969 ALIGNMENT_RIGHT(q);
5970 #endif
5971 GET_CODE_POINT(code, q);
5972 bp += len;
5973 fprintf(f, ":%d:%d", (int )code, len);
5974 break;
5975
5976 case OP_CCLASS_MIX:
5977 case OP_CCLASS_MIX_NOT:
5978 n = bitset_on_num((BitSetRef )bp);
5979 bp += SIZE_BITSET;
5980 GET_LENGTH_INC(len, bp);
5981 q = bp;
5982 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5983 ALIGNMENT_RIGHT(q);
5984 #endif
5985 GET_CODE_POINT(code, q);
5986 bp += len;
5987 fprintf(f, ":%d:%d:%d", n, (int )code, len);
5988 break;
5989
5990 case OP_CCLASS_NODE:
5991 {
5992 CClassNode *cc;
5993
5994 GET_POINTER_INC(cc, bp);
5995 n = bitset_on_num(cc->bs);
5996 fprintf(f, ":%u:%d", (unsigned int )cc, n);
5997 }
5998 break;
5999
6000 case OP_BACKREFN_IC:
6001 mem = *((MemNumType* )bp);
6002 bp += SIZE_MEMNUM;
6003 fprintf(f, ":%d", mem);
6004 break;
6005
6006 case OP_BACKREF_MULTI_IC:
6007 case OP_BACKREF_MULTI:
6008 fputs(" ", f);
6009 GET_LENGTH_INC(len, bp);
6010 for (i = 0; i < len; i++) {
6011 GET_MEMNUM_INC(mem, bp);
6012 if (i > 0) fputs(", ", f);
6013 fprintf(f, "%d", mem);
6014 }
6015 break;
6016
6017 case OP_BACKREF_WITH_LEVEL:
6018 {
6019 OnigOptionType option;
6020 LengthType level;
6021
6022 GET_OPTION_INC(option, bp);
6023 fprintf(f, ":%d", option);
6024 GET_LENGTH_INC(level, bp);
6025 fprintf(f, ":%d", level);
6026
6027 fputs(" ", f);
6028 GET_LENGTH_INC(len, bp);
6029 for (i = 0; i < len; i++) {
6030 GET_MEMNUM_INC(mem, bp);
6031 if (i > 0) fputs(", ", f);
6032 fprintf(f, "%d", mem);
6033 }
6034 }
6035 break;
6036
6037 case OP_REPEAT:
6038 case OP_REPEAT_NG:
6039 {
6040 mem = *((MemNumType* )bp);
6041 bp += SIZE_MEMNUM;
6042 addr = *((RelAddrType* )bp);
6043 bp += SIZE_RELADDR;
6044 fprintf(f, ":%d:%d", mem, addr);
6045 }
6046 break;
6047
6048 case OP_PUSH_OR_JUMP_EXACT1:
6049 case OP_PUSH_IF_PEEK_NEXT:
6050 addr = *((RelAddrType* )bp);
6051 bp += SIZE_RELADDR;
6052 fprintf(f, ":(%d)", addr);
6053 p_string(f, 1, bp);
6054 bp += 1;
6055 break;
6056
6057 case OP_LOOK_BEHIND:
6058 GET_LENGTH_INC(len, bp);
6059 fprintf(f, ":%d", len);
6060 break;
6061
6062 case OP_PUSH_LOOK_BEHIND_NOT:
6063 GET_RELADDR_INC(addr, bp);
6064 GET_LENGTH_INC(len, bp);
6065 fprintf(f, ":%d:(%d)", len, addr);
6066 break;
6067
6068 case OP_STATE_CHECK_PUSH:
6069 case OP_STATE_CHECK_PUSH_OR_JUMP:
6070 scn = *((StateCheckNumType* )bp);
6071 bp += SIZE_STATE_CHECK_NUM;
6072 addr = *((RelAddrType* )bp);
6073 bp += SIZE_RELADDR;
6074 fprintf(f, ":%d:(%d)", scn, addr);
6075 break;
6076
6077 default:
6078 fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6079 *--bp);
6080 }
6081 }
6082 fputs("]", f);
6083 if (nextp) *nextp = bp;
6084 }
6085
6086 static void
print_compiled_byte_code_list(FILE * f,regex_t * reg)6087 print_compiled_byte_code_list(FILE* f, regex_t* reg)
6088 {
6089 int ncode;
6090 UChar* bp = reg->p;
6091 UChar* end = reg->p + reg->used;
6092
6093 fprintf(f, "code length: %d\n", reg->used);
6094
6095 ncode = 0;
6096 while (bp < end) {
6097 ncode++;
6098 if (bp > reg->p) {
6099 if (ncode % 5 == 0)
6100 fprintf(f, "\n");
6101 else
6102 fputs(" ", f);
6103 }
6104 onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
6105 }
6106
6107 fprintf(f, "\n");
6108 }
6109
6110 static void
print_indent_tree(FILE * f,Node * node,int indent)6111 print_indent_tree(FILE* f, Node* node, int indent)
6112 {
6113 int i, type;
6114 int add = 3;
6115 UChar* p;
6116
6117 Indent(f, indent);
6118 if (IS_NULL(node)) {
6119 fprintf(f, "ERROR: null node!!!\n");
6120 exit (0);
6121 }
6122
6123 type = NTYPE(node);
6124 switch (type) {
6125 case NT_LIST:
6126 case NT_ALT:
6127 if (NTYPE(node) == NT_LIST)
6128 fprintf(f, "<list:%x>\n", (int )node);
6129 else
6130 fprintf(f, "<alt:%x>\n", (int )node);
6131
6132 print_indent_tree(f, NCAR(node), indent + add);
6133 while (IS_NOT_NULL(node = NCDR(node))) {
6134 if (NTYPE(node) != type) {
6135 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6136 exit(0);
6137 }
6138 print_indent_tree(f, NCAR(node), indent + add);
6139 }
6140 break;
6141
6142 case NT_STR:
6143 fprintf(f, "<string%s:%x>",
6144 (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node);
6145 for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6146 if (*p >= 0x20 && *p < 0x7f)
6147 fputc(*p, f);
6148 else {
6149 fprintf(f, " 0x%02x", *p);
6150 }
6151 }
6152 break;
6153
6154 case NT_CCLASS:
6155 fprintf(f, "<cclass:%x>", (int )node);
6156 if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f);
6157 if (NCCLASS(node)->mbuf) {
6158 BBuf* bbuf = NCCLASS(node)->mbuf;
6159 for (i = 0; i < bbuf->used; i++) {
6160 if (i > 0) fprintf(f, ",");
6161 fprintf(f, "%0x", bbuf->p[i]);
6162 }
6163 }
6164 break;
6165
6166 case NT_CTYPE:
6167 fprintf(f, "<ctype:%x> ", (int )node);
6168 switch (NCTYPE(node)->ctype) {
6169 case ONIGENC_CTYPE_WORD:
6170 if (NCTYPE(node)->not != 0)
6171 fputs("not word", f);
6172 else
6173 fputs("word", f);
6174 break;
6175
6176 default:
6177 fprintf(f, "ERROR: undefined ctype.\n");
6178 exit(0);
6179 }
6180 break;
6181
6182 case NT_CANY:
6183 fprintf(f, "<anychar:%x>", (int )node);
6184 break;
6185
6186 case NT_ANCHOR:
6187 fprintf(f, "<anchor:%x> ", (int )node);
6188 switch (NANCHOR(node)->type) {
6189 case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
6190 case ANCHOR_END_BUF: fputs("end buf", f); break;
6191 case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
6192 case ANCHOR_END_LINE: fputs("end line", f); break;
6193 case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
6194 case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6195
6196 case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
6197 case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
6198 #ifdef USE_WORD_BEGIN_END
6199 case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
6200 case ANCHOR_WORD_END: fputs("word end", f); break;
6201 #endif
6202 case ANCHOR_PREC_READ: fputs("prec read", f); break;
6203 case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break;
6204 case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break;
6205 case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break;
6206
6207 default:
6208 fprintf(f, "ERROR: undefined anchor type.\n");
6209 break;
6210 }
6211 break;
6212
6213 case NT_BREF:
6214 {
6215 int* p;
6216 BRefNode* br = NBREF(node);
6217 p = BACKREFS_P(br);
6218 fprintf(f, "<backref:%x>", (int )node);
6219 for (i = 0; i < br->back_num; i++) {
6220 if (i > 0) fputs(", ", f);
6221 fprintf(f, "%d", p[i]);
6222 }
6223 }
6224 break;
6225
6226 #ifdef USE_SUBEXP_CALL
6227 case NT_CALL:
6228 {
6229 CallNode* cn = NCALL(node);
6230 fprintf(f, "<call:%x>", (int )node);
6231 p_string(f, cn->name_end - cn->name, cn->name);
6232 }
6233 break;
6234 #endif
6235
6236 case NT_QTFR:
6237 fprintf(f, "<quantifier:%x>{%d,%d}%s\n", (int )node,
6238 NQTFR(node)->lower, NQTFR(node)->upper,
6239 (NQTFR(node)->greedy ? "" : "?"));
6240 print_indent_tree(f, NQTFR(node)->target, indent + add);
6241 break;
6242
6243 case NT_ENCLOSE:
6244 fprintf(f, "<enclose:%x> ", (int )node);
6245 switch (NENCLOSE(node)->type) {
6246 case ENCLOSE_OPTION:
6247 fprintf(f, "option:%d\n", NENCLOSE(node)->option);
6248 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6249 break;
6250 case ENCLOSE_MEMORY:
6251 fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6252 break;
6253 case ENCLOSE_STOP_BACKTRACK:
6254 fprintf(f, "stop-bt");
6255 break;
6256
6257 default:
6258 break;
6259 }
6260 fprintf(f, "\n");
6261 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6262 break;
6263
6264 default:
6265 fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6266 break;
6267 }
6268
6269 if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6270 type != NT_ENCLOSE)
6271 fprintf(f, "\n");
6272 fflush(f);
6273 }
6274 #endif /* ONIG_DEBUG */
6275
6276 #ifdef ONIG_DEBUG_PARSE_TREE
6277 static void
print_tree(FILE * f,Node * node)6278 print_tree(FILE* f, Node* node)
6279 {
6280 print_indent_tree(f, node, 0);
6281 }
6282 #endif
6283