1 /**********************************************************************
2   regparse.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2020  K.Kosako
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #ifdef DEBUG_NODE_FREE
31 #ifndef NEED_TO_INCLUDE_STDIO
32 #define NEED_TO_INCLUDE_STDIO
33 #endif
34 #endif
35 
36 #include "regparse.h"
37 #include "st.h"
38 
/* Sizing constants; the code that uses them is outside this section. */
#define INIT_TAG_NAMES_ALLOC_NUM   5

#define WARN_BUFSIZE    256

/* Feature switch: defined without a value. */
#define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS

/*
 * Character predicates for callout names and callout tag names.
 * Each yields non-zero when c is an ASCII letter, an ASCII digit or '_'.
 *
 * The argument is parenthesized so that expression arguments (for example
 * a conditional expression) expand as intended.  It is still evaluated
 * several times, so it must not have side effects.
 */
#define IS_ALLOWED_CODE_IN_CALLOUT_NAME(c) \
  (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
   ((c) >= '0' && (c) <= '9') || (c) == '_' /* || (c) == '!' */)
#define IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c) \
  (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
   ((c) >= '0' && (c) <= '9') || (c) == '_')
49 
/*
 * Option tests.  Each macro masks `option` with one flag (or a pair of
 * flags) and evaluates to the masked bits: non-zero when any of the
 * tested flags is set, zero otherwise.
 */
#define OPTON_SINGLELINE(option)        ((option) & ONIG_OPTION_SINGLELINE)
#define OPTON_MULTILINE(option)         ((option) & ONIG_OPTION_MULTILINE)
#define OPTON_IGNORECASE(option)        ((option) & ONIG_OPTION_IGNORECASE)
#define OPTON_EXTEND(option)            ((option) & ONIG_OPTION_EXTEND)

/* The *_IS_ASCII tests are also satisfied by ONIG_OPTION_POSIX_IS_ASCII. */
#define OPTON_WORD_ASCII(option) \
  ((option) & (ONIG_OPTION_WORD_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII))
#define OPTON_DIGIT_ASCII(option) \
  ((option) & (ONIG_OPTION_DIGIT_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII))
#define OPTON_SPACE_ASCII(option) \
  ((option) & (ONIG_OPTION_SPACE_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII))
#define OPTON_POSIX_ASCII(option)       ((option) & ONIG_OPTION_POSIX_IS_ASCII)
#define OPTON_TEXT_SEGMENT_WORD(option) ((option) & ONIG_OPTION_TEXT_SEGMENT_WORD)

/*
 * Non-zero when `ctype` is non-negative and one of these holds:
 *   - ctype is below ONIGENC_CTYPE_ASCII and the POSIX-ASCII option is set;
 *   - ctype is WORD, DIGIT or SPACE and the matching *_ASCII test succeeds.
 */
#define OPTON_IS_ASCII_MODE_CTYPE(ctype, options) \
  ((ctype) >= 0 && \
  (((ctype) < ONIGENC_CTYPE_ASCII  && OPTON_POSIX_ASCII(options)) ||\
   ((ctype) == ONIGENC_CTYPE_WORD  && OPTON_WORD_ASCII(options))  ||\
   ((ctype) == ONIGENC_CTYPE_DIGIT && OPTON_DIGIT_ASCII(options)) ||\
   ((ctype) == ONIGENC_CTYPE_SPACE && OPTON_SPACE_ASCII(options))))
69 
70 
71 OnigSyntaxType OnigSyntaxOniguruma = {
72   (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
73      ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
74      ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL |
75      ONIG_SYN_OP_ESC_CONTROL_CHARS |
76      ONIG_SYN_OP_ESC_C_CONTROL )
77    & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
78   , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
79       ONIG_SYN_OP2_OPTION_ONIGURUMA |
80       ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
81       ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE |
82       ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP |
83       ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS |
84       ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME    |
85       ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT |
86       ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE |
87       ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT |
88       ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
89       ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
90       ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
91       ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
92       ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
93       ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
94       ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
95       ONIG_SYN_OP2_ESC_H_XDIGIT | ONIG_SYN_OP2_ESC_U_HEX4 )
96   , ( SYN_GNU_REGEX_BV |
97       ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
98       ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
99       ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND |
100       ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
101       ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
102       ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
103       ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC |
104       ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
105       ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
106   , ONIG_OPTION_NONE
107   ,
108   {
109       (OnigCodePoint )'\\'                       /* esc */
110     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
111     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
112     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
113     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
114     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
115   }
116 };
117 
118 OnigSyntaxType OnigSyntaxRuby = {
119   (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
120      ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
121      ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL |
122      ONIG_SYN_OP_ESC_CONTROL_CHARS |
123      ONIG_SYN_OP_ESC_C_CONTROL )
124    & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
125   , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
126       ONIG_SYN_OP2_OPTION_RUBY |
127       ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
128       ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE |
129       ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP |
130       ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT |
131       ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE |
132       ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
133       ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
134       ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
135       ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
136       ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
137       ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
138       ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
139       ONIG_SYN_OP2_ESC_H_XDIGIT | ONIG_SYN_OP2_ESC_U_HEX4 )
140   , ( SYN_GNU_REGEX_BV |
141       ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
142       ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
143       ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
144       ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
145       ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
146       ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
147       ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
148   , ONIG_OPTION_NONE
149   ,
150   {
151       (OnigCodePoint )'\\'                       /* esc */
152     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
153     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
154     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
155     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
156     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
157   }
158 };
159 
160 OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_ONIGURUMA;
161 
onig_null_warn(const char * s ARG_UNUSED)162 extern void onig_null_warn(const char* s ARG_UNUSED) { }
163 
164 #ifdef DEFAULT_WARN_FUNCTION
165 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
166 #else
167 static OnigWarnFunc onig_warn = onig_null_warn;
168 #endif
169 
170 #ifdef DEFAULT_VERB_WARN_FUNCTION
171 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
172 #else
173 static OnigWarnFunc onig_verb_warn = onig_null_warn;
174 #endif
175 
/*
 * Install `f` as the warning hook.
 *
 * A null `f` is stored as onig_null_warn, so that code which calls the
 * hook after comparing it against onig_null_warn never jumps through a
 * null function pointer.
 */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  if (f == (OnigWarnFunc )0)
    onig_warn = onig_null_warn;
  else
    onig_warn = f;
}
180 
/*
 * Install `f` as the second ("verb") warning hook.
 *
 * A null `f` is stored as onig_null_warn, so the hook variable always
 * holds a callable function.
 */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  if (f == (OnigWarnFunc )0)
    onig_verb_warn = onig_null_warn;
  else
    onig_verb_warn = f;
}
185 
186 extern void
onig_warning(const char * s)187 onig_warning(const char* s)
188 {
189   if (onig_warn == onig_null_warn) return ;
190 
191   (*onig_warn)(s);
192 }
193 
#define DEFAULT_MAX_CAPTURE_NUM   32767

/* Capture-count limit; changed through onig_set_capture_num_limit(). */
static int MaxCaptureNum = DEFAULT_MAX_CAPTURE_NUM;

/*
 * Set the capture-count limit to `num`.
 * Returns 0 on success, or -1 (leaving the limit untouched) when `num`
 * is negative.
 */
extern int
onig_set_capture_num_limit(int num)
{
  if (num >= 0) {
    MaxCaptureNum = num;
    return 0;
  }

  return -1;
}
206 
207 static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
208 
209 extern unsigned int
onig_get_parse_depth_limit(void)210 onig_get_parse_depth_limit(void)
211 {
212   return ParseDepthLimit;
213 }
214 
215 extern int
onig_set_parse_depth_limit(unsigned int depth)216 onig_set_parse_depth_limit(unsigned int depth)
217 {
218   if (depth == 0)
219     ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
220   else
221     ParseDepthLimit = depth;
222   return 0;
223 }
224 
/*
 * INC_PARSE_DEPTH(d): increment the depth counter `d` and make the
 * *enclosing function* return ONIGERR_PARSE_DEPTH_LIMIT_OVER when it
 * exceeds ParseDepthLimit.  With ONIG_DEBUG_PARSE defined, the deepest
 * value reached is also recorded in env->max_parse_depth, so a variable
 * named `env` must be in scope at the point of use.
 */
#ifndef ONIG_DEBUG_PARSE
#define INC_PARSE_DEPTH(d) do {\
  (d)++;\
  if ((d) > ParseDepthLimit) \
    return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\
} while (0)
#else
#define INC_PARSE_DEPTH(d) do {\
  (d)++;\
  if (env->max_parse_depth < (d)) env->max_parse_depth = d;\
  if ((d) > ParseDepthLimit) \
    return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\
} while (0)
#endif

/* DEC_PARSE_DEPTH(d): undo one INC_PARSE_DEPTH(d). */
#define DEC_PARSE_DEPTH(d)  (d)--
241 
242 
243 static int
bbuf_init(BBuf * buf,int size)244 bbuf_init(BBuf* buf, int size)
245 {
246   if (size <= 0) {
247     size   = 0;
248     buf->p = NULL;
249   }
250   else {
251     buf->p = (UChar* )xmalloc(size);
252     if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
253   }
254 
255   buf->alloc = size;
256   buf->used  = 0;
257   return 0;
258 }
259 
260 static void
bbuf_free(BBuf * bbuf)261 bbuf_free(BBuf* bbuf)
262 {
263   if (IS_NOT_NULL(bbuf)) {
264     if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
265     xfree(bbuf);
266   }
267 }
268 
269 static int
bbuf_clone(BBuf ** rto,BBuf * from)270 bbuf_clone(BBuf** rto, BBuf* from)
271 {
272   int r;
273   BBuf *to;
274 
275   *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
276   CHECK_NULL_RETURN_MEMERR(to);
277   r = BB_INIT(to, from->alloc);
278   if (r != 0) {
279     xfree(to->p);
280     *rto = 0;
281     return r;
282   }
283   to->used = from->used;
284   xmemcpy(to->p, from->p, from->used);
285   return 0;
286 }
287 
288 static int
backref_rel_to_abs(int rel_no,ScanEnv * env)289 backref_rel_to_abs(int rel_no, ScanEnv* env)
290 {
291   if (rel_no > 0) {
292     return env->num_mem + rel_no;
293   }
294   else {
295     return env->num_mem + 1 + rel_no;
296   }
297 }
298 
/* Set / clear the bits `f` in the lvalue `v`. */
#define OPTION_ON(v,f)     ((v) |= (f))
#define OPTION_OFF(v,f)    ((v) &= ~(f))

/*
 * Clear `f` in `v` when `negative` is non-zero, otherwise set it.
 * The whole expansion is parenthesized so the macro can be used safely
 * as an operand inside a larger expression.
 */
#define OPTION_NEGATE(v,f,negative) \
  ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
303 
/* First code point of the range: 0 when the encoding's minimum
   character length exceeds one byte, 0x80 otherwise. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

/* Add the range [MBCODE_START_POS(enc), ~0] to the code range buffer
   `pbuf`; evaluates to add_code_range_to_buf()'s result. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/*
 * For encodings that are not single-byte, apply SET_ALL_MULTI_BYTE_RANGE
 * to &(mbuf).  Needs an int variable `r` in scope and makes the enclosing
 * function return `r` when it is non-zero.
 */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r != 0) return r;\
  }\
} while (0)
316 
317 
/*
 * Store 1 in `empty` when every word of bit set `bs` is zero, else 0.
 * Declares a local `i`, so neither argument may refer to a variable of
 * that name.
 */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i = 0;\
  empty = 1;\
  while (i < (int )BITSET_REAL_SIZE) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
    i++;\
  }\
} while (0)
327 
328 static void
bitset_set_range(BitSetRef bs,int from,int to)329 bitset_set_range(BitSetRef bs, int from, int to)
330 {
331   int i;
332   for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
333     BITSET_SET_BIT(bs, i);
334   }
335 }
336 
337 static void
bitset_invert(BitSetRef bs)338 bitset_invert(BitSetRef bs)
339 {
340   int i;
341   for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { bs[i] = ~(bs[i]); }
342 }
343 
344 static void
bitset_invert_to(BitSetRef from,BitSetRef to)345 bitset_invert_to(BitSetRef from, BitSetRef to)
346 {
347   int i;
348   for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { to[i] = ~(from[i]); }
349 }
350 
351 static void
bitset_and(BitSetRef dest,BitSetRef bs)352 bitset_and(BitSetRef dest, BitSetRef bs)
353 {
354   int i;
355   for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] &= bs[i]; }
356 }
357 
358 static void
bitset_or(BitSetRef dest,BitSetRef bs)359 bitset_or(BitSetRef dest, BitSetRef bs)
360 {
361   int i;
362   for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] |= bs[i]; }
363 }
364 
365 static void
bitset_copy(BitSetRef dest,BitSetRef bs)366 bitset_copy(BitSetRef dest, BitSetRef bs)
367 {
368   int i;
369   for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] = bs[i]; }
370 }
371 
372 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)373 onig_strncmp(const UChar* s1, const UChar* s2, int n)
374 {
375   int x;
376 
377   while (n-- > 0) {
378     x = *s2++ - *s1++;
379     if (x) return x;
380   }
381   return 0;
382 }
383 
384 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)385 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
386 {
387   int len = (int )(end - src);
388   if (len > 0) {
389     xmemcpy(dest, src, len);
390     dest[len] = (UChar )0;
391   }
392 }
393 
/*
 * scan pattern methods
 *
 * These macros expect the using function to have `p` (current position),
 * `end` (end of pattern) and `enc` (encoding) in scope.  Those that
 * record the previous position also need PFETCH_READY in the function's
 * declarations.
 */
#define PEND_VALUE   0

/* declares the variable that remembers the position before a fetch */
#define PFETCH_READY  UChar* pfetch_prev
/* 1 at end of pattern, otherwise 0 */
#define PEND          (p < end ?  0 : 1)
/* step back to the position saved by the last PINC/PFETCH */
#define PUNFETCH      p = pfetch_prev

/* skip one character, remembering where it started */
#define PINC  do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* read one character into c and advance, remembering where it started */
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* "_S" variants: same as above but pfetch_prev is not touched */
#define PINC_S  do { \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
#define PFETCH_S(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* look at the next character without advancing; PEND_VALUE at the end */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )c)
420 
421 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)422 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
423             int capa)
424 {
425   UChar* r;
426 
427   if (dest)
428     r = (UChar* )xrealloc(dest, capa + 1);
429   else
430     r = (UChar* )xmalloc(capa + 1);
431 
432   CHECK_NULL_RETURN(r);
433   onig_strcpy(r + (dest_end - dest), src, src_end);
434   return r;
435 }
436 
437 /* dest on static area */
438 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)439 strcat_capa_from_static(UChar* dest, UChar* dest_end,
440                         const UChar* src, const UChar* src_end, int capa)
441 {
442   UChar* r;
443 
444   r = (UChar* )xmalloc(capa + 1);
445   CHECK_NULL_RETURN(r);
446   onig_strcpy(r, dest, dest_end);
447   onig_strcpy(r + (dest_end - dest), src, src_end);
448   return r;
449 }
450 
451 
452 #ifdef USE_ST_LIBRARY
453 
454 typedef struct {
455   UChar* s;
456   UChar* end;
457 } st_str_end_key;
458 
459 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)460 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
461 {
462   UChar *p, *q;
463   int c;
464 
465   if ((x->end - x->s) != (y->end - y->s))
466     return 1;
467 
468   p = x->s;
469   q = y->s;
470   while (p < x->end) {
471     c = (int )*p - (int )*q;
472     if (c != 0) return c;
473 
474     p++; q++;
475   }
476 
477   return 0;
478 }
479 
480 static int
str_end_hash(st_str_end_key * x)481 str_end_hash(st_str_end_key* x)
482 {
483   UChar *p;
484   unsigned val = 0;
485 
486   p = x->s;
487   while (p < x->end) {
488     val = val * 997 + (unsigned )*p++;
489   }
490 
491   return (int) (val + (val >> 5));
492 }
493 
494 extern hash_table_type
onig_st_init_strend_table_with_size(int size)495 onig_st_init_strend_table_with_size(int size)
496 {
497   static struct st_hash_type hashType = {
498     str_end_cmp,
499     str_end_hash,
500   };
501 
502   return (hash_table_type )onig_st_init_table_with_size(&hashType, size);
503 }
504 
505 extern int
onig_st_lookup_strend(hash_table_type table,const UChar * str_key,const UChar * end_key,hash_data_type * value)506 onig_st_lookup_strend(hash_table_type table, const UChar* str_key,
507                       const UChar* end_key, hash_data_type *value)
508 {
509   st_str_end_key key;
510 
511   key.s   = (UChar* )str_key;
512   key.end = (UChar* )end_key;
513 
514   return onig_st_lookup(table, (st_data_t )(&key), value);
515 }
516 
517 extern int
onig_st_insert_strend(hash_table_type table,const UChar * str_key,const UChar * end_key,hash_data_type value)518 onig_st_insert_strend(hash_table_type table, const UChar* str_key,
519                       const UChar* end_key, hash_data_type value)
520 {
521   st_str_end_key* key;
522   int result;
523 
524   key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
525   CHECK_NULL_RETURN_MEMERR(key);
526 
527   key->s   = (UChar* )str_key;
528   key->end = (UChar* )end_key;
529   result = onig_st_insert(table, (st_data_t )key, value);
530   if (result) {
531     xfree(key);
532   }
533   return result;
534 }
535 
536 
537 #ifdef USE_CALLOUT
538 
539 typedef struct {
540   OnigEncoding enc;
541   int    type; /* callout type: single or not */
542   UChar* s;
543   UChar* end;
544 } st_callout_name_key;
545 
546 static int
callout_name_table_cmp(st_callout_name_key * x,st_callout_name_key * y)547 callout_name_table_cmp(st_callout_name_key* x, st_callout_name_key* y)
548 {
549   UChar *p, *q;
550   int c;
551 
552   if (x->enc  != y->enc)  return 1;
553   if (x->type != y->type) return 1;
554   if ((x->end - x->s) != (y->end - y->s))
555     return 1;
556 
557   p = x->s;
558   q = y->s;
559   while (p < x->end) {
560     c = (int )*p - (int )*q;
561     if (c != 0) return c;
562 
563     p++; q++;
564   }
565 
566   return 0;
567 }
568 
569 static int
callout_name_table_hash(st_callout_name_key * x)570 callout_name_table_hash(st_callout_name_key* x)
571 {
572   UChar *p;
573   unsigned int val = 0;
574 
575   p = x->s;
576   while (p < x->end) {
577     val = val * 997 + (unsigned int )*p++;
578   }
579 
580   /* use intptr_t for escape warning in Windows */
581   return (int )(val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type);
582 }
583 
584 extern hash_table_type
onig_st_init_callout_name_table_with_size(int size)585 onig_st_init_callout_name_table_with_size(int size)
586 {
587   static struct st_hash_type hashType = {
588     callout_name_table_cmp,
589     callout_name_table_hash,
590   };
591 
592   return (hash_table_type )onig_st_init_table_with_size(&hashType, size);
593 }
594 
595 extern int
onig_st_lookup_callout_name_table(hash_table_type table,OnigEncoding enc,int type,const UChar * str_key,const UChar * end_key,hash_data_type * value)596 onig_st_lookup_callout_name_table(hash_table_type table,
597                                   OnigEncoding enc,
598                                   int type,
599                                   const UChar* str_key,
600                                   const UChar* end_key,
601                                   hash_data_type *value)
602 {
603   st_callout_name_key key;
604 
605   key.enc  = enc;
606   key.type = type;
607   key.s    = (UChar* )str_key;
608   key.end  = (UChar* )end_key;
609 
610   return onig_st_lookup(table, (st_data_t )(&key), value);
611 }
612 
613 static int
st_insert_callout_name_table(hash_table_type table,OnigEncoding enc,int type,UChar * str_key,UChar * end_key,hash_data_type value)614 st_insert_callout_name_table(hash_table_type table,
615                              OnigEncoding enc, int type,
616                              UChar* str_key, UChar* end_key,
617                              hash_data_type value)
618 {
619   st_callout_name_key* key;
620   int result;
621 
622   key = (st_callout_name_key* )xmalloc(sizeof(st_callout_name_key));
623   CHECK_NULL_RETURN_MEMERR(key);
624 
625   /* key->s: don't duplicate, because str_key is duped in callout_name_entry() */
626   key->enc  = enc;
627   key->type = type;
628   key->s    = str_key;
629   key->end  = end_key;
630   result = onig_st_insert(table, (st_data_t )key, value);
631   if (result) {
632     xfree(key);
633   }
634   return result;
635 }
636 #endif
637 
638 #endif /* USE_ST_LIBRARY */
639 
640 
641 #define INIT_NAME_BACKREFS_ALLOC_NUM   8
642 
643 typedef struct {
644   UChar* name;
645   int    name_len;   /* byte length */
646   int    back_num;   /* number of backrefs */
647   int    back_alloc;
648   int    back_ref1;
649   int*   back_refs;
650 } NameEntry;
651 
652 #ifdef USE_ST_LIBRARY
653 
654 #define INIT_NAMES_ALLOC_NUM    5
655 
656 typedef st_table  NameTable;
657 typedef st_data_t HashDataType;   /* 1.6 st.h doesn't define st_data_t type */
658 
659 #define NAMEBUF_SIZE    24
660 #define NAMEBUF_SIZE_1  25
661 
662 #ifdef ONIG_DEBUG
663 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)664 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
665 {
666   int i;
667   FILE* fp = (FILE* )arg;
668 
669   fprintf(fp, "%s: ", e->name);
670   if (e->back_num == 0)
671     fputs("-", fp);
672   else if (e->back_num == 1)
673     fprintf(fp, "%d", e->back_ref1);
674   else {
675     for (i = 0; i < e->back_num; i++) {
676       if (i > 0) fprintf(fp, ", ");
677       fprintf(fp, "%d", e->back_refs[i]);
678     }
679   }
680   fputs("\n", fp);
681   return ST_CONTINUE;
682 }
683 
684 extern int
onig_print_names(FILE * fp,regex_t * reg)685 onig_print_names(FILE* fp, regex_t* reg)
686 {
687   NameTable* t = (NameTable* )reg->name_table;
688 
689   if (IS_NOT_NULL(t)) {
690     fprintf(fp, "name table\n");
691     onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
692     fputs("\n", fp);
693   }
694   return 0;
695 }
696 #endif /* ONIG_DEBUG */
697 
698 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)699 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
700 {
701   xfree(e->name);
702   if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
703   xfree(key);
704   xfree(e);
705   return ST_DELETE;
706 }
707 
708 static int
names_clear(regex_t * reg)709 names_clear(regex_t* reg)
710 {
711   NameTable* t = (NameTable* )reg->name_table;
712 
713   if (IS_NOT_NULL(t)) {
714     onig_st_foreach(t, i_free_name_entry, 0);
715   }
716   return 0;
717 }
718 
719 extern int
onig_names_free(regex_t * reg)720 onig_names_free(regex_t* reg)
721 {
722   int r;
723   NameTable* t;
724 
725   r = names_clear(reg);
726   if (r != 0) return r;
727 
728   t = (NameTable* )reg->name_table;
729   if (IS_NOT_NULL(t)) onig_st_free_table(t);
730   reg->name_table = (void* )NULL;
731   return 0;
732 }
733 
734 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)735 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
736 {
737   NameEntry* e;
738   NameTable* t = (NameTable* )reg->name_table;
739 
740   e = (NameEntry* )NULL;
741   if (IS_NOT_NULL(t)) {
742     onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
743   }
744   return e;
745 }
746 
747 typedef struct {
748   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
749   regex_t* reg;
750   void* arg;
751   int ret;
752   OnigEncoding enc;
753 } INamesArg;
754 
755 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)756 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
757 {
758   int r = (*(arg->func))(e->name,
759                          e->name + e->name_len,
760                          e->back_num,
761                          (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
762                          arg->reg, arg->arg);
763   if (r != 0) {
764     arg->ret = r;
765     return ST_STOP;
766   }
767   return ST_CONTINUE;
768 }
769 
770 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)771 onig_foreach_name(regex_t* reg,
772   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
773 {
774   INamesArg narg;
775   NameTable* t = (NameTable* )reg->name_table;
776 
777   narg.ret = 0;
778   if (IS_NOT_NULL(t)) {
779     narg.func = func;
780     narg.reg  = reg;
781     narg.arg  = arg;
782     narg.enc  = reg->enc; /* should be pattern encoding. */
783     onig_st_foreach(t, i_names, (HashDataType )&narg);
784   }
785   return narg.ret;
786 }
787 
788 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumMap * map)789 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumMap* map)
790 {
791   int i;
792 
793   if (e->back_num > 1) {
794     for (i = 0; i < e->back_num; i++) {
795       e->back_refs[i] = map[e->back_refs[i]].new_val;
796     }
797   }
798   else if (e->back_num == 1) {
799     e->back_ref1 = map[e->back_ref1].new_val;
800   }
801 
802   return ST_CONTINUE;
803 }
804 
805 extern int
onig_renumber_name_table(regex_t * reg,GroupNumMap * map)806 onig_renumber_name_table(regex_t* reg, GroupNumMap* map)
807 {
808   NameTable* t = (NameTable* )reg->name_table;
809 
810   if (IS_NOT_NULL(t)) {
811     onig_st_foreach(t, i_renumber_name, (HashDataType )map);
812   }
813   return 0;
814 }
815 
816 
817 extern int
onig_number_of_names(regex_t * reg)818 onig_number_of_names(regex_t* reg)
819 {
820   NameTable* t = (NameTable* )reg->name_table;
821 
822   if (IS_NOT_NULL(t))
823     return t->num_entries;
824   else
825     return 0;
826 }
827 
828 #else  /* USE_ST_LIBRARY */
829 
830 #define INIT_NAMES_ALLOC_NUM    8
831 
832 typedef struct {
833   NameEntry* e;
834   int        num;
835   int        alloc;
836 } NameTable;
837 
838 #ifdef ONIG_DEBUG
839 extern int
onig_print_names(FILE * fp,regex_t * reg)840 onig_print_names(FILE* fp, regex_t* reg)
841 {
842   int i, j;
843   NameEntry* e;
844   NameTable* t = (NameTable* )reg->name_table;
845 
846   if (IS_NOT_NULL(t) && t->num > 0) {
847     fprintf(fp, "name table\n");
848     for (i = 0; i < t->num; i++) {
849       e = &(t->e[i]);
850       fprintf(fp, "%s: ", e->name);
851       if (e->back_num == 0) {
852         fputs("-", fp);
853       }
854       else if (e->back_num == 1) {
855         fprintf(fp, "%d", e->back_ref1);
856       }
857       else {
858         for (j = 0; j < e->back_num; j++) {
859           if (j > 0) fprintf(fp, ", ");
860           fprintf(fp, "%d", e->back_refs[j]);
861         }
862       }
863       fputs("\n", fp);
864     }
865     fputs("\n", fp);
866   }
867   return 0;
868 }
869 #endif
870 
871 static int
names_clear(regex_t * reg)872 names_clear(regex_t* reg)
873 {
874   int i;
875   NameEntry* e;
876   NameTable* t = (NameTable* )reg->name_table;
877 
878   if (IS_NOT_NULL(t)) {
879     for (i = 0; i < t->num; i++) {
880       e = &(t->e[i]);
881       if (IS_NOT_NULL(e->name)) {
882         xfree(e->name);
883         e->name       = NULL;
884         e->name_len   = 0;
885         e->back_num   = 0;
886         e->back_alloc = 0;
887         if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
888         e->back_refs = (int* )NULL;
889       }
890     }
891     if (IS_NOT_NULL(t->e)) {
892       xfree(t->e);
893       t->e = NULL;
894     }
895     t->num = 0;
896   }
897   return 0;
898 }
899 
900 extern int
onig_names_free(regex_t * reg)901 onig_names_free(regex_t* reg)
902 {
903   int r;
904   NameTable* t;
905 
906   r = names_clear(reg);
907   if (r != 0) return r;
908 
909   t = (NameTable* )reg->name_table;
910   if (IS_NOT_NULL(t)) xfree(t);
911   reg->name_table = NULL;
912   return 0;
913 }
914 
915 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)916 name_find(regex_t* reg, UChar* name, UChar* name_end)
917 {
918   int i, len;
919   NameEntry* e;
920   NameTable* t = (NameTable* )reg->name_table;
921 
922   if (IS_NOT_NULL(t)) {
923     len = name_end - name;
924     for (i = 0; i < t->num; i++) {
925       e = &(t->e[i]);
926       if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
927         return e;
928     }
929   }
930   return (NameEntry* )NULL;
931 }
932 
933 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)934 onig_foreach_name(regex_t* reg,
935   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
936 {
937   int i, r;
938   NameEntry* e;
939   NameTable* t = (NameTable* )reg->name_table;
940 
941   if (IS_NOT_NULL(t)) {
942     for (i = 0; i < t->num; i++) {
943       e = &(t->e[i]);
944       r = (*func)(e->name, e->name + e->name_len, e->back_num,
945                   (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
946                   reg, arg);
947       if (r != 0) return r;
948     }
949   }
950   return 0;
951 }
952 
953 extern int
onig_number_of_names(regex_t * reg)954 onig_number_of_names(regex_t* reg)
955 {
956   NameTable* t = (NameTable* )reg->name_table;
957 
958   if (IS_NOT_NULL(t))
959     return t->num;
960   else
961     return 0;
962 }
963 
964 #endif /* else USE_ST_LIBRARY */
965 
966 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)967 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
968 {
969   int r;
970   int alloc;
971   NameEntry* e;
972   NameTable* t = (NameTable* )reg->name_table;
973 
974   if (name_end - name <= 0)
975     return ONIGERR_EMPTY_GROUP_NAME;
976 
977   e = name_find(reg, name, name_end);
978   if (IS_NULL(e)) {
979 #ifdef USE_ST_LIBRARY
980     if (IS_NULL(t)) {
981       t = onig_st_init_strend_table_with_size(INIT_NAMES_ALLOC_NUM);
982       CHECK_NULL_RETURN_MEMERR(t);
983       reg->name_table = (void* )t;
984     }
985     e = (NameEntry* )xmalloc(sizeof(NameEntry));
986     CHECK_NULL_RETURN_MEMERR(e);
987 
988     e->name = onigenc_strdup(reg->enc, name, name_end);
989     if (IS_NULL(e->name)) {
990       xfree(e);  return ONIGERR_MEMORY;
991     }
992     r = onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
993                               (HashDataType )e);
994     if (r < 0) return r;
995 
996     e->name_len   = (int )(name_end - name);
997     e->back_num   = 0;
998     e->back_alloc = 0;
999     e->back_refs  = (int* )NULL;
1000 
1001 #else
1002 
1003     if (IS_NULL(t)) {
1004       alloc = INIT_NAMES_ALLOC_NUM;
1005       t = (NameTable* )xmalloc(sizeof(NameTable));
1006       CHECK_NULL_RETURN_MEMERR(t);
1007       t->e     = NULL;
1008       t->alloc = 0;
1009       t->num   = 0;
1010 
1011       t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
1012       if (IS_NULL(t->e)) {
1013         xfree(t);
1014         return ONIGERR_MEMORY;
1015       }
1016       t->alloc = alloc;
1017       reg->name_table = t;
1018       goto clear;
1019     }
1020     else if (t->num == t->alloc) {
1021       int i;
1022 
1023       alloc = t->alloc * 2;
1024       t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
1025       CHECK_NULL_RETURN_MEMERR(t->e);
1026       t->alloc = alloc;
1027 
1028     clear:
1029       for (i = t->num; i < t->alloc; i++) {
1030         t->e[i].name       = NULL;
1031         t->e[i].name_len   = 0;
1032         t->e[i].back_num   = 0;
1033         t->e[i].back_alloc = 0;
1034         t->e[i].back_refs  = (int* )NULL;
1035       }
1036     }
1037     e = &(t->e[t->num]);
1038     t->num++;
1039     e->name = onigenc_strdup(reg->enc, name, name_end);
1040     if (IS_NULL(e->name)) return ONIGERR_MEMORY;
1041     e->name_len = name_end - name;
1042 #endif
1043   }
1044 
1045   if (e->back_num >= 1 &&
1046       ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
1047     onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
1048                                    name, name_end);
1049     return ONIGERR_MULTIPLEX_DEFINED_NAME;
1050   }
1051 
1052   e->back_num++;
1053   if (e->back_num == 1) {
1054     e->back_ref1 = backref;
1055   }
1056   else {
1057     if (e->back_num == 2) {
1058       alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
1059       e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
1060       CHECK_NULL_RETURN_MEMERR(e->back_refs);
1061       e->back_alloc = alloc;
1062       e->back_refs[0] = e->back_ref1;
1063       e->back_refs[1] = backref;
1064     }
1065     else {
1066       if (e->back_num > e->back_alloc) {
1067         alloc = e->back_alloc * 2;
1068         e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
1069         CHECK_NULL_RETURN_MEMERR(e->back_refs);
1070         e->back_alloc = alloc;
1071       }
1072       e->back_refs[e->back_num - 1] = backref;
1073     }
1074   }
1075 
1076   return 0;
1077 }
1078 
1079 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)1080 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
1081                            const UChar* name_end, int** nums)
1082 {
1083   NameEntry* e = name_find(reg, name, name_end);
1084 
1085   if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
1086 
1087   switch (e->back_num) {
1088   case 0:
1089     break;
1090   case 1:
1091     *nums = &(e->back_ref1);
1092     break;
1093   default:
1094     *nums = e->back_refs;
1095     break;
1096   }
1097   return e->back_num;
1098 }
1099 
1100 static int
name_to_group_numbers(ScanEnv * env,const UChar * name,const UChar * name_end,int ** nums)1101 name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end,
1102                       int** nums)
1103 {
1104   regex_t* reg;
1105   NameEntry* e;
1106 
1107   reg = env->reg;
1108   e = name_find(reg, name, name_end);
1109 
1110   if (IS_NULL(e)) {
1111     onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
1112                                    (UChar* )name, (UChar* )name_end);
1113     return ONIGERR_UNDEFINED_NAME_REFERENCE;
1114   }
1115 
1116   switch (e->back_num) {
1117   case 0:
1118     break;
1119   case 1:
1120     *nums = &(e->back_ref1);
1121     break;
1122   default:
1123     *nums = e->back_refs;
1124     break;
1125   }
1126   return e->back_num;
1127 }
1128 
1129 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)1130 onig_name_to_backref_number(regex_t* reg, const UChar* name,
1131                             const UChar* name_end, OnigRegion *region)
1132 {
1133   int i, n, *nums;
1134 
1135   n = onig_name_to_group_numbers(reg, name, name_end, &nums);
1136   if (n < 0)
1137     return n;
1138   else if (n == 0)
1139     return ONIGERR_PARSER_BUG;
1140   else if (n == 1)
1141     return nums[0];
1142   else {
1143     if (IS_NOT_NULL(region)) {
1144       for (i = n - 1; i >= 0; i--) {
1145         if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
1146           return nums[i];
1147       }
1148     }
1149     return nums[n - 1];
1150   }
1151 }
1152 
1153 extern int
onig_noname_group_capture_is_active(regex_t * reg)1154 onig_noname_group_capture_is_active(regex_t* reg)
1155 {
1156   if (OPTON_DONT_CAPTURE_GROUP(reg->options))
1157     return 0;
1158 
1159   if (onig_number_of_names(reg) > 0 &&
1160       IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
1161       ! OPTON_CAPTURE_GROUP(reg->options)) {
1162     return 0;
1163   }
1164 
1165   return 1;
1166 }
1167 
1168 #ifdef USE_CALLOUT
1169 
1170 typedef struct {
1171   OnigCalloutType type;
1172   int             in;
1173   OnigCalloutFunc start_func;
1174   OnigCalloutFunc end_func;
1175   int             arg_num;
1176   int             opt_arg_num;
1177   unsigned int    arg_types[ONIG_CALLOUT_MAX_ARGS_NUM];
1178   OnigValue       opt_defaults[ONIG_CALLOUT_MAX_ARGS_NUM];
1179   UChar*          name; /* reference to GlobalCalloutNameTable entry: e->name */
1180 } CalloutNameListEntry;
1181 
1182 typedef struct {
1183   int  n;
1184   int  alloc;
1185   CalloutNameListEntry* v;
1186 } CalloutNameListType;
1187 
1188 static CalloutNameListType* GlobalCalloutNameList;
1189 
1190 static int
make_callout_func_list(CalloutNameListType ** rs,int init_size)1191 make_callout_func_list(CalloutNameListType** rs, int init_size)
1192 {
1193   CalloutNameListType* s;
1194   CalloutNameListEntry* v;
1195 
1196   *rs = 0;
1197 
1198   s = xmalloc(sizeof(*s));
1199   if (IS_NULL(s)) return ONIGERR_MEMORY;
1200 
1201   v = (CalloutNameListEntry* )xmalloc(sizeof(CalloutNameListEntry) * init_size);
1202   if (IS_NULL(v)) {
1203     xfree(s);
1204     return ONIGERR_MEMORY;
1205   }
1206 
1207   s->n = 0;
1208   s->alloc = init_size;
1209   s->v = v;
1210 
1211   *rs = s;
1212   return ONIG_NORMAL;
1213 }
1214 
1215 static void
free_callout_func_list(CalloutNameListType * s)1216 free_callout_func_list(CalloutNameListType* s)
1217 {
1218   if (IS_NOT_NULL(s)) {
1219     if (IS_NOT_NULL(s->v)) {
1220       int i, j;
1221 
1222       for (i = 0; i < s->n; i++) {
1223         CalloutNameListEntry* e = s->v + i;
1224         for (j = e->arg_num - e->opt_arg_num; j < e->arg_num; j++) {
1225           if (e->arg_types[j] == ONIG_TYPE_STRING) {
1226             UChar* p = e->opt_defaults[j].s.start;
1227             if (IS_NOT_NULL(p)) xfree(p);
1228           }
1229         }
1230       }
1231       xfree(s->v);
1232     }
1233     xfree(s);
1234   }
1235 }
1236 
1237 static int
callout_func_list_add(CalloutNameListType * s,int * rid)1238 callout_func_list_add(CalloutNameListType* s, int* rid)
1239 {
1240   if (s->n >= s->alloc) {
1241     int new_size = s->alloc * 2;
1242     CalloutNameListEntry* nv = (CalloutNameListEntry* )
1243       xrealloc(s->v, sizeof(CalloutNameListEntry) * new_size);
1244     if (IS_NULL(nv)) return ONIGERR_MEMORY;
1245 
1246     s->alloc = new_size;
1247     s->v = nv;
1248   }
1249 
1250   *rid = s->n;
1251 
1252   xmemset(&(s->v[s->n]), 0, sizeof(*(s->v)));
1253   s->n++;
1254   return ONIG_NORMAL;
1255 }
1256 
1257 
1258 typedef struct {
1259   UChar* name;
1260   int    name_len;   /* byte length */
1261   int    id;
1262 } CalloutNameEntry;
1263 
1264 #ifdef USE_ST_LIBRARY
1265 typedef st_table  CalloutNameTable;
1266 #else
1267 typedef struct {
1268   CalloutNameEntry* e;
1269   int               num;
1270   int               alloc;
1271 } CalloutNameTable;
1272 #endif
1273 
1274 static CalloutNameTable* GlobalCalloutNameTable;
1275 static int CalloutNameIDCounter;
1276 
1277 #ifdef USE_ST_LIBRARY
1278 
1279 static int
i_free_callout_name_entry(st_callout_name_key * key,CalloutNameEntry * e,void * arg ARG_UNUSED)1280 i_free_callout_name_entry(st_callout_name_key* key, CalloutNameEntry* e,
1281                           void* arg ARG_UNUSED)
1282 {
1283   xfree(e->name);
1284   /*xfree(key->s); */ /* is same as e->name */
1285   xfree(key);
1286   xfree(e);
1287   return ST_DELETE;
1288 }
1289 
1290 static int
callout_name_table_clear(CalloutNameTable * t)1291 callout_name_table_clear(CalloutNameTable* t)
1292 {
1293   if (IS_NOT_NULL(t)) {
1294     onig_st_foreach(t, i_free_callout_name_entry, 0);
1295   }
1296   return 0;
1297 }
1298 
1299 static int
global_callout_name_table_free(void)1300 global_callout_name_table_free(void)
1301 {
1302   if (IS_NOT_NULL(GlobalCalloutNameTable)) {
1303     int r = callout_name_table_clear(GlobalCalloutNameTable);
1304     if (r != 0) return r;
1305 
1306     onig_st_free_table(GlobalCalloutNameTable);
1307     GlobalCalloutNameTable = 0;
1308     CalloutNameIDCounter = 0;
1309   }
1310 
1311   return 0;
1312 }
1313 
1314 static CalloutNameEntry*
callout_name_find(OnigEncoding enc,int is_not_single,const UChar * name,const UChar * name_end)1315 callout_name_find(OnigEncoding enc, int is_not_single,
1316                   const UChar* name, const UChar* name_end)
1317 {
1318   int r;
1319   CalloutNameEntry* e;
1320   CalloutNameTable* t = GlobalCalloutNameTable;
1321 
1322   e = (CalloutNameEntry* )NULL;
1323   if (IS_NOT_NULL(t)) {
1324     r = onig_st_lookup_callout_name_table(t, enc, is_not_single, name, name_end,
1325                                           (HashDataType* )((void* )(&e)));
1326     if (r == 0) { /* not found */
1327       if (enc != ONIG_ENCODING_ASCII &&
1328           ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) {
1329         enc = ONIG_ENCODING_ASCII;
1330         onig_st_lookup_callout_name_table(t, enc, is_not_single, name, name_end,
1331                                           (HashDataType* )((void* )(&e)));
1332       }
1333     }
1334   }
1335   return e;
1336 }
1337 
1338 #else
1339 
1340 static int
callout_name_table_clear(CalloutNameTable * t)1341 callout_name_table_clear(CalloutNameTable* t)
1342 {
1343   int i;
1344   CalloutNameEntry* e;
1345 
1346   if (IS_NOT_NULL(t)) {
1347     for (i = 0; i < t->num; i++) {
1348       e = &(t->e[i]);
1349       if (IS_NOT_NULL(e->name)) {
1350         xfree(e->name);
1351         e->name     = NULL;
1352         e->name_len = 0;
1353         e->id       = 0;
1354         e->func     = 0;
1355       }
1356     }
1357     if (IS_NOT_NULL(t->e)) {
1358       xfree(t->e);
1359       t->e = NULL;
1360     }
1361     t->num = 0;
1362   }
1363   return 0;
1364 }
1365 
1366 static int
global_callout_name_table_free(void)1367 global_callout_name_table_free(void)
1368 {
1369   if (IS_NOT_NULL(GlobalCalloutNameTable)) {
1370     int r = callout_name_table_clear(GlobalCalloutNameTable);
1371     if (r != 0) return r;
1372 
1373     xfree(GlobalCalloutNameTable);
1374     GlobalCalloutNameTable = 0;
1375     CalloutNameIDCounter = 0;
1376   }
1377   return 0;
1378 }
1379 
1380 static CalloutNameEntry*
callout_name_find(UChar * name,UChar * name_end)1381 callout_name_find(UChar* name, UChar* name_end)
1382 {
1383   int i, len;
1384   CalloutNameEntry* e;
1385   CalloutNameTable* t = Calloutnames;
1386 
1387   if (IS_NOT_NULL(t)) {
1388     len = name_end - name;
1389     for (i = 0; i < t->num; i++) {
1390       e = &(t->e[i]);
1391       if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
1392         return e;
1393     }
1394   }
1395   return (CalloutNameEntry* )NULL;
1396 }
1397 
1398 #endif
1399 
1400 /* name string must be single byte char string. */
1401 static int
callout_name_entry(CalloutNameEntry ** rentry,OnigEncoding enc,int is_not_single,UChar * name,UChar * name_end)1402 callout_name_entry(CalloutNameEntry** rentry, OnigEncoding enc,
1403                    int is_not_single, UChar* name, UChar* name_end)
1404 {
1405   int r;
1406   CalloutNameEntry* e;
1407   CalloutNameTable* t = GlobalCalloutNameTable;
1408 
1409   *rentry = 0;
1410   if (name_end - name <= 0)
1411     return ONIGERR_INVALID_CALLOUT_NAME;
1412 
1413   e = callout_name_find(enc, is_not_single, name, name_end);
1414   if (IS_NULL(e)) {
1415 #ifdef USE_ST_LIBRARY
1416     if (IS_NULL(t)) {
1417       t = onig_st_init_callout_name_table_with_size(INIT_NAMES_ALLOC_NUM);
1418       CHECK_NULL_RETURN_MEMERR(t);
1419       GlobalCalloutNameTable = t;
1420     }
1421     e = (CalloutNameEntry* )xmalloc(sizeof(CalloutNameEntry));
1422     CHECK_NULL_RETURN_MEMERR(e);
1423 
1424     e->name = onigenc_strdup(enc, name, name_end);
1425     if (IS_NULL(e->name)) {
1426       xfree(e);  return ONIGERR_MEMORY;
1427     }
1428 
1429     r = st_insert_callout_name_table(t, enc, is_not_single,
1430                                      e->name, (e->name + (name_end - name)),
1431                                      (HashDataType )e);
1432     if (r < 0) return r;
1433 
1434 #else
1435 
1436     int alloc;
1437 
1438     if (IS_NULL(t)) {
1439       alloc = INIT_NAMES_ALLOC_NUM;
1440       t = (CalloutNameTable* )xmalloc(sizeof(CalloutNameTable));
1441       CHECK_NULL_RETURN_MEMERR(t);
1442       t->e     = NULL;
1443       t->alloc = 0;
1444       t->num   = 0;
1445 
1446       t->e = (CalloutNameEntry* )xmalloc(sizeof(CalloutNameEntry) * alloc);
1447       if (IS_NULL(t->e)) {
1448         xfree(t);
1449         return ONIGERR_MEMORY;
1450       }
1451       t->alloc = alloc;
1452       GlobalCalloutNameTable = t;
1453       goto clear;
1454     }
1455     else if (t->num == t->alloc) {
1456       int i;
1457 
1458       alloc = t->alloc * 2;
1459       t->e = (CalloutNameEntry* )xrealloc(t->e, sizeof(CalloutNameEntry) * alloc);
1460       CHECK_NULL_RETURN_MEMERR(t->e);
1461       t->alloc = alloc;
1462 
1463     clear:
1464       for (i = t->num; i < t->alloc; i++) {
1465         t->e[i].name       = NULL;
1466         t->e[i].name_len   = 0;
1467         t->e[i].id         = 0;
1468       }
1469     }
1470     e = &(t->e[t->num]);
1471     t->num++;
1472     e->name = onigenc_strdup(enc, name, name_end);
1473     if (IS_NULL(e->name)) return ONIGERR_MEMORY;
1474 #endif
1475 
1476     CalloutNameIDCounter++;
1477     e->id = CalloutNameIDCounter;
1478     e->name_len = (int )(name_end - name);
1479   }
1480 
1481   *rentry = e;
1482   return e->id;
1483 }
1484 
1485 static int
is_allowed_callout_name(OnigEncoding enc,UChar * name,UChar * name_end)1486 is_allowed_callout_name(OnigEncoding enc, UChar* name, UChar* name_end)
1487 {
1488   UChar* p;
1489   OnigCodePoint c;
1490 
1491   if (name >= name_end) return 0;
1492 
1493   p = name;
1494   while (p < name_end) {
1495     c = ONIGENC_MBC_TO_CODE(enc, p, name_end);
1496     if (! IS_ALLOWED_CODE_IN_CALLOUT_NAME(c))
1497       return 0;
1498 
1499     if (p == name) {
1500       if (c >= '0' && c <= '9') return 0;
1501     }
1502 
1503     p += ONIGENC_MBC_ENC_LEN(enc, p);
1504   }
1505 
1506   return 1;
1507 }
1508 
1509 static int
is_allowed_callout_tag_name(OnigEncoding enc,UChar * name,UChar * name_end)1510 is_allowed_callout_tag_name(OnigEncoding enc, UChar* name, UChar* name_end)
1511 {
1512   UChar* p;
1513   OnigCodePoint c;
1514 
1515   if (name >= name_end) return 0;
1516 
1517   p = name;
1518   while (p < name_end) {
1519     c = ONIGENC_MBC_TO_CODE(enc, p, name_end);
1520     if (! IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c))
1521       return 0;
1522 
1523     if (p == name) {
1524       if (c >= '0' && c <= '9') return 0;
1525     }
1526 
1527     p += ONIGENC_MBC_ENC_LEN(enc, p);
1528   }
1529 
1530   return 1;
1531 }
1532 
1533 extern int
onig_set_callout_of_name(OnigEncoding enc,OnigCalloutType callout_type,UChar * name,UChar * name_end,int in,OnigCalloutFunc start_func,OnigCalloutFunc end_func,int arg_num,unsigned int arg_types[],int opt_arg_num,OnigValue opt_defaults[])1534 onig_set_callout_of_name(OnigEncoding enc, OnigCalloutType callout_type,
1535                          UChar* name, UChar* name_end, int in,
1536                          OnigCalloutFunc start_func,
1537                          OnigCalloutFunc end_func,
1538                          int arg_num, unsigned int arg_types[],
1539                          int opt_arg_num, OnigValue opt_defaults[])
1540 {
1541   int r;
1542   int i;
1543   int j;
1544   int id;
1545   int is_not_single;
1546   CalloutNameEntry* e;
1547   CalloutNameListEntry* fe;
1548 
1549   if (callout_type != ONIG_CALLOUT_TYPE_SINGLE)
1550     return ONIGERR_INVALID_ARGUMENT;
1551 
1552   if (arg_num < 0 || arg_num > ONIG_CALLOUT_MAX_ARGS_NUM)
1553     return ONIGERR_INVALID_CALLOUT_ARG;
1554 
1555   if (opt_arg_num < 0 || opt_arg_num > arg_num)
1556     return ONIGERR_INVALID_CALLOUT_ARG;
1557 
1558   if (start_func == 0 && end_func == 0)
1559     return ONIGERR_INVALID_CALLOUT_ARG;
1560 
1561   if ((in & ONIG_CALLOUT_IN_PROGRESS) == 0 && (in & ONIG_CALLOUT_IN_RETRACTION) == 0)
1562     return ONIGERR_INVALID_CALLOUT_ARG;
1563 
1564   for (i = 0; i < arg_num; i++) {
1565     unsigned int t = arg_types[i];
1566     if (t == ONIG_TYPE_VOID)
1567       return ONIGERR_INVALID_CALLOUT_ARG;
1568     else {
1569       if (i >= arg_num - opt_arg_num) {
1570         if (t != ONIG_TYPE_LONG && t != ONIG_TYPE_CHAR && t != ONIG_TYPE_STRING &&
1571             t != ONIG_TYPE_TAG)
1572           return ONIGERR_INVALID_CALLOUT_ARG;
1573       }
1574       else {
1575         if (t != ONIG_TYPE_LONG) {
1576           t = t & ~ONIG_TYPE_LONG;
1577           if (t != ONIG_TYPE_CHAR && t != ONIG_TYPE_STRING && t != ONIG_TYPE_TAG)
1578             return ONIGERR_INVALID_CALLOUT_ARG;
1579         }
1580       }
1581     }
1582   }
1583 
1584   if (! is_allowed_callout_name(enc, name, name_end)) {
1585     return ONIGERR_INVALID_CALLOUT_NAME;
1586   }
1587 
1588   is_not_single = (callout_type != ONIG_CALLOUT_TYPE_SINGLE);
1589   id = callout_name_entry(&e, enc, is_not_single, name, name_end);
1590   if (id < 0) return id;
1591 
1592   r = ONIG_NORMAL;
1593   if (IS_NULL(GlobalCalloutNameList)) {
1594     r = make_callout_func_list(&GlobalCalloutNameList, 10);
1595     if (r != ONIG_NORMAL) return r;
1596   }
1597 
1598   while (id >= GlobalCalloutNameList->n) {
1599     int rid;
1600     r = callout_func_list_add(GlobalCalloutNameList, &rid);
1601     if (r != ONIG_NORMAL) return r;
1602   }
1603 
1604   fe = GlobalCalloutNameList->v + id;
1605   fe->type         = callout_type;
1606   fe->in           = in;
1607   fe->start_func   = start_func;
1608   fe->end_func     = end_func;
1609   fe->arg_num      = arg_num;
1610   fe->opt_arg_num  = opt_arg_num;
1611   fe->name         = e->name;
1612 
1613   for (i = 0; i < arg_num; i++) {
1614     fe->arg_types[i] = arg_types[i];
1615   }
1616   for (i = arg_num - opt_arg_num, j = 0; i < arg_num; i++, j++) {
1617     if (IS_NULL(opt_defaults)) return ONIGERR_INVALID_ARGUMENT;
1618     if (fe->arg_types[i] == ONIG_TYPE_STRING) {
1619       OnigValue* val;
1620       UChar* ds;
1621 
1622       val = opt_defaults + j;
1623       ds = onigenc_strdup(enc, val->s.start, val->s.end);
1624       CHECK_NULL_RETURN_MEMERR(ds);
1625 
1626       fe->opt_defaults[i].s.start = ds;
1627       fe->opt_defaults[i].s.end   = ds + (val->s.end - val->s.start);
1628     }
1629     else {
1630       fe->opt_defaults[i] = opt_defaults[j];
1631     }
1632   }
1633 
1634   r = id;
1635   return r;
1636 }
1637 
1638 static int
get_callout_name_id_by_name(OnigEncoding enc,int is_not_single,UChar * name,UChar * name_end,int * rid)1639 get_callout_name_id_by_name(OnigEncoding enc, int is_not_single,
1640                             UChar* name, UChar* name_end, int* rid)
1641 {
1642   int r;
1643   CalloutNameEntry* e;
1644 
1645   if (! is_allowed_callout_name(enc, name, name_end)) {
1646     return ONIGERR_INVALID_CALLOUT_NAME;
1647   }
1648 
1649   e = callout_name_find(enc, is_not_single, name, name_end);
1650   if (IS_NULL(e)) {
1651     return ONIGERR_UNDEFINED_CALLOUT_NAME;
1652   }
1653 
1654   r = ONIG_NORMAL;
1655   *rid = e->id;
1656 
1657   return r;
1658 }
1659 
1660 extern OnigCalloutFunc
onig_get_callout_start_func(regex_t * reg,int callout_num)1661 onig_get_callout_start_func(regex_t* reg, int callout_num)
1662 {
1663   /* If used for callouts of contents, return 0. */
1664   CalloutListEntry* e;
1665 
1666   e = onig_reg_callout_list_at(reg, callout_num);
1667   CHECK_NULL_RETURN(e);
1668   return e->start_func;
1669 }
1670 
1671 extern const UChar*
onig_get_callout_tag_start(regex_t * reg,int callout_num)1672 onig_get_callout_tag_start(regex_t* reg, int callout_num)
1673 {
1674   CalloutListEntry* e = onig_reg_callout_list_at(reg, callout_num);
1675   CHECK_NULL_RETURN(e);
1676   return e->tag_start;
1677 }
1678 
1679 extern const UChar*
onig_get_callout_tag_end(regex_t * reg,int callout_num)1680 onig_get_callout_tag_end(regex_t* reg, int callout_num)
1681 {
1682   CalloutListEntry* e = onig_reg_callout_list_at(reg, callout_num);
1683   CHECK_NULL_RETURN(e);
1684   return e->tag_end;
1685 }
1686 
1687 
1688 extern OnigCalloutType
onig_get_callout_type_by_name_id(int name_id)1689 onig_get_callout_type_by_name_id(int name_id)
1690 {
1691   if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1692     return 0;
1693 
1694   return GlobalCalloutNameList->v[name_id].type;
1695 }
1696 
1697 extern OnigCalloutFunc
onig_get_callout_start_func_by_name_id(int name_id)1698 onig_get_callout_start_func_by_name_id(int name_id)
1699 {
1700   if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1701     return 0;
1702 
1703   return GlobalCalloutNameList->v[name_id].start_func;
1704 }
1705 
1706 extern OnigCalloutFunc
onig_get_callout_end_func_by_name_id(int name_id)1707 onig_get_callout_end_func_by_name_id(int name_id)
1708 {
1709   if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1710     return 0;
1711 
1712   return GlobalCalloutNameList->v[name_id].end_func;
1713 }
1714 
1715 extern int
onig_get_callout_in_by_name_id(int name_id)1716 onig_get_callout_in_by_name_id(int name_id)
1717 {
1718   if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1719     return 0;
1720 
1721   return GlobalCalloutNameList->v[name_id].in;
1722 }
1723 
1724 static int
get_callout_arg_num_by_name_id(int name_id)1725 get_callout_arg_num_by_name_id(int name_id)
1726 {
1727   return GlobalCalloutNameList->v[name_id].arg_num;
1728 }
1729 
1730 static int
get_callout_opt_arg_num_by_name_id(int name_id)1731 get_callout_opt_arg_num_by_name_id(int name_id)
1732 {
1733   return GlobalCalloutNameList->v[name_id].opt_arg_num;
1734 }
1735 
1736 static unsigned int
get_callout_arg_type_by_name_id(int name_id,int index)1737 get_callout_arg_type_by_name_id(int name_id, int index)
1738 {
1739   return GlobalCalloutNameList->v[name_id].arg_types[index];
1740 }
1741 
1742 static OnigValue
get_callout_opt_default_by_name_id(int name_id,int index)1743 get_callout_opt_default_by_name_id(int name_id, int index)
1744 {
1745   return GlobalCalloutNameList->v[name_id].opt_defaults[index];
1746 }
1747 
1748 extern UChar*
onig_get_callout_name_by_name_id(int name_id)1749 onig_get_callout_name_by_name_id(int name_id)
1750 {
1751   if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1752     return 0;
1753 
1754   return GlobalCalloutNameList->v[name_id].name;
1755 }
1756 
1757 extern int
onig_global_callout_names_free(void)1758 onig_global_callout_names_free(void)
1759 {
1760   free_callout_func_list(GlobalCalloutNameList);
1761   GlobalCalloutNameList = 0;
1762 
1763   global_callout_name_table_free();
1764   return ONIG_NORMAL;
1765 }
1766 
1767 
1768 typedef st_table   CalloutTagTable;
1769 typedef intptr_t   CalloutTagVal;
1770 
1771 #define CALLOUT_TAG_LIST_FLAG_TAG_EXIST     (1<<0)
1772 
1773 static int
i_callout_callout_list_set(UChar * key,CalloutTagVal e,void * arg)1774 i_callout_callout_list_set(UChar* key, CalloutTagVal e, void* arg)
1775 {
1776   int num;
1777   RegexExt* ext = (RegexExt* )arg;
1778 
1779   num = (int )e - 1;
1780   ext->callout_list[num].flag |= CALLOUT_TAG_LIST_FLAG_TAG_EXIST;
1781   return ST_CONTINUE;
1782 }
1783 
1784 static int
setup_ext_callout_list_values(regex_t * reg)1785 setup_ext_callout_list_values(regex_t* reg)
1786 {
1787   int i, j;
1788   RegexExt* ext;
1789 
1790   ext = reg->extp;
1791   if (IS_NOT_NULL(ext->tag_table)) {
1792     onig_st_foreach((CalloutTagTable *)ext->tag_table, i_callout_callout_list_set,
1793                     (st_data_t )ext);
1794   }
1795 
1796   for (i = 0; i < ext->callout_num; i++) {
1797     CalloutListEntry* e = ext->callout_list + i;
1798     if (e->of == ONIG_CALLOUT_OF_NAME) {
1799       for (j = 0; j < e->u.arg.num; j++) {
1800         if (e->u.arg.types[j] == ONIG_TYPE_TAG) {
1801           UChar* start;
1802           UChar* end;
1803           int num;
1804           start = e->u.arg.vals[j].s.start;
1805           end   = e->u.arg.vals[j].s.end;
1806           num = onig_get_callout_num_by_tag(reg, start, end);
1807           if (num < 0) return num;
1808           e->u.arg.vals[j].tag = num;
1809         }
1810       }
1811     }
1812   }
1813 
1814   return ONIG_NORMAL;
1815 }
1816 
1817 extern int
onig_callout_tag_is_exist_at_callout_num(regex_t * reg,int callout_num)1818 onig_callout_tag_is_exist_at_callout_num(regex_t* reg, int callout_num)
1819 {
1820   RegexExt* ext = reg->extp;
1821 
1822   if (IS_NULL(ext) || IS_NULL(ext->callout_list)) return 0;
1823   if (callout_num > ext->callout_num) return 0;
1824 
1825   return (ext->callout_list[callout_num].flag &
1826           CALLOUT_TAG_LIST_FLAG_TAG_EXIST) != 0;
1827 }
1828 
1829 static int
i_free_callout_tag_entry(UChar * key,CalloutTagVal e,void * arg ARG_UNUSED)1830 i_free_callout_tag_entry(UChar* key, CalloutTagVal e, void* arg ARG_UNUSED)
1831 {
1832   xfree(key);
1833   return ST_DELETE;
1834 }
1835 
1836 static int
callout_tag_table_clear(CalloutTagTable * t)1837 callout_tag_table_clear(CalloutTagTable* t)
1838 {
1839   if (IS_NOT_NULL(t)) {
1840     onig_st_foreach(t, i_free_callout_tag_entry, 0);
1841   }
1842   return 0;
1843 }
1844 
1845 extern int
onig_callout_tag_table_free(void * table)1846 onig_callout_tag_table_free(void* table)
1847 {
1848   CalloutTagTable* t = (CalloutTagTable* )table;
1849 
1850   if (IS_NOT_NULL(t)) {
1851     int r = callout_tag_table_clear(t);
1852     if (r != 0) return r;
1853 
1854     onig_st_free_table(t);
1855   }
1856 
1857   return 0;
1858 }
1859 
1860 extern int
onig_get_callout_num_by_tag(regex_t * reg,const UChar * tag,const UChar * tag_end)1861 onig_get_callout_num_by_tag(regex_t* reg,
1862                             const UChar* tag, const UChar* tag_end)
1863 {
1864   int r;
1865   RegexExt* ext;
1866   CalloutTagVal e;
1867 
1868   ext = reg->extp;
1869   if (IS_NULL(ext) || IS_NULL(ext->tag_table))
1870     return ONIGERR_INVALID_CALLOUT_TAG_NAME;
1871 
1872   r = onig_st_lookup_strend(ext->tag_table, tag, tag_end,
1873                             (HashDataType* )((void* )(&e)));
1874   if (r == 0) return ONIGERR_INVALID_CALLOUT_TAG_NAME;
1875   return (int )e;
1876 }
1877 
1878 static CalloutTagVal
callout_tag_find(CalloutTagTable * t,const UChar * name,const UChar * name_end)1879 callout_tag_find(CalloutTagTable* t, const UChar* name, const UChar* name_end)
1880 {
1881   CalloutTagVal e;
1882 
1883   e = -1;
1884   if (IS_NOT_NULL(t)) {
1885     onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
1886   }
1887   return e;
1888 }
1889 
1890 static int
callout_tag_table_new(CalloutTagTable ** rt)1891 callout_tag_table_new(CalloutTagTable** rt)
1892 {
1893   CalloutTagTable* t;
1894 
1895   *rt = 0;
1896   t = onig_st_init_strend_table_with_size(INIT_TAG_NAMES_ALLOC_NUM);
1897   CHECK_NULL_RETURN_MEMERR(t);
1898 
1899   *rt = t;
1900   return ONIG_NORMAL;
1901 }
1902 
1903 static int
callout_tag_entry_raw(ScanEnv * env,CalloutTagTable * t,UChar * name,UChar * name_end,CalloutTagVal entry_val)1904 callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name,
1905                       UChar* name_end, CalloutTagVal entry_val)
1906 {
1907   int r;
1908   CalloutTagVal val;
1909 
1910   if (name_end - name <= 0)
1911     return ONIGERR_INVALID_CALLOUT_TAG_NAME;
1912 
1913   val = callout_tag_find(t, name, name_end);
1914   if (val >= 0) {
1915     onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
1916                                    name, name_end);
1917     return ONIGERR_MULTIPLEX_DEFINED_NAME;
1918   }
1919 
1920   r = onig_st_insert_strend(t, name, name_end, (HashDataType )entry_val);
1921   if (r < 0) return r;
1922 
1923   return ONIG_NORMAL;
1924 }
1925 
1926 static int
ext_ensure_tag_table(regex_t * reg)1927 ext_ensure_tag_table(regex_t* reg)
1928 {
1929   int r;
1930   RegexExt* ext;
1931   CalloutTagTable* t;
1932 
1933   ext = onig_get_regex_ext(reg);
1934   CHECK_NULL_RETURN_MEMERR(ext);
1935 
1936   if (IS_NULL(ext->tag_table)) {
1937     r = callout_tag_table_new(&t);
1938     if (r != ONIG_NORMAL) return r;
1939 
1940     ext->tag_table = t;
1941   }
1942 
1943   return ONIG_NORMAL;
1944 }
1945 
1946 static int
callout_tag_entry(ScanEnv * env,regex_t * reg,UChar * name,UChar * name_end,CalloutTagVal entry_val)1947 callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end,
1948                   CalloutTagVal entry_val)
1949 {
1950   int r;
1951   RegexExt* ext;
1952   CalloutListEntry* e;
1953 
1954   r = ext_ensure_tag_table(reg);
1955   if (r != ONIG_NORMAL) return r;
1956 
1957   ext = onig_get_regex_ext(reg);
1958   CHECK_NULL_RETURN_MEMERR(ext);
1959   r = callout_tag_entry_raw(env, ext->tag_table, name, name_end, entry_val);
1960 
1961   e = onig_reg_callout_list_at(reg, (int )entry_val);
1962   CHECK_NULL_RETURN_MEMERR(e);
1963   e->tag_start = name;
1964   e->tag_end   = name_end;
1965 
1966   return r;
1967 }
1968 
1969 #endif /* USE_CALLOUT */
1970 
1971 
/* Number of MemEnv entries in the first heap allocation made by
   scan_env_add_mem_entry(). */
#define INIT_SCANENV_MEMENV_ALLOC_SIZE   16
1973 
1974 static void
scan_env_clear(ScanEnv * env)1975 scan_env_clear(ScanEnv* env)
1976 {
1977   MEM_STATUS_CLEAR(env->cap_history);
1978   MEM_STATUS_CLEAR(env->backtrack_mem);
1979   MEM_STATUS_CLEAR(env->backrefed_mem);
1980   env->error      = (UChar* )NULL;
1981   env->error_end  = (UChar* )NULL;
1982   env->num_call   = 0;
1983 
1984 #ifdef USE_CALL
1985   env->unset_addr_list = NULL;
1986   env->has_call_zero   = 0;
1987 #endif
1988 
1989   env->num_mem    = 0;
1990   env->num_named  = 0;
1991   env->mem_alloc  = 0;
1992   env->mem_env_dynamic = (MemEnv* )NULL;
1993 
1994   xmemset(env->mem_env_static, 0, sizeof(env->mem_env_static));
1995 
1996   env->parse_depth      = 0;
1997 #ifdef ONIG_DEBUG_PARSE
1998   env->max_parse_depth  = 0;
1999 #endif
2000   env->backref_num      = 0;
2001   env->keep_num         = 0;
2002   env->id_num           = 0;
2003   env->save_alloc_num   = 0;
2004   env->saves            = 0;
2005 }
2006 
2007 static int
scan_env_add_mem_entry(ScanEnv * env)2008 scan_env_add_mem_entry(ScanEnv* env)
2009 {
2010   int i, need, alloc;
2011   MemEnv* p;
2012 
2013   need = env->num_mem + 1;
2014   if (need > MaxCaptureNum && MaxCaptureNum != 0)
2015     return ONIGERR_TOO_MANY_CAPTURES;
2016 
2017   if (need >= SCANENV_MEMENV_SIZE) {
2018     if (env->mem_alloc <= need) {
2019       if (IS_NULL(env->mem_env_dynamic)) {
2020         alloc = INIT_SCANENV_MEMENV_ALLOC_SIZE;
2021         p = (MemEnv* )xmalloc(sizeof(MemEnv) * alloc);
2022         CHECK_NULL_RETURN_MEMERR(p);
2023         xmemcpy(p, env->mem_env_static, sizeof(env->mem_env_static));
2024       }
2025       else {
2026         alloc = env->mem_alloc * 2;
2027         p = (MemEnv* )xrealloc(env->mem_env_dynamic, sizeof(MemEnv) * alloc);
2028         CHECK_NULL_RETURN_MEMERR(p);
2029       }
2030 
2031       for (i = env->num_mem + 1; i < alloc; i++) {
2032         p[i].mem_node = NULL_NODE;
2033         p[i].empty_repeat_node = NULL_NODE;
2034       }
2035 
2036       env->mem_env_dynamic = p;
2037       env->mem_alloc = alloc;
2038     }
2039   }
2040 
2041   env->num_mem++;
2042   return env->num_mem;
2043 }
2044 
2045 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)2046 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
2047 {
2048   if (env->num_mem >= num)
2049     SCANENV_MEMENV(env)[num].mem_node = node;
2050   else
2051     return ONIGERR_PARSER_BUG;
2052   return 0;
2053 }
2054 
2055 static void
node_free_body(Node * node)2056 node_free_body(Node* node)
2057 {
2058   if (IS_NULL(node)) return ;
2059 
2060   switch (NODE_TYPE(node)) {
2061   case NODE_STRING:
2062     if (STR_(node)->capacity != 0 &&
2063         IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != STR_(node)->buf) {
2064       xfree(STR_(node)->s);
2065     }
2066     break;
2067 
2068   case NODE_LIST:
2069   case NODE_ALT:
2070     onig_node_free(NODE_CAR(node));
2071     node = NODE_CDR(node);
2072     while (IS_NOT_NULL(node)) {
2073       Node* next = NODE_CDR(node);
2074       onig_node_free(NODE_CAR(node));
2075       xfree(node);
2076       node = next;
2077     }
2078     break;
2079 
2080   case NODE_CCLASS:
2081     {
2082       CClassNode* cc = CCLASS_(node);
2083 
2084       if (cc->mbuf)
2085         bbuf_free(cc->mbuf);
2086     }
2087     break;
2088 
2089   case NODE_BACKREF:
2090     if (IS_NOT_NULL(BACKREF_(node)->back_dynamic))
2091       xfree(BACKREF_(node)->back_dynamic);
2092     break;
2093 
2094   case NODE_BAG:
2095     if (NODE_BODY(node))
2096       onig_node_free(NODE_BODY(node));
2097 
2098     {
2099       BagNode* en = BAG_(node);
2100       if (en->type == BAG_IF_ELSE) {
2101         onig_node_free(en->te.Then);
2102         onig_node_free(en->te.Else);
2103       }
2104     }
2105     break;
2106 
2107   case NODE_QUANT:
2108     if (NODE_BODY(node))
2109       onig_node_free(NODE_BODY(node));
2110     break;
2111 
2112   case NODE_ANCHOR:
2113     if (NODE_BODY(node))
2114       onig_node_free(NODE_BODY(node));
2115     if (IS_NOT_NULL(ANCHOR_(node)->lead_node))
2116       onig_node_free(ANCHOR_(node)->lead_node);
2117     break;
2118 
2119   case NODE_CTYPE:
2120   case NODE_CALL:
2121   case NODE_GIMMICK:
2122     break;
2123   }
2124 }
2125 
2126 extern void
onig_node_free(Node * node)2127 onig_node_free(Node* node)
2128 {
2129   if (IS_NULL(node)) return ;
2130 
2131 #ifdef DEBUG_NODE_FREE
2132   fprintf(stderr, "onig_node_free: %p\n", node);
2133 #endif
2134 
2135   node_free_body(node);
2136   xfree(node);
2137 }
2138 
2139 static void
cons_node_free_alone(Node * node)2140 cons_node_free_alone(Node* node)
2141 {
2142   NODE_CAR(node) = 0;
2143   NODE_CDR(node) = 0;
2144   onig_node_free(node);
2145 }
2146 
2147 static Node*
node_new(void)2148 node_new(void)
2149 {
2150   Node* node;
2151 
2152   node = (Node* )xmalloc(sizeof(Node));
2153   CHECK_NULL_RETURN(node);
2154   xmemset(node, 0, sizeof(*node));
2155 
2156 #ifdef DEBUG_NODE_FREE
2157   fprintf(stderr, "node_new: %p\n", node);
2158 #endif
2159   return node;
2160 }
2161 
2162 extern int
onig_node_copy(Node ** rcopy,Node * from)2163 onig_node_copy(Node** rcopy, Node* from)
2164 {
2165   int r;
2166   Node* copy;
2167 
2168   *rcopy = NULL_NODE;
2169 
2170   switch (NODE_TYPE(from)) {
2171   case NODE_LIST:
2172   case NODE_ALT:
2173   case NODE_ANCHOR:
2174     /* These node's link to other nodes are processed by caller. */
2175     break;
2176   case NODE_STRING:
2177   case NODE_CCLASS:
2178   case NODE_CTYPE:
2179     /* Fixed contents after copy. */
2180     break;
2181   default:
2182     /* Not supported yet. */
2183     return ONIGERR_TYPE_BUG;
2184     break;
2185   }
2186 
2187   copy = node_new();
2188   CHECK_NULL_RETURN_MEMERR(copy);
2189   xmemcpy(copy, from, sizeof(*copy));
2190 
2191   switch (NODE_TYPE(copy)) {
2192   case NODE_STRING:
2193     r = onig_node_str_set(copy, STR_(from)->s, STR_(from)->end, FALSE);
2194     if (r != 0) {
2195     err:
2196       onig_node_free(copy);
2197       return r;
2198     }
2199     break;
2200 
2201   case NODE_CCLASS:
2202     {
2203       CClassNode *fcc, *tcc;
2204 
2205       fcc = CCLASS_(from);
2206       tcc = CCLASS_(copy);
2207       if (IS_NOT_NULL(fcc->mbuf)) {
2208         r = bbuf_clone(&(tcc->mbuf), fcc->mbuf);
2209         if (r != 0) goto err;
2210       }
2211     }
2212     break;
2213 
2214   default:
2215     break;
2216   }
2217 
2218   *rcopy = copy;
2219   return ONIG_NORMAL;
2220 }
2221 
2222 
2223 static void
initialize_cclass(CClassNode * cc)2224 initialize_cclass(CClassNode* cc)
2225 {
2226   BITSET_CLEAR(cc->bs);
2227   cc->flags = 0;
2228   cc->mbuf  = NULL;
2229 }
2230 
2231 static Node*
node_new_cclass(void)2232 node_new_cclass(void)
2233 {
2234   Node* node = node_new();
2235   CHECK_NULL_RETURN(node);
2236 
2237   NODE_SET_TYPE(node, NODE_CCLASS);
2238   initialize_cclass(CCLASS_(node));
2239   return node;
2240 }
2241 
2242 static Node*
node_new_ctype(int type,int not,OnigOptionType options)2243 node_new_ctype(int type, int not, OnigOptionType options)
2244 {
2245   Node* node = node_new();
2246   CHECK_NULL_RETURN(node);
2247 
2248   NODE_SET_TYPE(node, NODE_CTYPE);
2249   CTYPE_(node)->ctype   = type;
2250   CTYPE_(node)->not     = not;
2251   CTYPE_(node)->ascii_mode = OPTON_IS_ASCII_MODE_CTYPE(type, options);
2252   return node;
2253 }
2254 
2255 static Node*
node_new_anychar(OnigOptionType options)2256 node_new_anychar(OnigOptionType options)
2257 {
2258   Node* node;
2259 
2260   node = node_new_ctype(CTYPE_ANYCHAR, FALSE, options);
2261   CHECK_NULL_RETURN(node);
2262 
2263   if (OPTON_MULTILINE(options))
2264     NODE_STATUS_ADD(node, MULTILINE);
2265   return node;
2266 }
2267 
2268 static int
node_new_no_newline(Node ** node,ScanEnv * env)2269 node_new_no_newline(Node** node, ScanEnv* env)
2270 {
2271   Node* n;
2272 
2273   n = node_new_anychar(ONIG_OPTION_NONE);
2274   CHECK_NULL_RETURN_MEMERR(n);
2275   *node = n;
2276   return 0;
2277 }
2278 
2279 static int
node_new_true_anychar(Node ** node)2280 node_new_true_anychar(Node** node)
2281 {
2282   Node* n;
2283 
2284   n = node_new_anychar(ONIG_OPTION_MULTILINE);
2285   CHECK_NULL_RETURN_MEMERR(n);
2286   *node = n;
2287   return 0;
2288 }
2289 
2290 static Node*
node_new_list(Node * left,Node * right)2291 node_new_list(Node* left, Node* right)
2292 {
2293   Node* node = node_new();
2294   CHECK_NULL_RETURN(node);
2295 
2296   NODE_SET_TYPE(node, NODE_LIST);
2297   NODE_CAR(node)  = left;
2298   NODE_CDR(node) = right;
2299   return node;
2300 }
2301 
2302 extern Node*
onig_node_new_list(Node * left,Node * right)2303 onig_node_new_list(Node* left, Node* right)
2304 {
2305   return node_new_list(left, right);
2306 }
2307 
2308 extern Node*
onig_node_new_alt(Node * left,Node * right)2309 onig_node_new_alt(Node* left, Node* right)
2310 {
2311   Node* node = node_new();
2312   CHECK_NULL_RETURN(node);
2313 
2314   NODE_SET_TYPE(node, NODE_ALT);
2315   NODE_CAR(node)  = left;
2316   NODE_CDR(node) = right;
2317   return node;
2318 }
2319 
2320 static Node*
make_list_or_alt(NodeType type,int n,Node * ns[])2321 make_list_or_alt(NodeType type, int n, Node* ns[])
2322 {
2323   Node* r;
2324 
2325   if (n <= 0) return NULL_NODE;
2326 
2327   if (n == 1) {
2328     r = node_new();
2329     CHECK_NULL_RETURN(r);
2330     NODE_SET_TYPE(r, type);
2331     NODE_CAR(r) = ns[0];
2332     NODE_CDR(r) = NULL_NODE;
2333   }
2334   else {
2335     Node* right;
2336 
2337     r = node_new();
2338     CHECK_NULL_RETURN(r);
2339 
2340     right = make_list_or_alt(type, n - 1, ns + 1);
2341     if (IS_NULL(right)) {
2342       onig_node_free(r);
2343       return NULL_NODE;
2344     }
2345 
2346     NODE_SET_TYPE(r, type);
2347     NODE_CAR(r) = ns[0];
2348     NODE_CDR(r) = right;
2349   }
2350 
2351   return r;
2352 }
2353 
2354 static Node*
make_list(int n,Node * ns[])2355 make_list(int n, Node* ns[])
2356 {
2357   return make_list_or_alt(NODE_LIST, n, ns);
2358 }
2359 
2360 static Node*
make_alt(int n,Node * ns[])2361 make_alt(int n, Node* ns[])
2362 {
2363   return make_list_or_alt(NODE_ALT, n, ns);
2364 }
2365 
2366 static Node*
node_new_anchor(int type)2367 node_new_anchor(int type)
2368 {
2369   Node* node;
2370 
2371   node = node_new();
2372   CHECK_NULL_RETURN(node);
2373 
2374   NODE_SET_TYPE(node, NODE_ANCHOR);
2375   ANCHOR_(node)->type       = type;
2376   ANCHOR_(node)->char_min_len = 0;
2377   ANCHOR_(node)->char_max_len = INFINITE_LEN;
2378   ANCHOR_(node)->ascii_mode = 0;
2379   ANCHOR_(node)->lead_node  = NULL_NODE;
2380   return node;
2381 }
2382 
2383 static Node*
node_new_anchor_with_options(int type,OnigOptionType options)2384 node_new_anchor_with_options(int type, OnigOptionType options)
2385 {
2386   int ascii_mode;
2387   Node* node;
2388 
2389   node = node_new_anchor(type);
2390   CHECK_NULL_RETURN(node);
2391 
2392   ascii_mode = OPTON_WORD_ASCII(options) && IS_WORD_ANCHOR_TYPE(type) ? 1 : 0;
2393   ANCHOR_(node)->ascii_mode = ascii_mode;
2394 
2395   if (type == ANCR_TEXT_SEGMENT_BOUNDARY ||
2396       type == ANCR_NO_TEXT_SEGMENT_BOUNDARY) {
2397     if (OPTON_TEXT_SEGMENT_WORD(options))
2398       NODE_STATUS_ADD(node, TEXT_SEGMENT_WORD);
2399   }
2400 
2401   return node;
2402 }
2403 
2404 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)2405 node_new_backref(int back_num, int* backrefs, int by_name,
2406 #ifdef USE_BACKREF_WITH_LEVEL
2407                  int exist_level, int nest_level,
2408 #endif
2409                  ScanEnv* env)
2410 {
2411   int i;
2412   Node* node;
2413 
2414   node = node_new();
2415   CHECK_NULL_RETURN(node);
2416 
2417   NODE_SET_TYPE(node, NODE_BACKREF);
2418   BACKREF_(node)->back_num = back_num;
2419   BACKREF_(node)->back_dynamic = (int* )NULL;
2420   if (by_name != 0)
2421     NODE_STATUS_ADD(node, BY_NAME);
2422 
2423   if (OPTON_IGNORECASE(env->options))
2424     NODE_STATUS_ADD(node, IGNORECASE);
2425 
2426 #ifdef USE_BACKREF_WITH_LEVEL
2427   if (exist_level != 0) {
2428     NODE_STATUS_ADD(node, NEST_LEVEL);
2429     BACKREF_(node)->nest_level  = nest_level;
2430   }
2431 #endif
2432 
2433   for (i = 0; i < back_num; i++) {
2434     if (backrefs[i] <= env->num_mem &&
2435         IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) {
2436       NODE_STATUS_ADD(node, RECURSION);   /* /...(\1).../ */
2437       break;
2438     }
2439   }
2440 
2441   if (back_num <= NODE_BACKREFS_SIZE) {
2442     for (i = 0; i < back_num; i++)
2443       BACKREF_(node)->back_static[i] = backrefs[i];
2444   }
2445   else {
2446     int* p = (int* )xmalloc(sizeof(int) * back_num);
2447     if (IS_NULL(p)) {
2448       onig_node_free(node);
2449       return NULL;
2450     }
2451     BACKREF_(node)->back_dynamic = p;
2452     for (i = 0; i < back_num; i++)
2453       p[i] = backrefs[i];
2454   }
2455 
2456   env->backref_num++;
2457   return node;
2458 }
2459 
2460 static Node*
node_new_backref_checker(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)2461 node_new_backref_checker(int back_num, int* backrefs, int by_name,
2462 #ifdef USE_BACKREF_WITH_LEVEL
2463                          int exist_level, int nest_level,
2464 #endif
2465                          ScanEnv* env)
2466 {
2467   Node* node;
2468 
2469   node = node_new_backref(back_num, backrefs, by_name,
2470 #ifdef USE_BACKREF_WITH_LEVEL
2471                           exist_level, nest_level,
2472 #endif
2473                           env);
2474   CHECK_NULL_RETURN(node);
2475 
2476   NODE_STATUS_ADD(node, CHECKER);
2477   return node;
2478 }
2479 
2480 #ifdef USE_CALL
2481 static Node*
node_new_call(UChar * name,UChar * name_end,int gnum,int by_number)2482 node_new_call(UChar* name, UChar* name_end, int gnum, int by_number)
2483 {
2484   Node* node = node_new();
2485   CHECK_NULL_RETURN(node);
2486 
2487   NODE_SET_TYPE(node, NODE_CALL);
2488   CALL_(node)->by_number   = by_number;
2489   CALL_(node)->name        = name;
2490   CALL_(node)->name_end    = name_end;
2491   CALL_(node)->group_num   = gnum;
2492   CALL_(node)->entry_count = 1;
2493   return node;
2494 }
2495 #endif
2496 
2497 static Node*
node_new_quantifier(int lower,int upper,int by_number)2498 node_new_quantifier(int lower, int upper, int by_number)
2499 {
2500   Node* node = node_new();
2501   CHECK_NULL_RETURN(node);
2502 
2503   NODE_SET_TYPE(node, NODE_QUANT);
2504   QUANT_(node)->lower            = lower;
2505   QUANT_(node)->upper            = upper;
2506   QUANT_(node)->greedy           = 1;
2507   QUANT_(node)->emptiness        = BODY_IS_NOT_EMPTY;
2508   QUANT_(node)->head_exact       = NULL_NODE;
2509   QUANT_(node)->next_head_exact  = NULL_NODE;
2510   QUANT_(node)->include_referred = 0;
2511   if (by_number != 0)
2512     NODE_STATUS_ADD(node, BY_NUMBER);
2513 
2514   return node;
2515 }
2516 
2517 static Node*
node_new_bag(enum BagType type)2518 node_new_bag(enum BagType type)
2519 {
2520   Node* node = node_new();
2521   CHECK_NULL_RETURN(node);
2522 
2523   NODE_SET_TYPE(node, NODE_BAG);
2524   BAG_(node)->type = type;
2525 
2526   switch (type) {
2527   case BAG_MEMORY:
2528     BAG_(node)->m.regnum       =  0;
2529     BAG_(node)->m.called_addr  = -1;
2530     BAG_(node)->m.entry_count  =  1;
2531     BAG_(node)->m.called_state =  0;
2532     break;
2533 
2534   case BAG_OPTION:
2535     BAG_(node)->o.options =  0;
2536     break;
2537 
2538   case BAG_STOP_BACKTRACK:
2539     break;
2540 
2541   case BAG_IF_ELSE:
2542     BAG_(node)->te.Then = 0;
2543     BAG_(node)->te.Else = 0;
2544     break;
2545   }
2546 
2547   BAG_(node)->opt_count = 0;
2548   return node;
2549 }
2550 
2551 extern Node*
onig_node_new_bag(enum BagType type)2552 onig_node_new_bag(enum BagType type)
2553 {
2554   return node_new_bag(type);
2555 }
2556 
2557 static Node*
node_new_bag_if_else(Node * cond,Node * Then,Node * Else)2558 node_new_bag_if_else(Node* cond, Node* Then, Node* Else)
2559 {
2560   Node* n;
2561   n = node_new_bag(BAG_IF_ELSE);
2562   CHECK_NULL_RETURN(n);
2563 
2564   NODE_BODY(n) = cond;
2565   BAG_(n)->te.Then = Then;
2566   BAG_(n)->te.Else = Else;
2567   return n;
2568 }
2569 
2570 static Node*
node_new_memory(int is_named)2571 node_new_memory(int is_named)
2572 {
2573   Node* node = node_new_bag(BAG_MEMORY);
2574   CHECK_NULL_RETURN(node);
2575   if (is_named != 0)
2576     NODE_STATUS_ADD(node, NAMED_GROUP);
2577 
2578   return node;
2579 }
2580 
2581 static Node*
node_new_option(OnigOptionType option)2582 node_new_option(OnigOptionType option)
2583 {
2584   Node* node = node_new_bag(BAG_OPTION);
2585   CHECK_NULL_RETURN(node);
2586   BAG_(node)->o.options = option;
2587   return node;
2588 }
2589 
2590 static Node*
node_new_group(Node * content)2591 node_new_group(Node* content)
2592 {
2593   Node* node;
2594 
2595   node = node_new();
2596   CHECK_NULL_RETURN(node);
2597   NODE_SET_TYPE(node, NODE_LIST);
2598   NODE_CAR(node) = content;
2599   NODE_CDR(node) = NULL_NODE;
2600 
2601   return node;
2602 }
2603 
2604 static Node*
node_drop_group(Node * group)2605 node_drop_group(Node* group)
2606 {
2607   Node* content;
2608 
2609   content = NODE_CAR(group);
2610   NODE_CAR(group) = NULL_NODE;
2611   onig_node_free(group);
2612   return content;
2613 }
2614 
2615 static int
node_set_fail(Node * node)2616 node_set_fail(Node* node)
2617 {
2618   NODE_SET_TYPE(node, NODE_GIMMICK);
2619   GIMMICK_(node)->type = GIMMICK_FAIL;
2620   return ONIG_NORMAL;
2621 }
2622 
2623 static int
node_new_fail(Node ** node,ScanEnv * env)2624 node_new_fail(Node** node, ScanEnv* env)
2625 {
2626   *node = node_new();
2627   CHECK_NULL_RETURN_MEMERR(*node);
2628 
2629   return node_set_fail(*node);
2630 }
2631 
2632 extern int
onig_node_reset_fail(Node * node)2633 onig_node_reset_fail(Node* node)
2634 {
2635   node_free_body(node);
2636   return node_set_fail(node);
2637 }
2638 
2639 static int
node_new_save_gimmick(Node ** node,enum SaveType save_type,ScanEnv * env)2640 node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env)
2641 {
2642   int id;
2643 
2644   ID_ENTRY(env, id);
2645 
2646   *node = node_new();
2647   CHECK_NULL_RETURN_MEMERR(*node);
2648 
2649   NODE_SET_TYPE(*node, NODE_GIMMICK);
2650   GIMMICK_(*node)->id   = id;
2651   GIMMICK_(*node)->type = GIMMICK_SAVE;
2652   GIMMICK_(*node)->detail_type = (int )save_type;
2653 
2654   return ONIG_NORMAL;
2655 }
2656 
2657 static int
node_new_update_var_gimmick(Node ** node,enum UpdateVarType update_var_type,int id,ScanEnv * env)2658 node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type,
2659                             int id, ScanEnv* env)
2660 {
2661   *node = node_new();
2662   CHECK_NULL_RETURN_MEMERR(*node);
2663 
2664   NODE_SET_TYPE(*node, NODE_GIMMICK);
2665   GIMMICK_(*node)->id   = id;
2666   GIMMICK_(*node)->type = GIMMICK_UPDATE_VAR;
2667   GIMMICK_(*node)->detail_type = (int )update_var_type;
2668 
2669   return ONIG_NORMAL;
2670 }
2671 
2672 static int
node_new_keep(Node ** node,ScanEnv * env)2673 node_new_keep(Node** node, ScanEnv* env)
2674 {
2675   int r;
2676 
2677   r = node_new_save_gimmick(node, SAVE_KEEP, env);
2678   if (r != 0) return r;
2679 
2680   env->keep_num++;
2681   return ONIG_NORMAL;
2682 }
2683 
2684 #ifdef USE_CALLOUT
2685 
2686 extern void
onig_free_reg_callout_list(int n,CalloutListEntry * list)2687 onig_free_reg_callout_list(int n, CalloutListEntry* list)
2688 {
2689   int i;
2690   int j;
2691 
2692   if (IS_NULL(list)) return ;
2693 
2694   for (i = 0; i < n; i++) {
2695     if (list[i].of == ONIG_CALLOUT_OF_NAME) {
2696       for (j = 0; j < list[i].u.arg.passed_num; j++) {
2697         if (list[i].u.arg.types[j] == ONIG_TYPE_STRING) {
2698           if (IS_NOT_NULL(list[i].u.arg.vals[j].s.start))
2699             xfree(list[i].u.arg.vals[j].s.start);
2700         }
2701       }
2702     }
2703     else { /* ONIG_CALLOUT_OF_CONTENTS */
2704       if (IS_NOT_NULL(list[i].u.content.start)) {
2705         xfree((void* )list[i].u.content.start);
2706       }
2707     }
2708   }
2709 
2710   xfree(list);
2711 }
2712 
2713 extern CalloutListEntry*
onig_reg_callout_list_at(regex_t * reg,int num)2714 onig_reg_callout_list_at(regex_t* reg, int num)
2715 {
2716   RegexExt* ext = reg->extp;
2717   CHECK_NULL_RETURN(ext);
2718 
2719   if (num <= 0 || num > ext->callout_num)
2720     return 0;
2721 
2722   num--;
2723   return ext->callout_list + num;
2724 }
2725 
2726 static int
reg_callout_list_entry(ScanEnv * env,int * rnum)2727 reg_callout_list_entry(ScanEnv* env, int* rnum)
2728 {
2729 #define INIT_CALLOUT_LIST_NUM  3
2730 
2731   int num;
2732   CalloutListEntry* list;
2733   CalloutListEntry* e;
2734   RegexExt* ext;
2735 
2736   ext = onig_get_regex_ext(env->reg);
2737   CHECK_NULL_RETURN_MEMERR(ext);
2738 
2739   if (IS_NULL(ext->callout_list)) {
2740     list = (CalloutListEntry* )xmalloc(sizeof(*list) * INIT_CALLOUT_LIST_NUM);
2741     CHECK_NULL_RETURN_MEMERR(list);
2742 
2743     ext->callout_list = list;
2744     ext->callout_list_alloc = INIT_CALLOUT_LIST_NUM;
2745     ext->callout_num = 0;
2746   }
2747 
2748   num = ext->callout_num + 1;
2749   if (num > ext->callout_list_alloc) {
2750     int alloc = ext->callout_list_alloc * 2;
2751     list = (CalloutListEntry* )xrealloc(ext->callout_list,
2752                                         sizeof(CalloutListEntry) * alloc);
2753     CHECK_NULL_RETURN_MEMERR(list);
2754 
2755     ext->callout_list       = list;
2756     ext->callout_list_alloc = alloc;
2757   }
2758 
2759   e = ext->callout_list + (num - 1);
2760 
2761   e->flag             = 0;
2762   e->of               = 0;
2763   e->in               = ONIG_CALLOUT_OF_CONTENTS;
2764   e->type             = 0;
2765   e->tag_start        = 0;
2766   e->tag_end          = 0;
2767   e->start_func       = 0;
2768   e->end_func         = 0;
2769   e->u.arg.num        = 0;
2770   e->u.arg.passed_num = 0;
2771 
2772   ext->callout_num = num;
2773   *rnum = num;
2774   return ONIG_NORMAL;
2775 }
2776 
2777 static int
node_new_callout(Node ** node,OnigCalloutOf callout_of,int num,int id,ScanEnv * env)2778 node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id,
2779                  ScanEnv* env)
2780 {
2781   *node = node_new();
2782   CHECK_NULL_RETURN_MEMERR(*node);
2783 
2784   NODE_SET_TYPE(*node, NODE_GIMMICK);
2785   GIMMICK_(*node)->id          = id;
2786   GIMMICK_(*node)->num         = num;
2787   GIMMICK_(*node)->type        = GIMMICK_CALLOUT;
2788   GIMMICK_(*node)->detail_type = (int )callout_of;
2789 
2790   return ONIG_NORMAL;
2791 }
2792 #endif
2793 
2794 static int
make_text_segment(Node ** node,ScanEnv * env)2795 make_text_segment(Node** node, ScanEnv* env)
2796 {
2797   int r;
2798   int i;
2799   Node* x;
2800   Node* ns[2];
2801 
2802   /* \X == (?>\O(?:\Y\O)*) */
2803 
2804   ns[1] = NULL_NODE;
2805 
2806   r = ONIGERR_MEMORY;
2807   ns[0] = node_new_anchor_with_options(ANCR_NO_TEXT_SEGMENT_BOUNDARY, env->options);
2808   if (IS_NULL(ns[0])) goto err;
2809 
2810   r = node_new_true_anychar(&ns[1]);
2811   if (r != 0) goto err1;
2812 
2813   x = make_list(2, ns);
2814   if (IS_NULL(x)) goto err;
2815   ns[0] = x;
2816   ns[1] = NULL_NODE;
2817 
2818   x = node_new_quantifier(0, INFINITE_REPEAT, TRUE);
2819   if (IS_NULL(x)) goto err;
2820 
2821   NODE_BODY(x) = ns[0];
2822   ns[0] = NULL_NODE;
2823   ns[1] = x;
2824 
2825   r = node_new_true_anychar(&ns[0]);
2826   if (r != 0) goto err1;
2827 
2828   x = make_list(2, ns);
2829   if (IS_NULL(x)) goto err;
2830 
2831   ns[0] = x;
2832   ns[1] = NULL_NODE;
2833 
2834   x = node_new_bag(BAG_STOP_BACKTRACK);
2835   if (IS_NULL(x)) goto err;
2836 
2837   NODE_BODY(x) = ns[0];
2838 
2839   *node = x;
2840   return ONIG_NORMAL;
2841 
2842  err:
2843   r = ONIGERR_MEMORY;
2844  err1:
2845   for (i = 0; i < 2; i++) onig_node_free(ns[i]);
2846   return r;
2847 }
2848 
2849 static int
make_absent_engine(Node ** node,int pre_save_right_id,Node * absent,Node * step_one,int lower,int upper,int possessive,int is_range_cutter,ScanEnv * env)2850 make_absent_engine(Node** node, int pre_save_right_id, Node* absent,
2851                    Node* step_one, int lower, int upper, int possessive,
2852                    int is_range_cutter, ScanEnv* env)
2853 {
2854   int r;
2855   int i;
2856   int id;
2857   Node* x;
2858   Node* ns[4];
2859 
2860   for (i = 0; i < 4; i++) ns[i] = NULL_NODE;
2861 
2862   ns[1] = absent;
2863   ns[3] = step_one; /* for err */
2864   r = node_new_save_gimmick(&ns[0], SAVE_S, env);
2865   if (r != 0) goto err;
2866 
2867   id = GIMMICK_(ns[0])->id;
2868   r = node_new_update_var_gimmick(&ns[2], UPDATE_VAR_RIGHT_RANGE_FROM_S_STACK,
2869                                   id, env);
2870   if (r != 0) goto err;
2871 
2872   if (is_range_cutter != 0)
2873     NODE_STATUS_ADD(ns[2], ABSENT_WITH_SIDE_EFFECTS);
2874 
2875   r = node_new_fail(&ns[3], env);
2876   if (r != 0) goto err;
2877 
2878   x = make_list(4, ns);
2879   if (IS_NULL(x)) goto err0;
2880 
2881   ns[0] = x;
2882   ns[1] = step_one;
2883   ns[2] = ns[3] = NULL_NODE;
2884 
2885   x = make_alt(2, ns);
2886   if (IS_NULL(x)) goto err0;
2887 
2888   ns[0] = x;
2889 
2890   x = node_new_quantifier(lower, upper, FALSE);
2891   if (IS_NULL(x)) goto err0;
2892 
2893   NODE_BODY(x) = ns[0];
2894   ns[0] = x;
2895 
2896   if (possessive != 0) {
2897     x = node_new_bag(BAG_STOP_BACKTRACK);
2898     if (IS_NULL(x)) goto err0;
2899 
2900     NODE_BODY(x) = ns[0];
2901     ns[0] = x;
2902   }
2903 
2904   r = node_new_update_var_gimmick(&ns[1], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2905                                   pre_save_right_id, env);
2906   if (r != 0) goto err;
2907 
2908   r = node_new_fail(&ns[2], env);
2909   if (r != 0) goto err;
2910 
2911   x = make_list(2, ns + 1);
2912   if (IS_NULL(x)) goto err0;
2913 
2914   ns[1] = x; ns[2] = NULL_NODE;
2915 
2916   x = make_alt(2, ns);
2917   if (IS_NULL(x)) goto err0;
2918 
2919   if (is_range_cutter != FALSE)
2920     NODE_STATUS_ADD(x, SUPER);
2921 
2922   *node = x;
2923   return ONIG_NORMAL;
2924 
2925  err0:
2926   r = ONIGERR_MEMORY;
2927  err:
2928   for (i = 0; i < 4; i++) onig_node_free(ns[i]);
2929   return r;
2930 }
2931 
2932 static int
make_absent_tail(Node ** node1,Node ** node2,int pre_save_right_id,ScanEnv * env)2933 make_absent_tail(Node** node1, Node** node2, int pre_save_right_id,
2934                  ScanEnv* env)
2935 {
2936   int r;
2937   int id;
2938   Node* save;
2939   Node* x;
2940   Node* ns[2];
2941 
2942   *node1 = *node2 = NULL_NODE;
2943   save = ns[0] = ns[1] = NULL_NODE;
2944 
2945   r = node_new_save_gimmick(&save, SAVE_RIGHT_RANGE, env);
2946   if (r != 0) goto err;
2947 
2948   id = GIMMICK_(save)->id;
2949   r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2950                                   id, env);
2951   if (r != 0) goto err;
2952 
2953   r = node_new_fail(&ns[1], env);
2954   if (r != 0) goto err;
2955 
2956   x = make_list(2, ns);
2957   if (IS_NULL(x)) goto err0;
2958 
2959   ns[0] = NULL_NODE; ns[1] = x;
2960 
2961   r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2962                                   pre_save_right_id, env);
2963   if (r != 0) goto err;
2964 
2965   x = make_alt(2, ns);
2966   if (IS_NULL(x)) goto err0;
2967 
2968   *node1 = save;
2969   *node2 = x;
2970   return ONIG_NORMAL;
2971 
2972  err0:
2973   r = ONIGERR_MEMORY;
2974  err:
2975   onig_node_free(save);
2976   onig_node_free(ns[0]);
2977   onig_node_free(ns[1]);
2978   return r;
2979 }
2980 
2981 static int
make_range_clear(Node ** node,ScanEnv * env)2982 make_range_clear(Node** node, ScanEnv* env)
2983 {
2984   int r;
2985   int id;
2986   Node* save;
2987   Node* x;
2988   Node* ns[2];
2989 
2990   *node = NULL_NODE;
2991   save = ns[0] = ns[1] = NULL_NODE;
2992 
2993   r = node_new_save_gimmick(&save, SAVE_RIGHT_RANGE, env);
2994   if (r != 0) goto err;
2995 
2996   id = GIMMICK_(save)->id;
2997   r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2998                                   id, env);
2999   if (r != 0) goto err;
3000 
3001   r = node_new_fail(&ns[1], env);
3002   if (r != 0) goto err;
3003 
3004   x = make_list(2, ns);
3005   if (IS_NULL(x)) goto err0;
3006 
3007   ns[0] = NULL_NODE; ns[1] = x;
3008 
3009 #define ID_NOT_USED_DONT_CARE_ME   0
3010 
3011   r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT,
3012                                   ID_NOT_USED_DONT_CARE_ME, env);
3013   if (r != 0) goto err;
3014   NODE_STATUS_ADD(ns[0], ABSENT_WITH_SIDE_EFFECTS);
3015 
3016   x = make_alt(2, ns);
3017   if (IS_NULL(x)) goto err0;
3018 
3019   NODE_STATUS_ADD(x, SUPER);
3020 
3021   ns[0] = save;
3022   ns[1] = x;
3023   save = NULL_NODE;
3024   x = make_list(2, ns);
3025   if (IS_NULL(x)) goto err0;
3026 
3027   *node = x;
3028   return ONIG_NORMAL;
3029 
3030  err0:
3031   r = ONIGERR_MEMORY;
3032  err:
3033   onig_node_free(save);
3034   onig_node_free(ns[0]);
3035   onig_node_free(ns[1]);
3036   return r;
3037 }
3038 
3039 static int
is_simple_one_char_repeat(Node * node,Node ** rquant,Node ** rbody,int * is_possessive,ScanEnv * env)3040 is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody,
3041                           int* is_possessive, ScanEnv* env)
3042 {
3043   Node* quant;
3044   Node* body;
3045 
3046   *rquant = *rbody = 0;
3047   *is_possessive = 0;
3048 
3049   if (NODE_TYPE(node) == NODE_QUANT) {
3050     quant = node;
3051   }
3052   else {
3053     if (NODE_TYPE(node) == NODE_BAG) {
3054       BagNode* en = BAG_(node);
3055       if (en->type == BAG_STOP_BACKTRACK) {
3056         *is_possessive = 1;
3057         quant = NODE_BAG_BODY(en);
3058         if (NODE_TYPE(quant) != NODE_QUANT)
3059           return 0;
3060       }
3061       else
3062         return 0;
3063     }
3064     else
3065       return 0;
3066   }
3067 
3068   if (QUANT_(quant)->greedy == 0)
3069     return 0;
3070 
3071   body = NODE_BODY(quant);
3072   switch (NODE_TYPE(body)) {
3073   case NODE_STRING:
3074     {
3075       int len;
3076       StrNode* sn = STR_(body);
3077       UChar *s = sn->s;
3078 
3079       len = 0;
3080       while (s < sn->end) {
3081         s += enclen(env->enc, s);
3082         len++;
3083       }
3084       if (len != 1)
3085         return 0;
3086     }
3087 
3088   case NODE_CCLASS:
3089     break;
3090 
3091   default:
3092     return 0;
3093     break;
3094   }
3095 
3096   if (node != quant) {
3097     NODE_BODY(node) = 0;
3098     onig_node_free(node);
3099   }
3100   NODE_BODY(quant) = NULL_NODE;
3101   *rquant = quant;
3102   *rbody  = body;
3103   return 1;
3104 }
3105 
3106 static int
make_absent_tree_for_simple_one_char_repeat(Node ** node,Node * absent,Node * quant,Node * body,int possessive,ScanEnv * env)3107 make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* quant,
3108                                             Node* body, int possessive, ScanEnv* env)
3109 {
3110   int r;
3111   int i;
3112   int id1;
3113   int lower, upper;
3114   Node* x;
3115   Node* ns[4];
3116 
3117   *node = NULL_NODE;
3118   r = ONIGERR_MEMORY;
3119   ns[0] = ns[1] = NULL_NODE;
3120   ns[2] = body, ns[3] = absent;
3121 
3122   lower = QUANT_(quant)->lower;
3123   upper = QUANT_(quant)->upper;
3124   onig_node_free(quant);
3125 
3126   r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env);
3127   if (r != 0) goto err;
3128 
3129   id1 = GIMMICK_(ns[0])->id;
3130 
3131   r = make_absent_engine(&ns[1], id1, absent, body, lower, upper, possessive,
3132                          FALSE, env);
3133   if (r != 0) goto err;
3134 
3135   ns[2] = ns[3] = NULL_NODE;
3136 
3137   r = node_new_update_var_gimmick(&ns[2], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
3138                                   id1, env);
3139   if (r != 0) goto err;
3140 
3141   x = make_list(3, ns);
3142   if (IS_NULL(x)) goto err0;
3143 
3144   *node = x;
3145   return ONIG_NORMAL;
3146 
3147  err0:
3148   r = ONIGERR_MEMORY;
3149  err:
3150   for (i = 0; i < 4; i++) onig_node_free(ns[i]);
3151   return r;
3152 }
3153 
3154 static int
make_absent_tree(Node ** node,Node * absent,Node * expr,int is_range_cutter,ScanEnv * env)3155 make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter,
3156                  ScanEnv* env)
3157 {
3158   int r;
3159   int i;
3160   int id1, id2;
3161   int possessive;
3162   Node* x;
3163   Node* ns[7];
3164 
3165   r = ONIGERR_MEMORY;
3166   for (i = 0; i < 7; i++) ns[i] = NULL_NODE;
3167   ns[4] = expr; ns[5] = absent;
3168 
3169   if (is_range_cutter == 0) {
3170     Node* quant;
3171     Node* body;
3172 
3173     if (expr == NULL_NODE) {
3174       /* default expr \O* */
3175       quant = node_new_quantifier(0, INFINITE_REPEAT, FALSE);
3176       if (IS_NULL(quant)) goto err0;
3177 
3178       r = node_new_true_anychar(&body);
3179       if (r != 0) {
3180         onig_node_free(quant);
3181         goto err;
3182       }
3183       possessive = 0;
3184       goto simple;
3185     }
3186     else {
3187       if (is_simple_one_char_repeat(expr, &quant, &body, &possessive, env)) {
3188       simple:
3189         r = make_absent_tree_for_simple_one_char_repeat(node, absent, quant,
3190                                                         body, possessive, env);
3191         if (r != 0) {
3192           ns[4] = NULL_NODE;
3193           onig_node_free(quant);
3194           onig_node_free(body);
3195           goto err;
3196         }
3197 
3198         return ONIG_NORMAL;
3199       }
3200     }
3201   }
3202 
3203   r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env);
3204   if (r != 0) goto err;
3205 
3206   id1 = GIMMICK_(ns[0])->id;
3207 
3208   r = node_new_save_gimmick(&ns[1], SAVE_S, env);
3209   if (r != 0) goto err;
3210 
3211   id2 = GIMMICK_(ns[1])->id;
3212 
3213   r = node_new_true_anychar(&ns[3]);
3214   if (r != 0) goto err;
3215 
3216   possessive = 1;
3217   r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, INFINITE_REPEAT,
3218                          possessive, is_range_cutter, env);
3219   if (r != 0) goto err;
3220 
3221   ns[3] = NULL_NODE;
3222   ns[5] = NULL_NODE;
3223 
3224   r = node_new_update_var_gimmick(&ns[3], UPDATE_VAR_S_FROM_STACK, id2, env);
3225   if (r != 0) goto err;
3226 
3227   if (is_range_cutter != 0) {
3228     x = make_list(4, ns);
3229     if (IS_NULL(x)) goto err0;
3230   }
3231   else {
3232     r = make_absent_tail(&ns[5], &ns[6], id1, env);
3233     if (r != 0) goto err;
3234 
3235     x = make_list(7, ns);
3236     if (IS_NULL(x)) goto err0;
3237   }
3238 
3239   *node = x;
3240   return ONIG_NORMAL;
3241 
3242  err0:
3243   r = ONIGERR_MEMORY;
3244  err:
3245   for (i = 0; i < 7; i++) onig_node_free(ns[i]);
3246   return r;
3247 }
3248 
3249 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)3250 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
3251 {
3252   int addlen = (int )(end - s);
3253 
3254   if (addlen > 0) {
3255     int len  = (int )(STR_(node)->end - STR_(node)->s);
3256 
3257     if (STR_(node)->capacity > 0 || (len + addlen > NODE_STRING_BUF_SIZE - 1)) {
3258       UChar* p;
3259       int capa = len + addlen + NODE_STRING_MARGIN;
3260 
3261       if (capa <= STR_(node)->capacity) {
3262         onig_strcpy(STR_(node)->s + len, s, end);
3263       }
3264       else {
3265         if (STR_(node)->s == STR_(node)->buf)
3266           p = strcat_capa_from_static(STR_(node)->s, STR_(node)->end,
3267                                       s, end, capa);
3268         else
3269           p = strcat_capa(STR_(node)->s, STR_(node)->end, s, end, capa);
3270 
3271         CHECK_NULL_RETURN_MEMERR(p);
3272         STR_(node)->s        = p;
3273         STR_(node)->capacity = capa;
3274       }
3275     }
3276     else {
3277       onig_strcpy(STR_(node)->s + len, s, end);
3278     }
3279     STR_(node)->end = STR_(node)->s + len + addlen;
3280   }
3281 
3282   return 0;
3283 }
3284 
3285 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end,int need_free)3286 onig_node_str_set(Node* node, const UChar* s, const UChar* end, int need_free)
3287 {
3288   onig_node_str_clear(node, need_free);
3289   return onig_node_str_cat(node, s, end);
3290 }
3291 
3292 static int
node_str_cat_char(Node * node,UChar c)3293 node_str_cat_char(Node* node, UChar c)
3294 {
3295   UChar s[1];
3296 
3297   s[0] = c;
3298   return onig_node_str_cat(node, s, s + 1);
3299 }
3300 
3301 extern void
onig_node_str_clear(Node * node,int need_free)3302 onig_node_str_clear(Node* node, int need_free)
3303 {
3304   if (need_free != 0 &&
3305       STR_(node)->capacity != 0 &&
3306       IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != STR_(node)->buf) {
3307     xfree(STR_(node)->s);
3308   }
3309 
3310   STR_(node)->flag     = 0;
3311   STR_(node)->s        = STR_(node)->buf;
3312   STR_(node)->end      = STR_(node)->buf;
3313   STR_(node)->capacity = 0;
3314 }
3315 
3316 static int
node_set_str(Node * node,const UChar * s,const UChar * end)3317 node_set_str(Node* node, const UChar* s, const UChar* end)
3318 {
3319   int r;
3320 
3321   NODE_SET_TYPE(node, NODE_STRING);
3322   STR_(node)->flag     = 0;
3323   STR_(node)->s        = STR_(node)->buf;
3324   STR_(node)->end      = STR_(node)->buf;
3325   STR_(node)->capacity = 0;
3326 
3327   r = onig_node_str_cat(node, s, end);
3328   return r;
3329 }
3330 
3331 static Node*
node_new_str(const UChar * s,const UChar * end)3332 node_new_str(const UChar* s, const UChar* end)
3333 {
3334   int r;
3335   Node* node = node_new();
3336   CHECK_NULL_RETURN(node);
3337 
3338   r = node_set_str(node, s, end);
3339   if (r != 0) {
3340     onig_node_free(node);
3341     return NULL;
3342   }
3343 
3344   return node;
3345 }
3346 
3347 static int
node_reset_str(Node * node,const UChar * s,const UChar * end)3348 node_reset_str(Node* node, const UChar* s, const UChar* end)
3349 {
3350   node_free_body(node);
3351   return node_set_str(node, s, end);
3352 }
3353 
3354 extern int
onig_node_reset_empty(Node * node)3355 onig_node_reset_empty(Node* node)
3356 {
3357   return node_reset_str(node, NULL, NULL);
3358 }
3359 
3360 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)3361 onig_node_new_str(const UChar* s, const UChar* end)
3362 {
3363   return node_new_str(s, end);
3364 }
3365 
3366 static Node*
node_new_str_with_options(const UChar * s,const UChar * end,OnigOptionType options)3367 node_new_str_with_options(const UChar* s, const UChar* end,
3368                           OnigOptionType options)
3369 {
3370   Node* node;
3371   node = node_new_str(s, end);
3372 
3373   if (OPTON_IGNORECASE(options))
3374     NODE_STATUS_ADD(node, IGNORECASE);
3375 
3376   return node;
3377 }
3378 
3379 static Node*
node_new_str_crude(UChar * s,UChar * end,OnigOptionType options)3380 node_new_str_crude(UChar* s, UChar* end, OnigOptionType options)
3381 {
3382   Node* node = node_new_str_with_options(s, end, options);
3383   CHECK_NULL_RETURN(node);
3384   NODE_STRING_SET_CRUDE(node);
3385   return node;
3386 }
3387 
3388 static Node*
node_new_empty(void)3389 node_new_empty(void)
3390 {
3391   return node_new_str(NULL, NULL);
3392 }
3393 
3394 static Node*
node_new_str_crude_char(UChar c,OnigOptionType options)3395 node_new_str_crude_char(UChar c, OnigOptionType options)
3396 {
3397   int i;
3398   UChar p[1];
3399   Node* node;
3400 
3401   p[0] = c;
3402   node = node_new_str_crude(p, p + 1, options);
3403 
3404   /* clear buf tail */
3405   for (i = 1; i < NODE_STRING_BUF_SIZE; i++)
3406     STR_(node)->buf[i] = '\0';
3407 
3408   return node;
3409 }
3410 
3411 static Node*
str_node_split_last_char(Node * node,OnigEncoding enc)3412 str_node_split_last_char(Node* node, OnigEncoding enc)
3413 {
3414   const UChar *p;
3415   Node* rn;
3416   StrNode* sn;
3417 
3418   sn = STR_(node);
3419   rn = NULL_NODE;
3420   if (sn->end > sn->s) {
3421     p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
3422     if (p && p > sn->s) { /* can be split. */
3423       rn = node_new_str(p, sn->end);
3424       CHECK_NULL_RETURN(rn);
3425 
3426       sn->end = (UChar* )p;
3427       STR_(rn)->flag = sn->flag;
3428       NODE_STATUS(rn) = NODE_STATUS(node);
3429     }
3430   }
3431 
3432   return rn;
3433 }
3434 
3435 static int
str_node_can_be_split(Node * node,OnigEncoding enc)3436 str_node_can_be_split(Node* node, OnigEncoding enc)
3437 {
3438   StrNode* sn = STR_(node);
3439   if (sn->end > sn->s) {
3440     return ((enclen(enc, sn->s) < sn->end - sn->s)  ?  1 : 0);
3441   }
3442   return 0;
3443 }
3444 
3445 static int
scan_number(UChar ** src,const UChar * end,OnigEncoding enc)3446 scan_number(UChar** src, const UChar* end, OnigEncoding enc)
3447 {
3448   int num, val;
3449   OnigCodePoint c;
3450   UChar* p = *src;
3451   PFETCH_READY;
3452 
3453   num = 0;
3454   while (! PEND) {
3455     PFETCH(c);
3456     if (IS_CODE_DIGIT_ASCII(enc, c)) {
3457       val = (int )DIGITVAL(c);
3458       if ((ONIG_INT_MAX - val) / 10 < num)
3459         return -1;  /* overflow */
3460 
3461       num = num * 10 + val;
3462     }
3463     else {
3464       PUNFETCH;
3465       break;
3466     }
3467   }
3468   *src = p;
3469   return num;
3470 }
3471 
3472 static int
scan_hexadecimal_number(UChar ** src,UChar * end,int minlen,int maxlen,OnigEncoding enc,OnigCodePoint * rcode)3473 scan_hexadecimal_number(UChar** src, UChar* end, int minlen, int maxlen,
3474                         OnigEncoding enc, OnigCodePoint* rcode)
3475 {
3476   OnigCodePoint code;
3477   OnigCodePoint c;
3478   unsigned int val;
3479   int n;
3480   UChar* p = *src;
3481   PFETCH_READY;
3482 
3483   code = 0;
3484   n = 0;
3485   while (! PEND && n < maxlen) {
3486     PFETCH(c);
3487     if (IS_CODE_XDIGIT_ASCII(enc, c)) {
3488       n++;
3489       val = (unsigned int )XDIGITVAL(enc, c);
3490       if ((UINT_MAX - val) / 16UL < code)
3491         return ONIGERR_TOO_BIG_NUMBER; /* overflow */
3492 
3493       code = (code << 4) + val;
3494     }
3495     else {
3496       PUNFETCH;
3497       break;
3498     }
3499   }
3500 
3501   if (n < minlen)
3502     return ONIGERR_INVALID_CODE_POINT_VALUE;
3503 
3504   *rcode = code;
3505   *src = p;
3506   return ONIG_NORMAL;
3507 }
3508 
3509 static int
scan_octal_number(UChar ** src,UChar * end,int minlen,int maxlen,OnigEncoding enc,OnigCodePoint * rcode)3510 scan_octal_number(UChar** src, UChar* end, int minlen, int maxlen,
3511                   OnigEncoding enc, OnigCodePoint* rcode)
3512 {
3513   OnigCodePoint code;
3514   OnigCodePoint c;
3515   unsigned int val;
3516   int n;
3517   UChar* p = *src;
3518   PFETCH_READY;
3519 
3520   code = 0;
3521   n = 0;
3522   while (! PEND && n < maxlen) {
3523     PFETCH(c);
3524     if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8') {
3525       n++;
3526       val = (unsigned int )ODIGITVAL(c);
3527       if ((UINT_MAX - val) / 8UL < code)
3528         return ONIGERR_TOO_BIG_NUMBER; /* overflow */
3529 
3530       code = (code << 3) + val;
3531     }
3532     else {
3533       PUNFETCH;
3534       break;
3535     }
3536   }
3537 
3538   if (n < minlen)
3539     return ONIGERR_INVALID_CODE_POINT_VALUE;
3540 
3541   *rcode = code;
3542   *src = p;
3543   return ONIG_NORMAL;
3544 }
3545 
3546 
/* Write SIZE_CODE_POINT bytes taken from `code` at byte offset `pos` of
   `bbuf`.  `code` must be an lvalue because its address is taken. */
#define BB_WRITE_CODE_POINT(bbuf,pos,code) \
    BB_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
3549 
3550 /* data format:
3551      [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
3552      (all data size is OnigCodePoint)
3553  */
3554 static int
new_code_range(BBuf ** pbuf)3555 new_code_range(BBuf** pbuf)
3556 {
3557 #define INIT_MULTI_BYTE_RANGE_SIZE  (SIZE_CODE_POINT * 5)
3558   int r;
3559   OnigCodePoint n;
3560   BBuf* bbuf;
3561 
3562   bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
3563   CHECK_NULL_RETURN_MEMERR(bbuf);
3564   r = BB_INIT(bbuf, INIT_MULTI_BYTE_RANGE_SIZE);
3565   if (r != 0) {
3566     xfree(bbuf);
3567     *pbuf = 0;
3568     return r;
3569   }
3570 
3571   n = 0;
3572   BB_WRITE_CODE_POINT(bbuf, 0, n);
3573   return 0;
3574 }
3575 
/*
 * Insert the code point range [from, to] into the sorted range list held
 * in *pbuf, merging it with every range it overlaps or directly adjoins.
 *
 * - from/to are swapped first when given in the wrong order.
 * - *pbuf is created with new_code_range() when it is NULL.
 *
 * Returns 0 on success, ONIGERR_TOO_MANY_MULTI_BYTE_RANGES when the list
 * would exceed ONIG_MAX_MULTI_BYTE_RANGES_NUM, or the error of
 * new_code_range().  NOTE(review): the BB_* macros presumably return an
 * error from this function on allocation failure -- confirm in their
 * definitions.
 */
static int
add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
{
  int r, inc_n, pos;
  int low, high, bound, x;
  OnigCodePoint n, *data;
  BBuf* bbuf;

  if (from > to) {
    n = from; from = to; to = n;
  }

  if (IS_NULL(*pbuf)) {
    r = new_code_range(pbuf);
    if (r != 0) return r;
    bbuf = *pbuf;
    n = 0;
  }
  else {
    bbuf = *pbuf;
    GET_CODE_POINT(n, bbuf->p);
  }
  /* skip the leading count; data[2*i] / data[2*i+1] are from / to of range i */
  data = (OnigCodePoint* )(bbuf->p);
  data++;

  /* low: index of the first range whose upper bound is >= from */
  for (low = 0, bound = n; low < bound; ) {
    x = (low + bound) >> 1;
    if (from > data[x*2 + 1])
      low = x + 1;
    else
      bound = x;
  }

  /* high: index of the first range whose lower bound is > to + 1.
     When `to` is the maximum code point every remaining range is
     covered, and the search is skipped so that to + 1 cannot wrap. */
  high = (to == ~((OnigCodePoint )0)) ? n : low;
  for (bound = n; high < bound; ) {
    x = (high + bound) >> 1;
    if (to + 1 >= data[x*2])
      high = x + 1;
    else
      bound = x;
  }

  /* ranges low .. high-1 are replaced by one range, so the count changes
     by inc_n: 1 = plain insert, 0 = replace one, < 0 = several merged */
  inc_n = low + 1 - high;
  if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
    return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;

  if (inc_n != 1) {
    /* widen the new range so it covers everything it absorbs */
    if (from > data[low*2])
      from = data[low*2];
    if (to < data[(high - 1)*2 + 1])
      to = data[(high - 1)*2 + 1];
  }

  if (inc_n != 0 && (OnigCodePoint )high < n) {
    /* move the ranges that follow the merged span to their new place */
    int from_pos = SIZE_CODE_POINT * (1 + high * 2);
    int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);
    int size = (n - high) * 2 * SIZE_CODE_POINT;

    if (inc_n > 0) {
      BB_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
    }
    else {
      BB_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
    }
  }

  /* store the (possibly widened) range at slot `low`, then the new count */
  pos = SIZE_CODE_POINT * (1 + low * 2);
  BB_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
  BB_WRITE_CODE_POINT(bbuf, pos, from);
  BB_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
  n += inc_n;
  BB_WRITE_CODE_POINT(bbuf, 0, n);

  return 0;
}
3651 
3652 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)3653 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
3654 {
3655   if (from > to) {
3656     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
3657       return 0;
3658     else
3659       return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
3660   }
3661 
3662   return add_code_range_to_buf(pbuf, from, to);
3663 }
3664 
3665 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)3666 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
3667 {
3668   int r, i, n;
3669   OnigCodePoint pre, from, *data, to = 0;
3670 
3671   *pbuf = (BBuf* )NULL;
3672   if (IS_NULL(bbuf)) {
3673   set_all:
3674     return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
3675   }
3676 
3677   data = (OnigCodePoint* )(bbuf->p);
3678   GET_CODE_POINT(n, data);
3679   data++;
3680   if (n <= 0) goto set_all;
3681 
3682   r = 0;
3683   pre = MBCODE_START_POS(enc);
3684   for (i = 0; i < n; i++) {
3685     from = data[i*2];
3686     to   = data[i*2+1];
3687     if (pre <= from - 1) {
3688       r = add_code_range_to_buf(pbuf, pre, from - 1);
3689       if (r != 0) return r;
3690     }
3691     if (to == ~((OnigCodePoint )0)) break;
3692     pre = to + 1;
3693   }
3694   if (to < ~((OnigCodePoint )0)) {
3695     r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
3696   }
3697   return r;
3698 }
3699 
/* Exchange two (range buffer, negation flag) operand pairs in place.
   All four arguments must be assignable lvalues and are evaluated more
   than once. */
#define SWAP_BB_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *tbuf; \
  int  tnot; \
  tnot = not1;  not1  = not2;  not2  = tnot; \
  tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
} while (0)
3706 
3707 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)3708 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
3709                   BBuf* bbuf2, int not2, BBuf** pbuf)
3710 {
3711   int r;
3712   OnigCodePoint i, n1, *data1;
3713   OnigCodePoint from, to;
3714 
3715   *pbuf = (BBuf* )NULL;
3716   if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
3717     if (not1 != 0 || not2 != 0)
3718       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
3719     return 0;
3720   }
3721 
3722   r = 0;
3723   if (IS_NULL(bbuf2))
3724     SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);
3725 
3726   if (IS_NULL(bbuf1)) {
3727     if (not1 != 0) {
3728       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
3729     }
3730     else {
3731       if (not2 == 0) {
3732         return bbuf_clone(pbuf, bbuf2);
3733       }
3734       else {
3735         return not_code_range_buf(enc, bbuf2, pbuf);
3736       }
3737     }
3738   }
3739 
3740   if (not1 != 0)
3741     SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);
3742 
3743   data1 = (OnigCodePoint* )(bbuf1->p);
3744   GET_CODE_POINT(n1, data1);
3745   data1++;
3746 
3747   if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
3748     r = bbuf_clone(pbuf, bbuf2);
3749   }
3750   else if (not1 == 0) { /* 1 OR (not 2) */
3751     r = not_code_range_buf(enc, bbuf2, pbuf);
3752   }
3753   if (r != 0) return r;
3754 
3755   for (i = 0; i < n1; i++) {
3756     from = data1[i*2];
3757     to   = data1[i*2+1];
3758     r = add_code_range_to_buf(pbuf, from, to);
3759     if (r != 0) return r;
3760   }
3761   return 0;
3762 }
3763 
3764 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)3765 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
3766                 OnigCodePoint* data, int n)
3767 {
3768   int i, r;
3769   OnigCodePoint from2, to2;
3770 
3771   for (i = 0; i < n; i++) {
3772     from2 = data[i*2];
3773     to2   = data[i*2+1];
3774     if (from2 < from1) {
3775       if (to2 < from1) continue;
3776       else {
3777         from1 = to2 + 1;
3778       }
3779     }
3780     else if (from2 <= to1) {
3781       if (to2 < to1) {
3782         if (from1 <= from2 - 1) {
3783           r = add_code_range_to_buf(pbuf, from1, from2-1);
3784           if (r != 0) return r;
3785         }
3786         from1 = to2 + 1;
3787       }
3788       else {
3789         to1 = from2 - 1;
3790       }
3791     }
3792     else {
3793       from1 = from2;
3794     }
3795     if (from1 > to1) break;
3796   }
3797   if (from1 <= to1) {
3798     r = add_code_range_to_buf(pbuf, from1, to1);
3799     if (r != 0) return r;
3800   }
3801   return 0;
3802 }
3803 
3804 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)3805 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
3806 {
3807   int r;
3808   OnigCodePoint i, j, n1, n2, *data1, *data2;
3809   OnigCodePoint from, to, from1, to1, from2, to2;
3810 
3811   *pbuf = (BBuf* )NULL;
3812   if (IS_NULL(bbuf1)) {
3813     if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
3814       return bbuf_clone(pbuf, bbuf2);
3815     return 0;
3816   }
3817   else if (IS_NULL(bbuf2)) {
3818     if (not2 != 0)
3819       return bbuf_clone(pbuf, bbuf1);
3820     return 0;
3821   }
3822 
3823   if (not1 != 0)
3824     SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);
3825 
3826   data1 = (OnigCodePoint* )(bbuf1->p);
3827   data2 = (OnigCodePoint* )(bbuf2->p);
3828   GET_CODE_POINT(n1, data1);
3829   GET_CODE_POINT(n2, data2);
3830   data1++;
3831   data2++;
3832 
3833   if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
3834     for (i = 0; i < n1; i++) {
3835       from1 = data1[i*2];
3836       to1   = data1[i*2+1];
3837       for (j = 0; j < n2; j++) {
3838         from2 = data2[j*2];
3839         to2   = data2[j*2+1];
3840         if (from2 > to1) break;
3841         if (to2 < from1) continue;
3842         from = MAX(from1, from2);
3843         to   = MIN(to1, to2);
3844         r = add_code_range_to_buf(pbuf, from, to);
3845         if (r != 0) return r;
3846       }
3847     }
3848   }
3849   else if (not1 == 0) { /* 1 AND (not 2) */
3850     for (i = 0; i < n1; i++) {
3851       from1 = data1[i*2];
3852       to1   = data1[i*2+1];
3853       r = and_code_range1(pbuf, from1, to1, data2, n2);
3854       if (r != 0) return r;
3855     }
3856   }
3857 
3858   return 0;
3859 }
3860 
3861 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)3862 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
3863 {
3864   int r, not1, not2;
3865   BBuf *buf1, *buf2, *pbuf;
3866   BitSetRef bsr1, bsr2;
3867   BitSet bs1, bs2;
3868 
3869   not1 = IS_NCCLASS_NOT(dest);
3870   bsr1 = dest->bs;
3871   buf1 = dest->mbuf;
3872   not2 = IS_NCCLASS_NOT(cc);
3873   bsr2 = cc->bs;
3874   buf2 = cc->mbuf;
3875 
3876   if (not1 != 0) {
3877     bitset_invert_to(bsr1, bs1);
3878     bsr1 = bs1;
3879   }
3880   if (not2 != 0) {
3881     bitset_invert_to(bsr2, bs2);
3882     bsr2 = bs2;
3883   }
3884   bitset_and(bsr1, bsr2);
3885   if (bsr1 != dest->bs) {
3886     bitset_copy(dest->bs, bsr1);
3887   }
3888   if (not1 != 0) {
3889     bitset_invert(dest->bs);
3890   }
3891 
3892   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
3893     if (not1 != 0 && not2 != 0) {
3894       r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
3895     }
3896     else {
3897       r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
3898       if (r == 0 && not1 != 0) {
3899         BBuf *tbuf;
3900         r = not_code_range_buf(enc, pbuf, &tbuf);
3901         if (r != 0) {
3902           bbuf_free(pbuf);
3903           return r;
3904         }
3905         bbuf_free(pbuf);
3906         pbuf = tbuf;
3907       }
3908     }
3909     if (r != 0) return r;
3910 
3911     dest->mbuf = pbuf;
3912     bbuf_free(buf1);
3913     return r;
3914   }
3915   return 0;
3916 }
3917 
3918 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)3919 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
3920 {
3921   int r, not1, not2;
3922   BBuf *buf1, *buf2, *pbuf;
3923   BitSetRef bsr1, bsr2;
3924   BitSet bs1, bs2;
3925 
3926   not1 = IS_NCCLASS_NOT(dest);
3927   bsr1 = dest->bs;
3928   buf1 = dest->mbuf;
3929   not2 = IS_NCCLASS_NOT(cc);
3930   bsr2 = cc->bs;
3931   buf2 = cc->mbuf;
3932 
3933   if (not1 != 0) {
3934     bitset_invert_to(bsr1, bs1);
3935     bsr1 = bs1;
3936   }
3937   if (not2 != 0) {
3938     bitset_invert_to(bsr2, bs2);
3939     bsr2 = bs2;
3940   }
3941   bitset_or(bsr1, bsr2);
3942   if (bsr1 != dest->bs) {
3943     bitset_copy(dest->bs, bsr1);
3944   }
3945   if (not1 != 0) {
3946     bitset_invert(dest->bs);
3947   }
3948 
3949   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
3950     if (not1 != 0 && not2 != 0) {
3951       r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
3952     }
3953     else {
3954       r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
3955       if (r == 0 && not1 != 0) {
3956         BBuf *tbuf;
3957         r = not_code_range_buf(enc, pbuf, &tbuf);
3958         if (r != 0) {
3959           bbuf_free(pbuf);
3960           return r;
3961         }
3962         bbuf_free(pbuf);
3963         pbuf = tbuf;
3964       }
3965     }
3966     if (r != 0) return r;
3967 
3968     dest->mbuf = pbuf;
3969     bbuf_free(buf1);
3970     return r;
3971   }
3972   else
3973     return 0;
3974 }
3975 
3976 static OnigCodePoint
conv_backslash_value(OnigCodePoint c,ScanEnv * env)3977 conv_backslash_value(OnigCodePoint c, ScanEnv* env)
3978 {
3979   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
3980     switch (c) {
3981     case 'n': return '\n';
3982     case 't': return '\t';
3983     case 'r': return '\r';
3984     case 'f': return '\f';
3985     case 'a': return '\007';
3986     case 'b': return '\010';
3987     case 'e': return '\033';
3988     case 'v':
3989       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
3990         return '\v';
3991       break;
3992 
3993     default:
3994       break;
3995     }
3996   }
3997   return c;
3998 }
3999 
4000 static int
is_invalid_quantifier_target(Node * node)4001 is_invalid_quantifier_target(Node* node)
4002 {
4003   switch (NODE_TYPE(node)) {
4004   case NODE_ANCHOR:
4005   case NODE_GIMMICK:
4006     return 1;
4007     break;
4008 
4009   case NODE_BAG:
4010     /* allow enclosed elements */
4011     /* return is_invalid_quantifier_target(NODE_BODY(node)); */
4012     break;
4013 
4014   case NODE_LIST:
4015     do {
4016       if (! is_invalid_quantifier_target(NODE_CAR(node))) return 0;
4017     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4018     return 0;
4019     break;
4020 
4021   case NODE_ALT:
4022     do {
4023       if (is_invalid_quantifier_target(NODE_CAR(node))) return 1;
4024     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4025     break;
4026 
4027   default:
4028     break;
4029   }
4030   return 0;
4031 }
4032 
4033 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
4034 static int
quantifier_type_num(QuantNode * q)4035 quantifier_type_num(QuantNode* q)
4036 {
4037   if (q->greedy) {
4038     if (q->lower == 0) {
4039       if (q->upper == 1) return 0;
4040       else if (IS_INFINITE_REPEAT(q->upper)) return 1;
4041     }
4042     else if (q->lower == 1) {
4043       if (IS_INFINITE_REPEAT(q->upper)) return 2;
4044     }
4045   }
4046   else {
4047     if (q->lower == 0) {
4048       if (q->upper == 1) return 3;
4049       else if (IS_INFINITE_REPEAT(q->upper)) return 4;
4050     }
4051     else if (q->lower == 1) {
4052       if (IS_INFINITE_REPEAT(q->upper)) return 5;
4053     }
4054   }
4055   return -1;
4056 }
4057 
4058 
/* Actions onig_reduce_nested_quantifier() can take for a quantifier that
   is applied directly to another quantifier. */
enum ReduceType {
  RQ_ASIS = 0, /* as is */
  RQ_DEL  = 1, /* delete parent */
  RQ_A,        /* to '*'    */
  RQ_AQ,       /* to '*?'   */
  RQ_QQ,       /* to '??'   */
  RQ_P_QQ,     /* to '+)??' */
  RQ_PQ_Q      /* to '+?)?' */
};
4068 
/* Reduction chosen for a pair of nested basic quantifiers.  Both indices
   are quantifier_type_num() values; the table is read as
   ReduceTypeTable[child][parent], so each row below is labelled with the
   inner (child) quantifier. */
static enum ReduceType ReduceTypeTable[6][6] = {
  {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
  {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
  {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
  {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
};
4077 
/*
 * Simplify a quantifier node whose body is itself a quantifier node.
 *
 * pnode is the outer (parent) quantifier; its body is read as the inner
 * (child) quantifier without a type check, so the caller must guarantee
 * that.  Depending on the two quantifier kinds the child is removed, the
 * parent is rewritten, both are rewritten, or nothing changes.
 *
 * Returns 0, or ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE when two exact
 * repeat counts cannot be multiplied.
 */
extern int
onig_reduce_nested_quantifier(Node* pnode)
{
  int pnum, cnum;
  QuantNode *p, *c;
  Node* cnode;

  cnode = NODE_BODY(pnode);

  p = QUANT_(pnode);
  c = QUANT_(cnode);
  pnum = quantifier_type_num(p);
  cnum = quantifier_type_num(c);
  if (pnum < 0 || cnum < 0) {
    /* not both basic forms: only two exact counts can be merged,
       into a single exact repeat with the product of the counts */
    if (p->lower == p->upper && c->lower == c->upper) {
      int n = onig_positive_int_multiply(p->lower, c->lower);
      if (n < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

      p->lower = p->upper = n;
      NODE_BODY(pnode) = NODE_BODY(cnode);
      goto remove_cnode;
    }

    return 0;
  }

  switch(ReduceTypeTable[cnum][pnum]) {
  case RQ_DEL:
    /* the parent becomes a copy of the child (including the child's
       body pointer); the child shell is then discarded */
    *pnode = *cnode;
    goto remove_cnode;
    break;
  case RQ_A:
    NODE_BODY(pnode) = NODE_BODY(cnode);
    p->lower  = 0;  p->upper = INFINITE_REPEAT;  p->greedy = 1;
    goto remove_cnode;
    break;
  case RQ_AQ:
    NODE_BODY(pnode) = NODE_BODY(cnode);
    p->lower  = 0;  p->upper = INFINITE_REPEAT;  p->greedy = 0;
    goto remove_cnode;
    break;
  case RQ_QQ:
    NODE_BODY(pnode) = NODE_BODY(cnode);
    p->lower  = 0;  p->upper = 1;  p->greedy = 0;
    goto remove_cnode;
    break;
  case RQ_P_QQ:
    /* both nodes are kept: child becomes '+', parent becomes '??' */
    p->lower  = 0;  p->upper = 1;  p->greedy = 0;
    c->lower  = 1;  c->upper = INFINITE_REPEAT;  c->greedy = 1;
    break;
  case RQ_PQ_Q:
    /* both nodes are kept: child becomes '+?', parent becomes '?' */
    p->lower  = 0;  p->upper = 1;  p->greedy = 1;
    c->lower  = 1;  c->upper = INFINITE_REPEAT;  c->greedy = 0;
    break;
  case RQ_ASIS:
    break;
  }

  return 0;

 remove_cnode:
  /* the child's body now belongs to the parent: detach it so that
     freeing the child does not free it too */
  NODE_BODY(cnode) = NULL_NODE;
  onig_node_free(cnode);
  return 0;
}
4143 
/*
 * Build the node tree for the general newline escape (TK_GENERAL_NEWLINE,
 * i.e. \R) and store it in *node.
 *
 * The tree is node_new_bag_if_else(crnl, NULL_NODE, ncc), where
 *   crnl is a crude string holding 0x0d followed by NEWLINE_CODE, and
 *   ncc  is a character class covering NEWLINE_CODE..0x0d, plus 0x85 and
 *        0x2028..0x2029 for Unicode encodings.
 *
 * Returns 0 on success, a negative value from ONIGENC_CODE_TO_MBC, or
 * ONIGERR_MEMORY.  NOTE(review): every failure after crnl is created is
 * reported as ONIGERR_MEMORY, even when add_code_range() returned a
 * different error code.
 */
static int
node_new_general_newline(Node** node, ScanEnv* env)
{
  int r;
  int dlen, alen;
  UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
  Node* crnl;
  Node* ncc;
  Node* x;
  CClassNode* cc;

  /* encode 0x0d and NEWLINE_CODE back to back into buf */
  dlen = ONIGENC_CODE_TO_MBC(env->enc, 0x0d, buf);
  if (dlen < 0) return dlen;
  alen = ONIGENC_CODE_TO_MBC(env->enc, NEWLINE_CODE, buf + dlen);
  if (alen < 0) return alen;

  crnl = node_new_str_crude(buf, buf + dlen + alen, ONIG_OPTION_NONE);
  CHECK_NULL_RETURN_MEMERR(crnl);

  ncc = node_new_cclass();
  if (IS_NULL(ncc)) goto err2;

  cc = CCLASS_(ncc);
  if (dlen == 1) {
    /* 0x0d is encoded in one byte: use the class bit set */
    bitset_set_range(cc->bs, NEWLINE_CODE, 0x0d);
  }
  else {
    r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, 0x0d);
    if (r != 0) {
      /* shared cleanup, also entered by goto from below */
    err1:
      onig_node_free(ncc);
    err2:
      onig_node_free(crnl);
      return ONIGERR_MEMORY;
    }
  }

  if (ONIGENC_IS_UNICODE_ENCODING(env->enc)) {
    r = add_code_range(&(cc->mbuf), env, 0x85, 0x85);
    if (r != 0) goto err1;
    r = add_code_range(&(cc->mbuf), env, 0x2028, 0x2029);
    if (r != 0) goto err1;
  }

  x = node_new_bag_if_else(crnl, NULL_NODE, ncc);
  if (IS_NULL(x)) goto err1;

  *node = x;
  return 0;
}
4194 
/* Kinds of token stored in PToken.type.  The values after the "in cc"
   marker are grouped separately for use inside a character class. */
enum TokenSyms {
  TK_EOT      = 0,   /* end of token */
  TK_CRUDE_BYTE = 1,
  TK_CHAR,
  TK_STRING,
  TK_CODE_POINT,
  TK_ANYCHAR,
  TK_CHAR_TYPE,
  TK_BACKREF,
  TK_CALL,
  TK_ANCHOR,
  TK_REPEAT,
  TK_INTERVAL,         /* set by fetch_interval() */
  TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_OPEN_CC,
  TK_QUOTE_OPEN,
  TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */
  TK_KEEP,             /* \K */
  TK_GENERAL_NEWLINE,  /* \R */
  TK_NO_NEWLINE,       /* \N */
  TK_TRUE_ANYCHAR,     /* \O */
  TK_TEXT_SEGMENT,     /* \X */

  /* in cc */
  TK_CC_CLOSE,
  TK_CC_RANGE,
  TK_CC_POSIX_BRACKET_OPEN,
  TK_CC_AND,           /* && */
  TK_CC_OPEN_CC        /* [ */
};
4228 
/* One token of a pattern.  `type` selects the token kind; the token's
   payload lives in the union `u`.  NOTE(review): which union member is
   valid for which token kind is decided by the tokenizer, which is not
   shown here -- check fetch_token() before relying on a mapping. */
typedef struct {
  enum TokenSyms type;
  int escaped;
  int base;   /* is number: 8, 16 (used in [....]) */
  UChar* backp;
  union {
    UChar* s;
    UChar byte;
    OnigCodePoint code;
    int   anchor;
    int   subtype;
    /* repeat bounds; filled in by fetch_interval() for TK_INTERVAL
       (lower, upper and possessive) */
    struct {
      int lower;
      int upper;
      int greedy;
      int possessive;
    } repeat;
    struct {
      int  num;
      int  ref1;
      int* refs;
      int  by_name;
#ifdef USE_BACKREF_WITH_LEVEL
      int  exist_level;
      int  level;   /* \k<name+n> */
#endif
    } backref;
    struct {
      UChar* name;
      UChar* name_end;
      int    gnum;
      int    by_number;
    } call;
    struct {
      int ctype;
      int not;
    } prop;
  } u;
} PToken;
4268 
4269 
/*
 * Parse an interval quantifier body: "n}", "n,}", "n,m}" or -- when
 * ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV is set -- ",m}".  *src is expected
 * to point just after the opening brace (presumably consumed by the
 * caller).
 *
 * On success tok->type is set to TK_INTERVAL, tok->u.repeat.lower / upper
 * / possessive are filled in and *src is advanced past the closing '}'.
 *
 * Returns
 *   0  normal interval {n,m};
 *   2  fixed interval {n};
 *   1  the text is not a valid interval but the syntax allows that
 *      (ONIG_SYN_ALLOW_INVALID_INTERVAL); *src and tok are NOT modified;
 *   an ONIGERR_* code otherwise.
 */
static int
fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env)
{
  int low, up, syn_allow, non_low = 0;
  int r = 0;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;
  PFETCH_READY;

  syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);

  if (PEND) {
    if (syn_allow)
      return 1;  /* "....{" : OK! */
    else
      return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
  }

  if (! syn_allow) {
    /* a brace directly followed by ')', '(' or '|' is rejected up front */
    c = PPEEK;
    if (c == ')' || c == '(' || c == '|') {
      return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
    }
  }

  low = scan_number(&p, end, env->enc);
  if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
  if (low > ONIG_MAX_REPEAT_NUM)
    return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

  if (p == *src) { /* can't read low */
    if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
      /* allow {,n} as {0,n} */
      low = 0;
      non_low = 1;
    }
    else
      goto invalid;
  }

  if (PEND) goto invalid;
  PFETCH(c);
  if (c == ',') {
    UChar* prev = p;
    up = scan_number(&p, end, env->enc);
    if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
    if (up > ONIG_MAX_REPEAT_NUM)
      return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

    if (p == prev) {
      /* no upper bound given; "{,}" with neither bound is invalid */
      if (non_low != 0)
        goto invalid;
      up = INFINITE_REPEAT;  /* {n,} : {n,infinite} */
    }
  }
  else {
    /* no comma: a lower bound is mandatory */
    if (non_low != 0)
      goto invalid;

    PUNFETCH;
    up = low;  /* {n} : exact n times */
    r = 2;     /* fixed */
  }

  if (PEND) goto invalid;
  PFETCH(c);
  if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
    /* in this syntax the closing brace must be escaped */
    if (c != MC_ESC(env->syntax) || PEND) goto invalid;
    PFETCH(c);
  }
  if (c != '}') goto invalid;

  if (!IS_INFINITE_REPEAT(up) && low > up) {
    /* {n,m}+ supported case */
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL))
      return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;

    /* otherwise a reversed interval is taken as a possessive one with
       the bounds put back in order */
    tok->u.repeat.possessive = 1;
    {
      int tmp;
      tmp = low; low = up; up = tmp;
    }
  }
  else
    tok->u.repeat.possessive = 0;

  tok->type = TK_INTERVAL;
  tok->u.repeat.lower = low;
  tok->u.repeat.upper = up;
  *src = p;
  return r; /* 0: normal {n,m}, 2: fixed {n} */

 invalid:
  if (syn_allow) {
    /* *src = p; */ /* !!! Don't do this line !!! */
    return 1;  /* OK */
  }
  else
    return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
}
4371 
/*
 * Decode the value of an escape sequence: \M-x, \C-x, \cx, or \...
 * *src is expected to point at the character after the escape character.
 *
 *   M-x   (needs ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)
 *         -> (x & 0xff) | 0x80
 *   C-x   (needs ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL), and
 *   cx    (needs ONIG_SYN_OP_ESC_C_CONTROL)
 *         -> 0177 when x is '?', otherwise x & 0x9f
 *   anything else, or one of the above with its syntax flag off
 *         -> conv_backslash_value() of the character
 *
 * In the meta/control forms x may itself be an escape sequence, which is
 * decoded by a recursive call.
 *
 * On success stores the value in *val, advances *src and returns 0.
 * On failure returns an ONIGERR_* code and leaves *src and *val alone.
 */
static int
fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
{
  int v;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;

  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

  PFETCH_S(c);
  switch (c) {
  case 'M':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_META_CODE_SYNTAX;
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c == MC_ESC(env->syntax)) {
        /* nested escape: decode it first, then apply the meta bit */
        v = fetch_escaped_value(&p, end, env, &c);
        if (v < 0) return v;
      }
      c = ((c & 0xff) | 0x80);
    }
    else
      goto backslash;
    break;

  case 'C':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
      /* continue inside the 'c' case; its syntax test is bypassed */
      goto control;
    }
    else
      goto backslash;

  case 'c':
    if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
    control:
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c == '?') {
        c = 0177;
      }
      else {
        if (c == MC_ESC(env->syntax)) {
          /* nested escape: decode it first, then apply the control mask */
          v = fetch_escaped_value(&p, end, env, &c);
          if (v < 0) return v;
        }
        c &= 0x9f;
      }
      break;
    }
    /* fall through */

  default:
    {
    backslash:
      c = conv_backslash_value(c, env);
    }
    break;
  }

  *src = p;
  *val = c;
  return 0;
}
4443 
4444 static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env);
4445 
4446 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)4447 get_name_end_code_point(OnigCodePoint start)
4448 {
4449   switch (start) {
4450   case '<':  return (OnigCodePoint )'>';  break;
4451   case '\'': return (OnigCodePoint )'\''; break;
4452   case '(':  return (OnigCodePoint )')';  break;
4453   default:
4454     break;
4455   }
4456 
4457   return (OnigCodePoint )0;
4458 }
4459 
/* How the text between the name delimiters of a group reference was written. */
enum REF_NUM {
  IS_NOT_NUM = 0,  /* a name, not a number */
  IS_ABS_NUM,      /* starts with a digit: absolute group number (value 1) */
  IS_REL_NUM       /* starts with '+' or '-': relative group number (value 2) */
};
4465 
4466 #ifdef USE_BACKREF_WITH_LEVEL
4467 /*
4468    \k<name+n>, \k<name-n>
4469    \k<num+n>,  \k<num-n>
4470    \k<-num+n>, \k<-num-n>
4471    \k<+num+n>, \k<+num-n>
4472 */
4473 static int
fetch_name_with_level(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int * rlevel,enum REF_NUM * num_type)4474 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
4475                       UChar** rname_end, ScanEnv* env,
4476                       int* rback_num, int* rlevel, enum REF_NUM* num_type)
4477 {
4478   int r, sign, exist_level;
4479   int digit_count;
4480   OnigCodePoint end_code;
4481   OnigCodePoint c = 0;
4482   OnigEncoding enc = env->enc;
4483   UChar *name_end;
4484   UChar *pnum_head;
4485   UChar *p = *src;
4486   PFETCH_READY;
4487 
4488   *rback_num = 0;
4489   exist_level = 0;
4490   *num_type = IS_NOT_NUM;
4491   sign = 1;
4492   pnum_head = *src;
4493 
4494   end_code = get_name_end_code_point(start_code);
4495 
4496   digit_count = 0;
4497   name_end = end;
4498   r = 0;
4499   if (PEND) {
4500     return ONIGERR_EMPTY_GROUP_NAME;
4501   }
4502   else {
4503     PFETCH(c);
4504     if (c == end_code)
4505       return ONIGERR_EMPTY_GROUP_NAME;
4506 
4507     if (IS_CODE_DIGIT_ASCII(enc, c)) {
4508       *num_type = IS_ABS_NUM;
4509       digit_count++;
4510     }
4511     else if (c == '-') {
4512       *num_type = IS_REL_NUM;
4513       sign = -1;
4514       pnum_head = p;
4515     }
4516     else if (c == '+') {
4517       *num_type = IS_REL_NUM;
4518       sign = 1;
4519       pnum_head = p;
4520     }
4521     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
4522       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
4523     }
4524   }
4525 
4526   while (!PEND) {
4527     name_end = p;
4528     PFETCH(c);
4529     if (c == end_code || c == ')' || c == '+' || c == '-') {
4530       if (*num_type != IS_NOT_NUM && digit_count == 0)
4531         r = ONIGERR_INVALID_GROUP_NAME;
4532       break;
4533     }
4534 
4535     if (*num_type != IS_NOT_NUM) {
4536       if (IS_CODE_DIGIT_ASCII(enc, c)) {
4537         digit_count++;
4538       }
4539       else {
4540         r = ONIGERR_INVALID_GROUP_NAME;
4541         *num_type = IS_NOT_NUM;
4542       }
4543     }
4544     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
4545       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
4546     }
4547   }
4548 
4549   if (r == 0 && c != end_code) {
4550     if (c == '+' || c == '-') {
4551       int level;
4552       int flag = (c == '-' ? -1 : 1);
4553 
4554       if (PEND) {
4555         r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
4556         goto end;
4557       }
4558       PFETCH(c);
4559       if (! IS_CODE_DIGIT_ASCII(enc, c)) goto err;
4560       PUNFETCH;
4561       level = scan_number(&p, end, enc);
4562       if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
4563       *rlevel = (level * flag);
4564       exist_level = 1;
4565 
4566       if (!PEND) {
4567         PFETCH(c);
4568         if (c == end_code)
4569           goto end;
4570       }
4571     }
4572 
4573   err:
4574     name_end = end;
4575   err2:
4576     r = ONIGERR_INVALID_GROUP_NAME;
4577   }
4578 
4579  end:
4580   if (r == 0) {
4581     if (*num_type != IS_NOT_NUM) {
4582       *rback_num = scan_number(&pnum_head, name_end, enc);
4583       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
4584       else if (*rback_num == 0) {
4585         if (*num_type == IS_REL_NUM)
4586           goto err2;
4587       }
4588 
4589       *rback_num *= sign;
4590     }
4591 
4592     *rname_end = name_end;
4593     *src = p;
4594     return (exist_level ? 1 : 0);
4595   }
4596   else {
4597     onig_scan_env_set_error_string(env, r, *src, name_end);
4598     return r;
4599   }
4600 }
4601 #endif /* USE_BACKREF_WITH_LEVEL */
4602 
4603 /*
4604   ref: 0 -> define name    (don't allow number name)
4605        1 -> reference name (allow number name)
4606 */
4607 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,enum REF_NUM * num_type,int is_ref)4608 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
4609            UChar** rname_end, ScanEnv* env, int* rback_num,
4610            enum REF_NUM* num_type, int is_ref)
4611 {
4612   int r, sign;
4613   int digit_count;
4614   OnigCodePoint end_code;
4615   OnigCodePoint c = 0;
4616   OnigEncoding enc = env->enc;
4617   UChar *name_end;
4618   UChar *pnum_head;
4619   UChar *p = *src;
4620 
4621   *rback_num = 0;
4622 
4623   end_code = get_name_end_code_point(start_code);
4624 
4625   digit_count = 0;
4626   name_end = end;
4627   pnum_head = *src;
4628   r = 0;
4629   *num_type = IS_NOT_NUM;
4630   sign = 1;
4631   if (PEND) {
4632     return ONIGERR_EMPTY_GROUP_NAME;
4633   }
4634   else {
4635     PFETCH_S(c);
4636     if (c == end_code)
4637       return ONIGERR_EMPTY_GROUP_NAME;
4638 
4639     if (IS_CODE_DIGIT_ASCII(enc, c)) {
4640       if (is_ref == TRUE)
4641         *num_type = IS_ABS_NUM;
4642       else {
4643         r = ONIGERR_INVALID_GROUP_NAME;
4644       }
4645       digit_count++;
4646     }
4647     else if (c == '-') {
4648       if (is_ref == TRUE) {
4649         *num_type = IS_REL_NUM;
4650         sign = -1;
4651         pnum_head = p;
4652       }
4653       else {
4654         r = ONIGERR_INVALID_GROUP_NAME;
4655       }
4656     }
4657     else if (c == '+') {
4658       if (is_ref == TRUE) {
4659         *num_type = IS_REL_NUM;
4660         sign = 1;
4661         pnum_head = p;
4662       }
4663       else {
4664         r = ONIGERR_INVALID_GROUP_NAME;
4665       }
4666     }
4667     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
4668       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
4669     }
4670   }
4671 
4672   if (r == 0) {
4673     while (!PEND) {
4674       name_end = p;
4675       PFETCH_S(c);
4676       if (c == end_code || c == ')') {
4677         if (*num_type != IS_NOT_NUM && digit_count == 0)
4678           r = ONIGERR_INVALID_GROUP_NAME;
4679         break;
4680       }
4681 
4682       if (*num_type != IS_NOT_NUM) {
4683         if (IS_CODE_DIGIT_ASCII(enc, c)) {
4684           digit_count++;
4685         }
4686         else {
4687           if (!ONIGENC_IS_CODE_WORD(enc, c))
4688             r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
4689           else
4690             r = ONIGERR_INVALID_GROUP_NAME;
4691 
4692           *num_type = IS_NOT_NUM;
4693         }
4694       }
4695       else {
4696         if (!ONIGENC_IS_CODE_WORD(enc, c)) {
4697           r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
4698         }
4699       }
4700     }
4701 
4702     if (c != end_code) {
4703       r = ONIGERR_INVALID_GROUP_NAME;
4704       goto err;
4705     }
4706 
4707     if (*num_type != IS_NOT_NUM) {
4708       *rback_num = scan_number(&pnum_head, name_end, enc);
4709       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
4710       else if (*rback_num == 0) {
4711         if (*num_type == IS_REL_NUM) {
4712           r = ONIGERR_INVALID_GROUP_NAME;
4713           goto err;
4714         }
4715       }
4716 
4717       *rback_num *= sign;
4718     }
4719 
4720     *rname_end = name_end;
4721     *src = p;
4722     return 0;
4723   }
4724   else {
4725     while (!PEND) {
4726       name_end = p;
4727       PFETCH_S(c);
4728       if (c == end_code || c == ')')
4729         break;
4730     }
4731     if (PEND)
4732       name_end = end;
4733 
4734   err:
4735     onig_scan_env_set_error_string(env, r, *src, name_end);
4736     return r;
4737   }
4738 }
4739 
4740 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)4741 CC_ESC_WARN(ScanEnv* env, UChar *c)
4742 {
4743   if (onig_warn == onig_null_warn) return ;
4744 
4745   if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
4746       IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
4747     UChar buf[WARN_BUFSIZE];
4748     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4749                                env->pattern, env->pattern_end,
4750                                (UChar* )"character class has '%s' without escape",
4751                                c);
4752     (*onig_warn)((char* )buf);
4753   }
4754 }
4755 
4756 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)4757 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
4758 {
4759   if (onig_warn == onig_null_warn) return ;
4760 
4761   if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
4762     UChar buf[WARN_BUFSIZE];
4763     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
4764                          (env)->pattern, (env)->pattern_end,
4765                          (UChar* )"regular expression has '%s' without escape", c);
4766     (*onig_warn)((char* )buf);
4767   }
4768 }
4769 
4770 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)4771 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
4772                   UChar **next, OnigEncoding enc)
4773 {
4774   int i;
4775   OnigCodePoint x;
4776   UChar *q;
4777   UChar *p = from;
4778 
4779   while (p < to) {
4780     x = ONIGENC_MBC_TO_CODE(enc, p, to);
4781     q = p + enclen(enc, p);
4782     if (x == s[0]) {
4783       for (i = 1; i < n && q < to; i++) {
4784         x = ONIGENC_MBC_TO_CODE(enc, q, to);
4785         if (x != s[i]) break;
4786         q += enclen(enc, q);
4787       }
4788       if (i >= n) {
4789         if (IS_NOT_NULL(next))
4790           *next = q;
4791         return p;
4792       }
4793     }
4794     p = q;
4795   }
4796   return NULL_UCHARP;
4797 }
4798 
4799 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)4800 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
4801                          OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
4802 {
4803   int i, in_esc;
4804   OnigCodePoint x;
4805   UChar *q;
4806   UChar *p = from;
4807 
4808   in_esc = 0;
4809   while (p < to) {
4810     if (in_esc) {
4811       in_esc = 0;
4812       p += enclen(enc, p);
4813     }
4814     else {
4815       x = ONIGENC_MBC_TO_CODE(enc, p, to);
4816       q = p + enclen(enc, p);
4817       if (x == s[0]) {
4818         for (i = 1; i < n && q < to; i++) {
4819           x = ONIGENC_MBC_TO_CODE(enc, q, to);
4820           if (x != s[i]) break;
4821           q += enclen(enc, q);
4822         }
4823         if (i >= n) return 1;
4824         p += enclen(enc, p);
4825       }
4826       else {
4827         x = ONIGENC_MBC_TO_CODE(enc, p, to);
4828         if (x == bad) return 0;
4829         else if (x == MC_ESC(syn)) in_esc = 1;
4830         p = q;
4831       }
4832     }
4833   }
4834   return 0;
4835 }
4836 
4837 static int
fetch_token_in_cc(PToken * tok,UChar ** src,UChar * end,ScanEnv * env)4838 fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
4839 {
4840   int r;
4841   OnigCodePoint code;
4842   OnigCodePoint c, c2;
4843   OnigSyntaxType* syn = env->syntax;
4844   OnigEncoding enc = env->enc;
4845   UChar* prev;
4846   UChar* p = *src;
4847   PFETCH_READY;
4848 
4849   if (PEND) {
4850     tok->type = TK_EOT;
4851     return tok->type;
4852   }
4853 
4854   PFETCH(c);
4855   tok->type = TK_CHAR;
4856   tok->base = 0;
4857   tok->u.code = c;
4858   tok->escaped = 0;
4859 
4860   if (c == ']') {
4861     tok->type = TK_CC_CLOSE;
4862   }
4863   else if (c == '-') {
4864     tok->type = TK_CC_RANGE;
4865   }
4866   else if (c == MC_ESC(syn)) {
4867     if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
4868       goto end;
4869 
4870     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
4871 
4872     PFETCH(c);
4873     tok->escaped = 1;
4874     tok->u.code = c;
4875     switch (c) {
4876     case 'w':
4877       tok->type = TK_CHAR_TYPE;
4878       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
4879       tok->u.prop.not   = 0;
4880       break;
4881     case 'W':
4882       tok->type = TK_CHAR_TYPE;
4883       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
4884       tok->u.prop.not   = 1;
4885       break;
4886     case 'd':
4887       tok->type = TK_CHAR_TYPE;
4888       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
4889       tok->u.prop.not   = 0;
4890       break;
4891     case 'D':
4892       tok->type = TK_CHAR_TYPE;
4893       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
4894       tok->u.prop.not   = 1;
4895       break;
4896     case 's':
4897       tok->type = TK_CHAR_TYPE;
4898       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
4899       tok->u.prop.not   = 0;
4900       break;
4901     case 'S':
4902       tok->type = TK_CHAR_TYPE;
4903       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
4904       tok->u.prop.not   = 1;
4905       break;
4906     case 'h':
4907       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
4908       tok->type = TK_CHAR_TYPE;
4909       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
4910       tok->u.prop.not   = 0;
4911       break;
4912     case 'H':
4913       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
4914       tok->type = TK_CHAR_TYPE;
4915       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
4916       tok->u.prop.not   = 1;
4917       break;
4918 
4919     case 'p':
4920     case 'P':
4921       if (PEND) break;
4922 
4923       c2 = PPEEK;
4924       if (c2 == '{' &&
4925           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
4926         PINC;
4927         tok->type = TK_CHAR_PROPERTY;
4928         tok->u.prop.not = c == 'P';
4929 
4930         if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
4931           PFETCH(c2);
4932           if (c2 == '^') {
4933             tok->u.prop.not = tok->u.prop.not == 0;
4934           }
4935           else
4936             PUNFETCH;
4937         }
4938       }
4939       break;
4940 
4941     case 'o':
4942       if (PEND) break;
4943 
4944       prev = p;
4945       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
4946         PINC;
4947         r = scan_octal_number(&p, end, 0, 11, enc, &code);
4948         if (r < 0) return r;
4949         if (!PEND) {
4950           c2 = PPEEK;
4951           if (IS_CODE_DIGIT_ASCII(enc, c2))
4952             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
4953         }
4954 
4955         if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
4956           PINC;
4957           tok->type   = TK_CODE_POINT;
4958           tok->base   = 8;
4959           tok->u.code = code;
4960         }
4961         else {
4962           /* can't read nothing or invalid format */
4963           p = prev;
4964         }
4965       }
4966       break;
4967 
4968     case 'x':
4969       if (PEND) break;
4970 
4971       prev = p;
4972       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
4973         PINC;
4974         r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code);
4975         if (r < 0) return r;
4976         if (!PEND) {
4977           c2 = PPEEK;
4978           if (IS_CODE_XDIGIT_ASCII(enc, c2))
4979             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
4980         }
4981 
4982         if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
4983           PINC;
4984           tok->type   = TK_CODE_POINT;
4985           tok->base   = 16;
4986           tok->u.code = code;
4987         }
4988         else {
4989           /* can't read nothing or invalid format */
4990           p = prev;
4991         }
4992       }
4993       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
4994         r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code);
4995         if (r < 0) return r;
4996         if (p == prev) {  /* can't read nothing. */
4997           code = 0; /* but, it's not error */
4998         }
4999         tok->type = TK_CRUDE_BYTE;
5000         tok->base = 16;
5001         tok->u.byte = (UChar )code;
5002       }
5003       break;
5004 
5005     case 'u':
5006       if (PEND) break;
5007 
5008       prev = p;
5009       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
5010         r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code);
5011         if (r < 0) return r;
5012         if (p == prev) {  /* can't read nothing. */
5013           code = 0; /* but, it's not error */
5014         }
5015         tok->type   = TK_CODE_POINT;
5016         tok->base   = 16;
5017         tok->u.code = code;
5018       }
5019       break;
5020 
5021     case '0':
5022     case '1': case '2': case '3': case '4': case '5': case '6': case '7':
5023       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
5024         PUNFETCH;
5025         prev = p;
5026         r = scan_octal_number(&p, end, 0, 3, enc, &code);
5027         if (r < 0) return r;
5028         if (code >= 256) return ONIGERR_TOO_BIG_NUMBER;
5029         if (p == prev) {  /* can't read nothing. */
5030           code = 0; /* but, it's not error */
5031         }
5032         tok->type = TK_CRUDE_BYTE;
5033         tok->base = 8;
5034         tok->u.byte = (UChar )code;
5035       }
5036       break;
5037 
5038     default:
5039       PUNFETCH;
5040       r = fetch_escaped_value(&p, end, env, &c2);
5041       if (r < 0) return r;
5042       if (tok->u.code != c2) {
5043         tok->u.code = c2;
5044         tok->type   = TK_CODE_POINT;
5045       }
5046       break;
5047     }
5048   }
5049   else if (c == '[') {
5050     if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
5051       OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
5052       tok->backp = p; /* point at '[' is read */
5053       PINC;
5054       if (str_exist_check_with_esc(send, 2, p, end,
5055                                    (OnigCodePoint )']', enc, syn)) {
5056         tok->type = TK_CC_POSIX_BRACKET_OPEN;
5057       }
5058       else {
5059         PUNFETCH;
5060         goto cc_in_cc;
5061       }
5062     }
5063     else {
5064     cc_in_cc:
5065       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
5066         tok->type = TK_CC_OPEN_CC;
5067       }
5068       else {
5069         CC_ESC_WARN(env, (UChar* )"[");
5070       }
5071     }
5072   }
5073   else if (c == '&') {
5074     if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
5075         !PEND && (PPEEK_IS('&'))) {
5076       PINC;
5077       tok->type = TK_CC_AND;
5078     }
5079   }
5080 
5081  end:
5082   *src = p;
5083   return tok->type;
5084 }
5085 
5086 static int
fetch_token(PToken * tok,UChar ** src,UChar * end,ScanEnv * env)5087 fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
5088 {
5089   int r;
5090   OnigCodePoint code;
5091   OnigCodePoint c;
5092   OnigEncoding enc = env->enc;
5093   OnigSyntaxType* syn = env->syntax;
5094   UChar* prev;
5095   UChar* p = *src;
5096   PFETCH_READY;
5097 
5098  start:
5099   if (PEND) {
5100     tok->type = TK_EOT;
5101     return tok->type;
5102   }
5103 
5104   tok->type  = TK_STRING;
5105   tok->base  = 0;
5106   tok->backp = p;
5107 
5108   PFETCH(c);
5109   if (IS_MC_ESC_CODE(c, syn)) {
5110     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
5111 
5112     tok->backp = p;
5113     PFETCH(c);
5114 
5115     tok->u.code = c;
5116     tok->escaped = 1;
5117     switch (c) {
5118     case '*':
5119       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
5120       tok->type = TK_REPEAT;
5121       tok->u.repeat.lower = 0;
5122       tok->u.repeat.upper = INFINITE_REPEAT;
5123       goto greedy_check;
5124       break;
5125 
5126     case '+':
5127       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
5128       tok->type = TK_REPEAT;
5129       tok->u.repeat.lower = 1;
5130       tok->u.repeat.upper = INFINITE_REPEAT;
5131       goto greedy_check;
5132       break;
5133 
5134     case '?':
5135       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
5136       tok->type = TK_REPEAT;
5137       tok->u.repeat.lower = 0;
5138       tok->u.repeat.upper = 1;
5139     greedy_check:
5140       tok->u.repeat.possessive = 0;
5141     greedy_check2:
5142       if (!PEND && PPEEK_IS('?') &&
5143           IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY) &&
5144           tok->u.repeat.possessive == 0) {
5145         PFETCH(c);
5146         tok->u.repeat.greedy = 0;
5147         tok->u.repeat.possessive = 0;
5148       }
5149       else {
5150       possessive_check:
5151         tok->u.repeat.greedy = 1;
5152         if (!PEND && PPEEK_IS('+') &&
5153             ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
5154               tok->type != TK_INTERVAL)  ||
5155              (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
5156               tok->type == TK_INTERVAL)) &&
5157           tok->u.repeat.possessive == 0) {
5158           PFETCH(c);
5159           tok->u.repeat.possessive = 1;
5160         }
5161       }
5162       break;
5163 
5164     case '{':
5165       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
5166       r = fetch_interval(&p, end, tok, env);
5167       if (r < 0) return r;  /* error */
5168       if (r == 0) goto greedy_check2;
5169       else if (r == 2) { /* {n} */
5170         if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
5171           goto possessive_check;
5172 
5173         goto greedy_check2;
5174       }
5175       /* r == 1 : normal char */
5176       break;
5177 
5178     case '|':
5179       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
5180       tok->type = TK_ALT;
5181       break;
5182 
5183     case '(':
5184       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
5185       tok->type = TK_SUBEXP_OPEN;
5186       break;
5187 
5188     case ')':
5189       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
5190       tok->type = TK_SUBEXP_CLOSE;
5191       break;
5192 
5193     case 'w':
5194       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
5195       tok->type = TK_CHAR_TYPE;
5196       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
5197       tok->u.prop.not   = 0;
5198       break;
5199 
5200     case 'W':
5201       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
5202       tok->type = TK_CHAR_TYPE;
5203       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
5204       tok->u.prop.not   = 1;
5205       break;
5206 
5207     case 'b':
5208       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
5209       tok->type = TK_ANCHOR;
5210       tok->u.anchor = ANCR_WORD_BOUNDARY;
5211       break;
5212 
5213     case 'B':
5214       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
5215       tok->type = TK_ANCHOR;
5216       tok->u.anchor = ANCR_NO_WORD_BOUNDARY;
5217       break;
5218 
5219     case 'y':
5220       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break;
5221       tok->type = TK_ANCHOR;
5222       tok->u.anchor = ANCR_TEXT_SEGMENT_BOUNDARY;
5223       break;
5224 
5225     case 'Y':
5226       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break;
5227       tok->type = TK_ANCHOR;
5228       tok->u.anchor = ANCR_NO_TEXT_SEGMENT_BOUNDARY;
5229       break;
5230 
5231 #ifdef USE_WORD_BEGIN_END
5232     case '<':
5233       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
5234       tok->type = TK_ANCHOR;
5235       tok->u.anchor = ANCR_WORD_BEGIN;
5236       break;
5237 
5238     case '>':
5239       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
5240       tok->type = TK_ANCHOR;
5241       tok->u.anchor = ANCR_WORD_END;
5242       break;
5243 #endif
5244 
5245     case 's':
5246       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
5247       tok->type = TK_CHAR_TYPE;
5248       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
5249       tok->u.prop.not   = 0;
5250       break;
5251 
5252     case 'S':
5253       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
5254       tok->type = TK_CHAR_TYPE;
5255       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
5256       tok->u.prop.not   = 1;
5257       break;
5258 
5259     case 'd':
5260       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
5261       tok->type = TK_CHAR_TYPE;
5262       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
5263       tok->u.prop.not   = 0;
5264       break;
5265 
5266     case 'D':
5267       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
5268       tok->type = TK_CHAR_TYPE;
5269       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
5270       tok->u.prop.not   = 1;
5271       break;
5272 
5273     case 'h':
5274       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
5275       tok->type = TK_CHAR_TYPE;
5276       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
5277       tok->u.prop.not   = 0;
5278       break;
5279 
5280     case 'H':
5281       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
5282       tok->type = TK_CHAR_TYPE;
5283       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
5284       tok->u.prop.not   = 1;
5285       break;
5286 
5287     case 'K':
5288       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) break;
5289       tok->type = TK_KEEP;
5290       break;
5291 
5292     case 'R':
5293       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE)) break;
5294       tok->type = TK_GENERAL_NEWLINE;
5295       break;
5296 
5297     case 'N':
5298       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT)) break;
5299       tok->type = TK_NO_NEWLINE;
5300       break;
5301 
5302     case 'O':
5303       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT)) break;
5304       tok->type = TK_TRUE_ANYCHAR;
5305       break;
5306 
5307     case 'X':
5308       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break;
5309       tok->type = TK_TEXT_SEGMENT;
5310       break;
5311 
5312     case 'A':
5313       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
5314     begin_buf:
5315       tok->type = TK_ANCHOR;
5316       tok->u.subtype = ANCR_BEGIN_BUF;
5317       break;
5318 
5319     case 'Z':
5320       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
5321       tok->type = TK_ANCHOR;
5322       tok->u.subtype = ANCR_SEMI_END_BUF;
5323       break;
5324 
5325     case 'z':
5326       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
5327     end_buf:
5328       tok->type = TK_ANCHOR;
5329       tok->u.subtype = ANCR_END_BUF;
5330       break;
5331 
5332     case 'G':
5333       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
5334       tok->type = TK_ANCHOR;
5335       tok->u.subtype = ANCR_BEGIN_POSITION;
5336       break;
5337 
5338     case '`':
5339       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
5340       goto begin_buf;
5341       break;
5342 
5343     case '\'':
5344       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
5345       goto end_buf;
5346       break;
5347 
5348     case 'o':
5349       if (PEND) break;
5350 
5351       prev = p;
5352       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
5353         PINC;
5354         r = scan_octal_number(&p, end, 0, 11, enc, &code);
5355         if (r < 0) return r;
5356         if (!PEND) {
5357           if (IS_CODE_DIGIT_ASCII(enc, PPEEK))
5358             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
5359         }
5360 
5361         if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
5362           PINC;
5363           tok->type   = TK_CODE_POINT;
5364           tok->u.code = code;
5365         }
5366         else {
5367           /* can't read nothing or invalid format */
5368           p = prev;
5369         }
5370       }
5371       break;
5372 
5373     case 'x':
5374       if (PEND) break;
5375 
5376       prev = p;
5377       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
5378         PINC;
5379         r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code);
5380         if (r < 0) return r;
5381         if (!PEND) {
5382           if (IS_CODE_XDIGIT_ASCII(enc, PPEEK))
5383             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
5384         }
5385 
5386         if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
5387           PINC;
5388           tok->type   = TK_CODE_POINT;
5389           tok->u.code = code;
5390         }
5391         else {
5392           /* can't read nothing or invalid format */
5393           p = prev;
5394         }
5395       }
5396       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
5397         r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code);
5398         if (r < 0) return r;
5399         if (p == prev) {  /* can't read nothing. */
5400           code = 0; /* but, it's not error */
5401         }
5402         tok->type = TK_CRUDE_BYTE;
5403         tok->base = 16;
5404         tok->u.byte = (UChar )code;
5405       }
5406       break;
5407 
5408     case 'u':
5409       if (PEND) break;
5410 
5411       prev = p;
5412       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
5413         r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code);
5414         if (r < 0) return r;
5415         if (p == prev) {  /* can't read nothing. */
5416           code = 0; /* but, it's not error */
5417         }
5418         tok->type   = TK_CODE_POINT;
5419         tok->base   = 16;
5420         tok->u.code = code;
5421       }
5422       break;
5423 
5424     case '1': case '2': case '3': case '4':
5425     case '5': case '6': case '7': case '8': case '9':
5426       PUNFETCH;
5427       prev = p;
5428       r = scan_number(&p, end, enc);
5429       if (r < 0 || r > ONIG_MAX_BACKREF_NUM) {
5430         goto skip_backref;
5431       }
5432 
5433       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
5434           (r <= env->num_mem || r <= 9)) { /* This spec. from GNU regex */
5435         if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5436           if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node))
5437             return ONIGERR_INVALID_BACKREF;
5438         }
5439 
5440         tok->type = TK_BACKREF;
5441         tok->u.backref.num     = 1;
5442         tok->u.backref.ref1    = r;
5443         tok->u.backref.by_name = 0;
5444 #ifdef USE_BACKREF_WITH_LEVEL
5445         tok->u.backref.exist_level = 0;
5446 #endif
5447         break;
5448       }
5449 
5450     skip_backref:
5451       if (c == '8' || c == '9') {
5452         /* normal char */
5453         p = prev; PINC;
5454         break;
5455       }
5456 
5457       p = prev;
5458       /* fall through */
5459     case '0':
5460       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
5461         prev = p;
5462         r = scan_octal_number(&p, end, 0, (c == '0' ? 2:3), enc, &code);
5463         if (r < 0 || r >= 256) return ONIGERR_TOO_BIG_NUMBER;
5464         if (p == prev) {  /* can't read nothing. */
5465           code = 0; /* but, it's not error */
5466         }
5467         tok->type = TK_CRUDE_BYTE;
5468         tok->base = 8;
5469         tok->u.byte = (UChar )code;
5470       }
5471       else if (c != '0') {
5472         PINC;
5473       }
5474       break;
5475 
5476     case 'k':
5477       if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
5478         PFETCH(c);
5479         if (c == '<' || c == '\'') {
5480           UChar* name_end;
5481           int* backs;
5482           int back_num;
5483           enum REF_NUM num_type;
5484 
5485           prev = p;
5486 
5487 #ifdef USE_BACKREF_WITH_LEVEL
5488           name_end = NULL_UCHARP; /* no need. escape gcc warning. */
5489           r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
5490                                  env, &back_num, &tok->u.backref.level, &num_type);
5491           if (r == 1) tok->u.backref.exist_level = 1;
5492           else        tok->u.backref.exist_level = 0;
5493 #else
5494           r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, TRUE);
5495 #endif
5496           if (r < 0) return r;
5497 
5498           if (num_type != IS_NOT_NUM) {
5499             if (num_type == IS_REL_NUM) {
5500               back_num = backref_rel_to_abs(back_num, env);
5501             }
5502             if (back_num <= 0)
5503               return ONIGERR_INVALID_BACKREF;
5504 
5505             if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5506               if (back_num > env->num_mem ||
5507                   IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node))
5508                 return ONIGERR_INVALID_BACKREF;
5509             }
5510             tok->type = TK_BACKREF;
5511             tok->u.backref.by_name = 0;
5512             tok->u.backref.num  = 1;
5513             tok->u.backref.ref1 = back_num;
5514           }
5515           else {
5516             int num = name_to_group_numbers(env, prev, name_end, &backs);
5517             if (num <= 0) {
5518               return ONIGERR_UNDEFINED_NAME_REFERENCE;
5519             }
5520             if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5521               int i;
5522               for (i = 0; i < num; i++) {
5523                 if (backs[i] > env->num_mem ||
5524                     IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node))
5525                   return ONIGERR_INVALID_BACKREF;
5526               }
5527             }
5528 
5529             tok->type = TK_BACKREF;
5530             tok->u.backref.by_name = 1;
5531             if (num == 1) {
5532               tok->u.backref.num  = 1;
5533               tok->u.backref.ref1 = backs[0];
5534             }
5535             else {
5536               tok->u.backref.num  = num;
5537               tok->u.backref.refs = backs;
5538             }
5539           }
5540         }
5541         else
5542           PUNFETCH;
5543       }
5544       break;
5545 
5546 #ifdef USE_CALL
5547     case 'g':
5548       if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
5549         PFETCH(c);
5550         if (c == '<' || c == '\'') {
5551           int gnum;
5552           UChar* name_end;
5553           enum REF_NUM num_type;
5554 
5555           prev = p;
5556           r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env,
5557                          &gnum, &num_type, TRUE);
5558           if (r < 0) return r;
5559 
5560           if (num_type != IS_NOT_NUM) {
5561             if (num_type == IS_REL_NUM) {
5562               gnum = backref_rel_to_abs(gnum, env);
5563               if (gnum < 0) {
5564                 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
5565                                                prev, name_end);
5566                 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
5567               }
5568             }
5569             tok->u.call.by_number = 1;
5570             tok->u.call.gnum      = gnum;
5571           }
5572           else {
5573             tok->u.call.by_number = 0;
5574             tok->u.call.gnum      = 0;
5575           }
5576 
5577           tok->type = TK_CALL;
5578           tok->u.call.name     = prev;
5579           tok->u.call.name_end = name_end;
5580         }
5581         else
5582           PUNFETCH;
5583       }
5584       break;
5585 #endif
5586 
5587     case 'Q':
5588       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
5589         tok->type = TK_QUOTE_OPEN;
5590       }
5591       break;
5592 
5593     case 'p':
5594     case 'P':
5595       if (!PEND && PPEEK_IS('{') &&
5596           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
5597         PINC;
5598         tok->type = TK_CHAR_PROPERTY;
5599         tok->u.prop.not = c == 'P';
5600 
5601         if (!PEND &&
5602             IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
5603           PFETCH(c);
5604           if (c == '^') {
5605             tok->u.prop.not = tok->u.prop.not == 0;
5606           }
5607           else
5608             PUNFETCH;
5609         }
5610       }
5611       break;
5612 
5613     default:
5614       {
5615         OnigCodePoint c2;
5616 
5617         PUNFETCH;
5618         r = fetch_escaped_value(&p, end, env, &c2);
5619         if (r < 0) return r;
5620         if (tok->u.code != c2) {
5621           tok->type = TK_CODE_POINT;
5622           tok->u.code = c2;
5623         }
5624         else { /* string */
5625           p = tok->backp + enclen(enc, tok->backp);
5626         }
5627       }
5628       break;
5629     }
5630   }
5631   else {
5632     tok->u.code = c;
5633     tok->escaped = 0;
5634 
5635 #ifdef USE_VARIABLE_META_CHARS
5636     if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
5637         IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
5638       if (c == MC_ANYCHAR(syn))
5639         goto any_char;
5640       else if (c == MC_ANYTIME(syn))
5641         goto any_time;
5642       else if (c == MC_ZERO_OR_ONE_TIME(syn))
5643         goto zero_or_one_time;
5644       else if (c == MC_ONE_OR_MORE_TIME(syn))
5645         goto one_or_more_time;
5646       else if (c == MC_ANYCHAR_ANYTIME(syn)) {
5647         tok->type = TK_ANYCHAR_ANYTIME;
5648         goto out;
5649       }
5650     }
5651 #endif
5652 
5653     switch (c) {
5654     case '.':
5655       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
5656 #ifdef USE_VARIABLE_META_CHARS
5657     any_char:
5658 #endif
5659       tok->type = TK_ANYCHAR;
5660       break;
5661 
5662     case '*':
5663       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
5664 #ifdef USE_VARIABLE_META_CHARS
5665     any_time:
5666 #endif
5667       tok->type = TK_REPEAT;
5668       tok->u.repeat.lower = 0;
5669       tok->u.repeat.upper = INFINITE_REPEAT;
5670       goto greedy_check;
5671       break;
5672 
5673     case '+':
5674       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
5675 #ifdef USE_VARIABLE_META_CHARS
5676     one_or_more_time:
5677 #endif
5678       tok->type = TK_REPEAT;
5679       tok->u.repeat.lower = 1;
5680       tok->u.repeat.upper = INFINITE_REPEAT;
5681       goto greedy_check;
5682       break;
5683 
5684     case '?':
5685       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
5686 #ifdef USE_VARIABLE_META_CHARS
5687     zero_or_one_time:
5688 #endif
5689       tok->type = TK_REPEAT;
5690       tok->u.repeat.lower = 0;
5691       tok->u.repeat.upper = 1;
5692       goto greedy_check;
5693       break;
5694 
5695     case '{':
5696       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
5697       r = fetch_interval(&p, end, tok, env);
5698       if (r < 0) return r;  /* error */
5699       if (r == 0) goto greedy_check2;
5700       else if (r == 2) { /* {n} */
5701         if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
5702           goto possessive_check;
5703 
5704         goto greedy_check2;
5705       }
5706       /* r == 1 : normal char */
5707       break;
5708 
5709     case '|':
5710       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
5711       tok->type = TK_ALT;
5712       break;
5713 
5714     case '(':
5715       if (!PEND && PPEEK_IS('?') &&
5716           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
5717         PINC;
5718         if (! PEND) {
5719           c = PPEEK;
5720           if (c == '#') {
5721             PFETCH(c);
5722             while (1) {
5723               if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
5724               PFETCH(c);
5725               if (c == MC_ESC(syn)) {
5726                 if (! PEND) PFETCH(c);
5727               }
5728               else {
5729                 if (c == ')') break;
5730               }
5731             }
5732             goto start;
5733           }
5734           else if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_PERL_SUBEXP_CALL)) {
5735             int gnum;
5736             UChar* name;
5737             UChar* name_end;
5738             enum REF_NUM num_type;
5739 
5740             switch (c) {
5741             case '&':
5742               {
5743                 PINC;
5744                 name = p;
5745                 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env,
5746                                &gnum, &num_type, FALSE);
5747                 if (r < 0) return r;
5748 
5749                 tok->type = TK_CALL;
5750                 tok->u.call.by_number = 0;
5751                 tok->u.call.gnum      = 0;
5752                 tok->u.call.name      = name;
5753                 tok->u.call.name_end  = name_end;
5754               }
5755               break;
5756 
5757             case 'R':
5758               tok->type = TK_CALL;
5759               tok->u.call.by_number = 1;
5760               tok->u.call.gnum      = 0;
5761               tok->u.call.name      = p;
5762               PINC;
5763               if (! PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION;
5764               tok->u.call.name_end  = p;
5765               break;
5766 
5767             case '-':
5768             case '+':
5769               goto lparen_qmark_num;
5770               break;
5771             default:
5772               if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto lparen_qmark_end;
5773 
5774             lparen_qmark_num:
5775               {
5776                 name = p;
5777                 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env,
5778                                &gnum, &num_type, TRUE);
5779                 if (r < 0) return r;
5780 
5781                 if (num_type == IS_NOT_NUM) {
5782                   return ONIGERR_INVALID_GROUP_NAME;
5783                 }
5784                 else {
5785                   if (num_type == IS_REL_NUM) {
5786                     gnum = backref_rel_to_abs(gnum, env);
5787                     if (gnum < 0) {
5788                       onig_scan_env_set_error_string(env,
5789                              ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end);
5790                       return ONIGERR_UNDEFINED_GROUP_REFERENCE;
5791                     }
5792                   }
5793                   tok->u.call.by_number = 1;
5794                   tok->u.call.gnum      = gnum;
5795                 }
5796 
5797                 tok->type = TK_CALL;
5798                 tok->u.call.name     = name;
5799                 tok->u.call.name_end = name_end;
5800               }
5801               break;
5802             }
5803           }
5804         }
5805       lparen_qmark_end:
5806         PUNFETCH;
5807       }
5808 
5809       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
5810       tok->type = TK_SUBEXP_OPEN;
5811       break;
5812 
5813     case ')':
5814       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
5815       tok->type = TK_SUBEXP_CLOSE;
5816       break;
5817 
5818     case '^':
5819       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
5820       tok->type = TK_ANCHOR;
5821       tok->u.subtype = (OPTON_SINGLELINE(env->options)
5822                         ? ANCR_BEGIN_BUF : ANCR_BEGIN_LINE);
5823       break;
5824 
5825     case '$':
5826       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
5827       tok->type = TK_ANCHOR;
5828       tok->u.subtype = (OPTON_SINGLELINE(env->options)
5829                         ? ANCR_SEMI_END_BUF : ANCR_END_LINE);
5830       break;
5831 
5832     case '[':
5833       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
5834       tok->type = TK_OPEN_CC;
5835       break;
5836 
5837     case ']':
5838       if (*src > env->pattern)   /* /].../ is allowed. */
5839         CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
5840       break;
5841 
5842     case '#':
5843       if (OPTON_EXTEND(env->options)) {
5844         while (!PEND) {
5845           PFETCH(c);
5846           if (ONIGENC_IS_CODE_NEWLINE(enc, c))
5847             break;
5848         }
5849         goto start;
5850         break;
5851       }
5852       break;
5853 
5854     case ' ': case '\t': case '\n': case '\r': case '\f':
5855       if (OPTON_EXTEND(env->options))
5856         goto start;
5857       break;
5858 
5859     default:
5860       /* string */
5861       break;
5862     }
5863   }
5864 
5865 #ifdef USE_VARIABLE_META_CHARS
5866  out:
5867 #endif
5868   *src = p;
5869   return tok->type;
5870 }
5871 
5872 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])5873 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
5874                          OnigEncoding enc ARG_UNUSED, OnigCodePoint sb_out,
5875                          const OnigCodePoint mbr[])
5876 {
5877   int i, r;
5878   OnigCodePoint j;
5879 
5880   int n = ONIGENC_CODE_RANGE_NUM(mbr);
5881 
5882   if (not == 0) {
5883     for (i = 0; i < n; i++) {
5884       for (j  = ONIGENC_CODE_RANGE_FROM(mbr, i);
5885            j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
5886         if (j >= sb_out) {
5887           if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
5888             r = add_code_range_to_buf(&(cc->mbuf), j,
5889                                       ONIGENC_CODE_RANGE_TO(mbr, i));
5890             if (r != 0) return r;
5891             i++;
5892           }
5893 
5894           goto sb_end;
5895         }
5896         BITSET_SET_BIT(cc->bs, j);
5897       }
5898     }
5899 
5900   sb_end:
5901     for ( ; i < n; i++) {
5902       r = add_code_range_to_buf(&(cc->mbuf),
5903                                 ONIGENC_CODE_RANGE_FROM(mbr, i),
5904                                 ONIGENC_CODE_RANGE_TO(mbr, i));
5905       if (r != 0) return r;
5906     }
5907   }
5908   else {
5909     OnigCodePoint prev = 0;
5910 
5911     for (i = 0; i < n; i++) {
5912       for (j = prev; j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
5913         if (j >= sb_out) {
5914           goto sb_end2;
5915         }
5916         BITSET_SET_BIT(cc->bs, j);
5917       }
5918       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
5919     }
5920     for (j = prev; j < sb_out; j++) {
5921       BITSET_SET_BIT(cc->bs, j);
5922     }
5923 
5924   sb_end2:
5925     prev = sb_out;
5926 
5927     for (i = 0; i < n; i++) {
5928       if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
5929         r = add_code_range_to_buf(&(cc->mbuf), prev,
5930                                   ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
5931         if (r != 0) return r;
5932       }
5933       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
5934       if (prev == 0) goto end;
5935     }
5936 
5937     r = add_code_range_to_buf(&(cc->mbuf), prev, MAX_CODE_POINT);
5938     if (r != 0) return r;
5939   }
5940 
5941  end:
5942   return 0;
5943 }
5944 
5945 static int
add_ctype_to_cc_by_range_limit(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[],OnigCodePoint limit)5946 add_ctype_to_cc_by_range_limit(CClassNode* cc, int ctype ARG_UNUSED, int not,
5947                                OnigEncoding enc ARG_UNUSED,
5948                                OnigCodePoint sb_out,
5949                                const OnigCodePoint mbr[], OnigCodePoint limit)
5950 {
5951   int i, r;
5952   OnigCodePoint j;
5953   OnigCodePoint from;
5954   OnigCodePoint to;
5955 
5956   int n = ONIGENC_CODE_RANGE_NUM(mbr);
5957 
5958   if (not == 0) {
5959     for (i = 0; i < n; i++) {
5960       for (j  = ONIGENC_CODE_RANGE_FROM(mbr, i);
5961            j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
5962         if (j > limit) goto end;
5963         if (j >= sb_out) {
5964           if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
5965             to = ONIGENC_CODE_RANGE_TO(mbr, i);
5966             if (to > limit) to = limit;
5967             r = add_code_range_to_buf(&(cc->mbuf), j, to);
5968             if (r != 0) return r;
5969             i++;
5970           }
5971 
5972           goto sb_end;
5973         }
5974         BITSET_SET_BIT(cc->bs, j);
5975       }
5976     }
5977 
5978   sb_end:
5979     for ( ; i < n; i++) {
5980       from = ONIGENC_CODE_RANGE_FROM(mbr, i);
5981       to   = ONIGENC_CODE_RANGE_TO(mbr, i);
5982       if (from > limit) break;
5983       if (to   > limit) to = limit;
5984       r = add_code_range_to_buf(&(cc->mbuf), from, to);
5985       if (r != 0) return r;
5986     }
5987   }
5988   else {
5989     OnigCodePoint prev = 0;
5990 
5991     for (i = 0; i < n; i++) {
5992       from = ONIGENC_CODE_RANGE_FROM(mbr, i);
5993       if (from > limit) {
5994         for (j = prev; j < sb_out; j++) {
5995           BITSET_SET_BIT(cc->bs, j);
5996         }
5997         goto sb_end2;
5998       }
5999       for (j = prev; j < from; j++) {
6000         if (j >= sb_out) goto sb_end2;
6001         BITSET_SET_BIT(cc->bs, j);
6002       }
6003       prev = ONIGENC_CODE_RANGE_TO(mbr, i);
6004       if (prev > limit) prev = limit;
6005       prev++;
6006       if (prev == 0) goto end;
6007     }
6008     for (j = prev; j < sb_out; j++) {
6009       BITSET_SET_BIT(cc->bs, j);
6010     }
6011 
6012   sb_end2:
6013     prev = sb_out;
6014 
6015     for (i = 0; i < n; i++) {
6016       from = ONIGENC_CODE_RANGE_FROM(mbr, i);
6017       if (from > limit) goto last;
6018 
6019       if (prev < from) {
6020         r = add_code_range_to_buf(&(cc->mbuf), prev, from - 1);
6021         if (r != 0) return r;
6022       }
6023       prev = ONIGENC_CODE_RANGE_TO(mbr, i);
6024       if (prev > limit) prev = limit;
6025       prev++;
6026       if (prev == 0) goto end;
6027     }
6028 
6029   last:
6030     r = add_code_range_to_buf(&(cc->mbuf), prev, MAX_CODE_POINT);
6031     if (r != 0) return r;
6032   }
6033 
6034  end:
6035   return 0;
6036 }
6037 
6038 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)6039 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
6040 {
6041   int c, r;
6042   int ascii_mode;
6043   int is_single;
6044   const OnigCodePoint *ranges;
6045   OnigCodePoint limit;
6046   OnigCodePoint sb_out;
6047   OnigEncoding enc = env->enc;
6048 
6049   ascii_mode = OPTON_IS_ASCII_MODE_CTYPE(ctype, env->options);
6050 
6051   r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
6052   if (r == 0) {
6053     if (ascii_mode == 0)
6054       r = add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
6055     else
6056       r = add_ctype_to_cc_by_range_limit(cc, ctype, not, env->enc, sb_out,
6057                                          ranges, ASCII_LIMIT);
6058     return r;
6059   }
6060   else if (r != ONIG_NO_SUPPORT_CONFIG) {
6061     return r;
6062   }
6063 
6064   r = 0;
6065   is_single = ONIGENC_IS_SINGLEBYTE(enc);
6066   limit = ascii_mode ? ASCII_LIMIT : SINGLE_BYTE_SIZE;
6067 
6068   switch (ctype) {
6069   case ONIGENC_CTYPE_ALPHA:
6070   case ONIGENC_CTYPE_BLANK:
6071   case ONIGENC_CTYPE_CNTRL:
6072   case ONIGENC_CTYPE_DIGIT:
6073   case ONIGENC_CTYPE_LOWER:
6074   case ONIGENC_CTYPE_PUNCT:
6075   case ONIGENC_CTYPE_SPACE:
6076   case ONIGENC_CTYPE_UPPER:
6077   case ONIGENC_CTYPE_XDIGIT:
6078   case ONIGENC_CTYPE_ASCII:
6079   case ONIGENC_CTYPE_ALNUM:
6080     if (not != 0) {
6081       for (c = 0; c < (int )limit; c++) {
6082         if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) {
6083           if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
6084             BITSET_SET_BIT(cc->bs, c);
6085         }
6086       }
6087       for (c = limit; c < SINGLE_BYTE_SIZE; c++) {
6088         if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
6089           BITSET_SET_BIT(cc->bs, c);
6090       }
6091 
6092       if (is_single == 0)
6093         ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
6094     }
6095     else {
6096       for (c = 0; c < (int )limit; c++) {
6097         if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) {
6098           if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
6099             BITSET_SET_BIT(cc->bs, c);
6100         }
6101       }
6102     }
6103     break;
6104 
6105   case ONIGENC_CTYPE_GRAPH:
6106   case ONIGENC_CTYPE_PRINT:
6107   case ONIGENC_CTYPE_WORD:
6108     if (not != 0) {
6109       for (c = 0; c < (int )limit; c++) {
6110         /* check invalid code point */
6111         if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
6112             && ! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
6113           BITSET_SET_BIT(cc->bs, c);
6114       }
6115       for (c = limit; c < SINGLE_BYTE_SIZE; c++) {
6116         if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
6117           BITSET_SET_BIT(cc->bs, c);
6118       }
6119       if (ascii_mode != 0 && is_single == 0)
6120         ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
6121     }
6122     else {
6123       for (c = 0; c < (int )limit; c++) {
6124         if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
6125             && ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
6126           BITSET_SET_BIT(cc->bs, c);
6127       }
6128       if (ascii_mode == 0 && is_single == 0)
6129         ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
6130     }
6131     break;
6132 
6133   default:
6134     return ONIGERR_PARSER_BUG;
6135     break;
6136   }
6137 
6138   return r;
6139 }
6140 
6141 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)6142 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
6143 {
6144 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH  20
6145 #define POSIX_BRACKET_NAME_MIN_LEN         4
6146 
6147   static PosixBracketEntryType PBS[] = {
6148     { (UChar* )"alnum",  ONIGENC_CTYPE_ALNUM,  5 },
6149     { (UChar* )"alpha",  ONIGENC_CTYPE_ALPHA,  5 },
6150     { (UChar* )"blank",  ONIGENC_CTYPE_BLANK,  5 },
6151     { (UChar* )"cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
6152     { (UChar* )"digit",  ONIGENC_CTYPE_DIGIT,  5 },
6153     { (UChar* )"graph",  ONIGENC_CTYPE_GRAPH,  5 },
6154     { (UChar* )"lower",  ONIGENC_CTYPE_LOWER,  5 },
6155     { (UChar* )"print",  ONIGENC_CTYPE_PRINT,  5 },
6156     { (UChar* )"punct",  ONIGENC_CTYPE_PUNCT,  5 },
6157     { (UChar* )"space",  ONIGENC_CTYPE_SPACE,  5 },
6158     { (UChar* )"upper",  ONIGENC_CTYPE_UPPER,  5 },
6159     { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
6160     { (UChar* )"ascii",  ONIGENC_CTYPE_ASCII,  5 },
6161     { (UChar* )"word",   ONIGENC_CTYPE_WORD,   4 },
6162     { (UChar* )NULL,     -1, 0 }
6163   };
6164 
6165   PosixBracketEntryType *pb;
6166   int not, i, r;
6167   OnigCodePoint c;
6168   OnigEncoding enc = env->enc;
6169   UChar *p = *src;
6170 
6171   if (PPEEK_IS('^')) {
6172     PINC_S;
6173     not = 1;
6174   }
6175   else
6176     not = 0;
6177 
6178   if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
6179     goto not_posix_bracket;
6180 
6181   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
6182     if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
6183       p = (UChar* )onigenc_step(enc, p, end, pb->len);
6184       if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
6185         return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
6186 
6187       r = add_ctype_to_cc(cc, pb->ctype, not, env);
6188       if (r != 0) return r;
6189 
6190       PINC_S; PINC_S;
6191       *src = p;
6192       return 0;
6193     }
6194   }
6195 
6196  not_posix_bracket:
6197   c = 0;
6198   i = 0;
6199   while (!PEND && ((c = PPEEK) != ':') && c != ']') {
6200     PINC_S;
6201     if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
6202   }
6203   if (c == ':' && ! PEND) {
6204     PINC_S;
6205     if (! PEND) {
6206       PFETCH_S(c);
6207       if (c == ']')
6208         return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
6209     }
6210   }
6211 
6212   return 1;  /* 1: is not POSIX bracket, but no error. */
6213 }
6214 
6215 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)6216 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
6217 {
6218   int r;
6219   OnigCodePoint c;
6220   OnigEncoding enc;
6221   UChar *prev, *start, *p;
6222 
6223   p = *src;
6224   enc = env->enc;
6225   r = ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
6226   start = prev = p;
6227 
6228   while (!PEND) {
6229     prev = p;
6230     PFETCH_S(c);
6231     if (c == '}') {
6232       r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
6233       if (r >= 0) {
6234         *src = p;
6235       }
6236       else {
6237         onig_scan_env_set_error_string(env, r, *src, prev);
6238       }
6239 
6240       return r;
6241     }
6242     else if (c == '(' || c == ')' || c == '{' || c == '|') {
6243       break;
6244     }
6245   }
6246 
6247   return r;
6248 }
6249 
6250 static int
parse_char_property(Node ** np,PToken * tok,UChar ** src,UChar * end,ScanEnv * env)6251 parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)
6252 {
6253   int r, ctype;
6254   CClassNode* cc;
6255 
6256   ctype = fetch_char_property_to_ctype(src, end, env);
6257   if (ctype < 0) return ctype;
6258 
6259   *np = node_new_cclass();
6260   CHECK_NULL_RETURN_MEMERR(*np);
6261   cc = CCLASS_(*np);
6262   r = add_ctype_to_cc(cc, ctype, FALSE, env);
6263   if (r != 0) return r;
6264   if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
6265 
6266   return 0;
6267 }
6268 
6269 
/* State of the character class parser between two class members. */
typedef enum {
  CS_VALUE    = 0,  /* a single value is pending and not yet stored */
  CS_RANGE    = 1,  /* a range start is pending; the next value ends it */
  CS_COMPLETE = 2,  /* a range has just been completed */
  CS_START    = 3   /* initial state: nothing has been read yet */
} CSTATE;
6276 
/* Kind of the value most recently read inside a character class. */
typedef enum {
  CV_UNDEF = 0,  /* no value yet */
  CV_SB    = 1,  /* code point whose encoded length is one byte */
  CV_MB    = 2,  /* code point whose encoded length is not one byte */
  CV_CPROP = 3   /* a ctype / property was added (see cc_cprop_next) */
} CVAL;
6283 
6284 static int
cc_cprop_next(CClassNode * cc,OnigCodePoint * pcode,CVAL * val,CSTATE * state,ScanEnv * env)6285 cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state,
6286               ScanEnv* env)
6287 {
6288   int r;
6289 
6290   if (*state == CS_RANGE)
6291     return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
6292 
6293   if (*state == CS_VALUE) {
6294     if (*val == CV_SB)
6295       BITSET_SET_BIT(cc->bs, (int )(*pcode));
6296     else if (*val == CV_MB) {
6297       r = add_code_range(&(cc->mbuf), env, *pcode, *pcode);
6298       if (r < 0) return r;
6299     }
6300   }
6301 
6302   *state = CS_VALUE;
6303   *val   = CV_CPROP;
6304   return 0;
6305 }
6306 
6307 static int
cc_char_next(CClassNode * cc,OnigCodePoint * from,OnigCodePoint to,int * from_raw,int to_raw,CVAL intype,CVAL * type,CSTATE * state,ScanEnv * env)6308 cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to,
6309              int* from_raw, int to_raw, CVAL intype, CVAL* type,
6310              CSTATE* state, ScanEnv* env)
6311 {
6312   int r;
6313 
6314   switch (*state) {
6315   case CS_VALUE:
6316     if (*type == CV_SB) {
6317       if (*from > 0xff)
6318           return ONIGERR_INVALID_CODE_POINT_VALUE;
6319 
6320       BITSET_SET_BIT(cc->bs, (int )(*from));
6321     }
6322     else if (*type == CV_MB) {
6323       r = add_code_range(&(cc->mbuf), env, *from, *from);
6324       if (r < 0) return r;
6325     }
6326     break;
6327 
6328   case CS_RANGE:
6329     if (intype == *type) {
6330       if (intype == CV_SB) {
6331         if (*from > 0xff || to > 0xff)
6332           return ONIGERR_INVALID_CODE_POINT_VALUE;
6333 
6334         if (*from > to) {
6335           if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
6336             goto ccs_range_end;
6337           else
6338             return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
6339         }
6340         bitset_set_range(cc->bs, (int )*from, (int )to);
6341       }
6342       else {
6343         r = add_code_range(&(cc->mbuf), env, *from, to);
6344         if (r < 0) return r;
6345       }
6346     }
6347     else {
6348       if (*from > to) {
6349         if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
6350           goto ccs_range_end;
6351         else
6352           return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
6353       }
6354       bitset_set_range(cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff));
6355       r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*from, to);
6356       if (r < 0) return r;
6357     }
6358   ccs_range_end:
6359     *state = CS_COMPLETE;
6360     break;
6361 
6362   case CS_COMPLETE:
6363   case CS_START:
6364     *state = CS_VALUE;
6365     break;
6366 
6367   default:
6368     break;
6369   }
6370 
6371   *from_raw = to_raw;
6372   *from     = to;
6373   *type     = intype;
6374   return 0;
6375 }
6376 
6377 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)6378 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
6379                  ScanEnv* env)
6380 {
6381   int in_esc;
6382   OnigCodePoint code;
6383   OnigEncoding enc = env->enc;
6384   UChar* p = from;
6385 
6386   in_esc = 0;
6387   while (! PEND) {
6388     if (ignore_escaped && in_esc) {
6389       in_esc = 0;
6390     }
6391     else {
6392       PFETCH_S(code);
6393       if (code == c) return 1;
6394       if (code == MC_ESC(env->syntax)) in_esc = 1;
6395     }
6396   }
6397   return 0;
6398 }
6399 
6400 static int
parse_cc(Node ** np,PToken * tok,UChar ** src,UChar * end,ScanEnv * env)6401 parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)
6402 {
6403   int r, neg, len, fetched, and_start;
6404   OnigCodePoint in_code, curr_code;
6405   UChar *p;
6406   Node* node;
6407   CClassNode *cc, *prev_cc;
6408   CClassNode work_cc;
6409   int curr_raw, in_raw;
6410   CSTATE state;
6411   CVAL in_type;
6412   CVAL curr_type;
6413 
6414   *np = NULL_NODE;
6415   INC_PARSE_DEPTH(env->parse_depth);
6416 
6417   prev_cc = (CClassNode* )NULL;
6418   r = fetch_token_in_cc(tok, src, end, env);
6419   if (r == TK_CHAR && tok->u.code == (OnigCodePoint )'^' && tok->escaped == 0) {
6420     neg = 1;
6421     r = fetch_token_in_cc(tok, src, end, env);
6422   }
6423   else {
6424     neg = 0;
6425   }
6426 
6427   if (r < 0) return r;
6428   if (r == TK_CC_CLOSE) {
6429     if (! code_exist_check((OnigCodePoint )']',
6430                            *src, env->pattern_end, 1, env))
6431       return ONIGERR_EMPTY_CHAR_CLASS;
6432 
6433     CC_ESC_WARN(env, (UChar* )"]");
6434     r = tok->type = TK_CHAR;  /* allow []...] */
6435   }
6436 
6437   *np = node = node_new_cclass();
6438   CHECK_NULL_RETURN_MEMERR(node);
6439   cc = CCLASS_(node);
6440 
6441   and_start = 0;
6442   state = CS_START;
6443   curr_type = CV_UNDEF;
6444 
6445   p = *src;
6446   while (r != TK_CC_CLOSE) {
6447     fetched = 0;
6448     switch (r) {
6449     case TK_CHAR:
6450     any_char_in:
6451       len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code);
6452       if (len < 0) {
6453         r = len;
6454         goto err;
6455       }
6456       in_type = (len == 1) ? CV_SB : CV_MB;
6457       in_code = tok->u.code;
6458       in_raw = 0;
6459       goto val_entry2;
6460       break;
6461 
6462     case TK_CRUDE_BYTE:
6463       /* tok->base != 0 : octal or hexadec. */
6464       if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
6465         int i, j;
6466         UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
6467         UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
6468         UChar* psave = p;
6469         int base = tok->base;
6470 
6471         buf[0] = tok->u.byte;
6472         for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
6473           r = fetch_token_in_cc(tok, &p, end, env);
6474           if (r < 0) goto err;
6475           if (r != TK_CRUDE_BYTE || tok->base != base) {
6476             fetched = 1;
6477             break;
6478           }
6479           buf[i] = tok->u.byte;
6480         }
6481 
6482         if (i < ONIGENC_MBC_MINLEN(env->enc)) {
6483           r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
6484           goto err;
6485         }
6486 
6487         /* clear buf tail */
6488         for (j = i; j < ONIGENC_CODE_TO_MBC_MAXLEN; j++) buf[j] = '\0';
6489 
6490         len = enclen(env->enc, buf);
6491         if (i < len) {
6492           r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
6493           goto err;
6494         }
6495         else if (i > len) { /* fetch back */
6496           p = psave;
6497           for (i = 1; i < len; i++) {
6498             r = fetch_token_in_cc(tok, &p, end, env);
6499           }
6500           fetched = 0;
6501         }
6502 
6503         if (i == 1) {
6504           in_code = (OnigCodePoint )buf[0];
6505           goto crude_single;
6506         }
6507         else {
6508           in_code = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
6509           in_type = CV_MB;
6510         }
6511       }
6512       else {
6513         in_code = (OnigCodePoint )tok->u.byte;
6514       crude_single:
6515         in_type = CV_SB;
6516       }
6517       in_raw = 1;
6518       goto val_entry2;
6519       break;
6520 
6521     case TK_CODE_POINT:
6522       in_code = tok->u.code;
6523       in_raw  = 1;
6524     val_entry:
6525       len = ONIGENC_CODE_TO_MBCLEN(env->enc, in_code);
6526       if (len < 0) {
6527         if (state != CS_RANGE ||
6528             ! IS_SYNTAX_BV(env->syntax,
6529                            ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) ||
6530             in_code < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) {
6531           r = len;
6532           goto err;
6533         }
6534       }
6535       in_type = (len == 1 ? CV_SB : CV_MB);
6536     val_entry2:
6537       r = cc_char_next(cc, &curr_code, in_code, &curr_raw, in_raw, in_type,
6538                        &curr_type, &state, env);
6539       if (r != 0) goto err;
6540       break;
6541 
6542     case TK_CC_POSIX_BRACKET_OPEN:
6543       r = parse_posix_bracket(cc, &p, end, env);
6544       if (r < 0) goto err;
6545       if (r == 1) {  /* is not POSIX bracket */
6546         CC_ESC_WARN(env, (UChar* )"[");
6547         p = tok->backp;
6548         in_code = tok->u.code;
6549         in_raw = 0;
6550         goto val_entry;
6551       }
6552       goto next_cprop;
6553       break;
6554 
6555     case TK_CHAR_TYPE:
6556       r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
6557       if (r != 0) goto err;
6558 
6559     next_cprop:
6560       r = cc_cprop_next(cc, &curr_code, &curr_type, &state, env);
6561       if (r != 0) goto err;
6562       break;
6563 
6564     case TK_CHAR_PROPERTY:
6565       {
6566         int ctype = fetch_char_property_to_ctype(&p, end, env);
6567         if (ctype < 0) {
6568           r = ctype;
6569           goto err;
6570         }
6571         r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
6572         if (r != 0) goto err;
6573         goto next_cprop;
6574       }
6575       break;
6576 
6577     case TK_CC_RANGE:
6578       if (state == CS_VALUE) {
6579         r = fetch_token_in_cc(tok, &p, end, env);
6580         if (r < 0) goto err;
6581 
6582         fetched = 1;
6583         if (r == TK_CC_CLOSE) { /* allow [x-] */
6584         range_end_val:
6585           in_code = (OnigCodePoint )'-';
6586           in_raw = 0;
6587           goto val_entry;
6588         }
6589         else if (r == TK_CC_AND) {
6590           CC_ESC_WARN(env, (UChar* )"-");
6591           goto range_end_val;
6592         }
6593 
6594         if (curr_type == CV_CPROP) {
6595           r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
6596           goto err;
6597         }
6598 
6599         state = CS_RANGE;
6600       }
6601       else if (state == CS_START) {
6602         /* [-xa] is allowed */
6603         in_code = tok->u.code;
6604         in_raw = 0;
6605 
6606         r = fetch_token_in_cc(tok, &p, end, env);
6607         if (r < 0) goto err;
6608 
6609         fetched = 1;
6610         /* [--x] or [a&&-x] is warned. */
6611         if (r == TK_CC_RANGE || and_start != 0)
6612           CC_ESC_WARN(env, (UChar* )"-");
6613 
6614         goto val_entry;
6615       }
6616       else if (state == CS_RANGE) {
6617         CC_ESC_WARN(env, (UChar* )"-");
6618         goto any_char_in;  /* [!--] is allowed */
6619       }
6620       else { /* CS_COMPLETE */
6621         r = fetch_token_in_cc(tok, &p, end, env);
6622         if (r < 0) goto err;
6623 
6624         fetched = 1;
6625         if (r == TK_CC_CLOSE)
6626           goto range_end_val; /* allow [a-b-] */
6627         else if (r == TK_CC_AND) {
6628           CC_ESC_WARN(env, (UChar* )"-");
6629           goto range_end_val;
6630         }
6631 
6632         if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
6633           CC_ESC_WARN(env, (UChar* )"-");
6634           goto range_end_val;   /* [0-9-a] is allowed as [0-9\-a] */
6635         }
6636         r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
6637         goto err;
6638       }
6639       break;
6640 
6641     case TK_CC_OPEN_CC: /* [ */
6642       {
6643         Node *anode;
6644         CClassNode* acc;
6645 
6646         if (state == CS_VALUE) {
6647           r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type,
6648                            &state, env);
6649           if (r != 0) goto err;
6650         }
6651         state = CS_COMPLETE;
6652 
6653         r = parse_cc(&anode, tok, &p, end, env);
6654         if (r != 0) {
6655           onig_node_free(anode);
6656           goto cc_open_err;
6657         }
6658         acc = CCLASS_(anode);
6659         r = or_cclass(cc, acc, env->enc);
6660         onig_node_free(anode);
6661 
6662       cc_open_err:
6663         if (r != 0) goto err;
6664       }
6665       break;
6666 
6667     case TK_CC_AND: /* && */
6668       {
6669         if (state == CS_VALUE) {
6670           r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type,
6671                            &state, env);
6672           if (r != 0) goto err;
6673         }
6674         /* initialize local variables */
6675         and_start = 1;
6676         state = CS_START;
6677 
6678         if (IS_NOT_NULL(prev_cc)) {
6679           r = and_cclass(prev_cc, cc, env->enc);
6680           if (r != 0) goto err;
6681           bbuf_free(cc->mbuf);
6682         }
6683         else {
6684           prev_cc = cc;
6685           cc = &work_cc;
6686         }
6687         initialize_cclass(cc);
6688       }
6689       break;
6690 
6691     case TK_EOT:
6692       r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
6693       goto err;
6694       break;
6695     default:
6696       r = ONIGERR_PARSER_BUG;
6697       goto err;
6698       break;
6699     }
6700 
6701     if (fetched)
6702       r = tok->type;
6703     else {
6704       r = fetch_token_in_cc(tok, &p, end, env);
6705       if (r < 0) goto err;
6706     }
6707   }
6708 
6709   if (state == CS_VALUE) {
6710     r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type,
6711                      &state, env);
6712     if (r != 0) goto err;
6713   }
6714 
6715   if (IS_NOT_NULL(prev_cc)) {
6716     r = and_cclass(prev_cc, cc, env->enc);
6717     if (r != 0) goto err;
6718     bbuf_free(cc->mbuf);
6719     cc = prev_cc;
6720   }
6721 
6722   if (neg != 0)
6723     NCCLASS_SET_NOT(cc);
6724   else
6725     NCCLASS_CLEAR_NOT(cc);
6726   if (IS_NCCLASS_NOT(cc) &&
6727       IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
6728     int is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
6729     if (is_empty != 0)
6730       BITSET_IS_EMPTY(cc->bs, is_empty);
6731 
6732     if (is_empty == 0) {
6733       if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
6734         if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
6735           BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
6736         else
6737           add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
6738       }
6739     }
6740   }
6741   *src = p;
6742   DEC_PARSE_DEPTH(env->parse_depth);
6743   return 0;
6744 
6745  err:
6746   if (cc != CCLASS_(*np))
6747     bbuf_free(cc->mbuf);
6748   return r;
6749 }
6750 
/* Forward declaration so that parse_bag() below can call parse_alts(). */
static int parse_alts(Node** top, PToken* tok, int term,
                      UChar** src, UChar* end, ScanEnv* env, int group_head);
6753 
6754 #ifdef USE_CALLOUT
6755 
6756 /* (?{...}[tag][+-]) (?{{...}}[tag][+-]) */
6757 static int
parse_callout_of_contents(Node ** np,int cterm,UChar ** src,UChar * end,ScanEnv * env)6758 parse_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env)
6759 {
6760   int r;
6761   int i;
6762   int in;
6763   int num;
6764   OnigCodePoint c;
6765   UChar* code_start;
6766   UChar* code_end;
6767   UChar* contents;
6768   UChar* tag_start;
6769   UChar* tag_end;
6770   int brace_nest;
6771   CalloutListEntry* e;
6772   RegexExt* ext;
6773   OnigEncoding enc = env->enc;
6774   UChar* p = *src;
6775 
6776   if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6777 
6778   brace_nest = 0;
6779   while (PPEEK_IS('{')) {
6780     brace_nest++;
6781     PINC_S;
6782     if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6783   }
6784 
6785   in = ONIG_CALLOUT_IN_PROGRESS;
6786   code_start = p;
6787   while (1) {
6788     if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6789 
6790     code_end = p;
6791     PFETCH_S(c);
6792     if (c == '}') {
6793       i = brace_nest;
6794       while (i > 0) {
6795         if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6796         PFETCH_S(c);
6797         if (c == '}') i--;
6798         else break;
6799       }
6800       if (i == 0) break;
6801     }
6802   }
6803 
6804   if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6805 
6806   PFETCH_S(c);
6807   if (c == '[') {
6808     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6809     tag_end = tag_start = p;
6810     while (! PEND) {
6811       if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6812       tag_end = p;
6813       PFETCH_S(c);
6814       if (c == ']') break;
6815     }
6816     if (! is_allowed_callout_tag_name(enc, tag_start, tag_end))
6817       return ONIGERR_INVALID_CALLOUT_TAG_NAME;
6818 
6819     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6820     PFETCH_S(c);
6821   }
6822   else {
6823     tag_start = tag_end = 0;
6824   }
6825 
6826   if (c == 'X') {
6827     in |= ONIG_CALLOUT_IN_RETRACTION;
6828     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6829     PFETCH_S(c);
6830   }
6831   else if (c == '<') {
6832     in = ONIG_CALLOUT_IN_RETRACTION;
6833     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6834     PFETCH_S(c);
6835   }
6836   else if (c == '>') { /* no needs (default) */
6837     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
6838     PFETCH_S(c);
6839   }
6840 
6841   if (c != cterm)
6842     return ONIGERR_INVALID_CALLOUT_PATTERN;
6843 
6844   r = reg_callout_list_entry(env, &num);
6845   if (r != 0) return r;
6846 
6847   ext = onig_get_regex_ext(env->reg);
6848   CHECK_NULL_RETURN_MEMERR(ext);
6849   if (IS_NULL(ext->pattern)) {
6850     r = onig_ext_set_pattern(env->reg, env->pattern, env->pattern_end);
6851     if (r != ONIG_NORMAL) return r;
6852   }
6853 
6854   if (tag_start != tag_end) {
6855     r = callout_tag_entry(env, env->reg, tag_start, tag_end, num);
6856     if (r != ONIG_NORMAL) return r;
6857   }
6858 
6859   contents = onigenc_strdup(enc, code_start, code_end);
6860   CHECK_NULL_RETURN_MEMERR(contents);
6861 
6862   r = node_new_callout(np, ONIG_CALLOUT_OF_CONTENTS, num, ONIG_NON_NAME_ID, env);
6863   if (r != 0) {
6864     xfree(contents);
6865     return r;
6866   }
6867 
6868   e = onig_reg_callout_list_at(env->reg, num);
6869   if (IS_NULL(e)) {
6870     xfree(contents);
6871     return ONIGERR_MEMORY;
6872   }
6873 
6874   e->of      = ONIG_CALLOUT_OF_CONTENTS;
6875   e->in      = in;
6876   e->name_id = ONIG_NON_NAME_ID;
6877   e->u.content.start = contents;
6878   e->u.content.end   = contents + (code_end - code_start);
6879 
6880   *src = p;
6881   return 0;
6882 }
6883 
6884 static long
parse_long(OnigEncoding enc,UChar * s,UChar * end,int sign_on,long max,long * rl)6885 parse_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* rl)
6886 {
6887   long v;
6888   long d;
6889   int flag;
6890   UChar* p;
6891   OnigCodePoint c;
6892 
6893   if (s >= end) return ONIGERR_INVALID_CALLOUT_ARG;
6894 
6895   flag = 1;
6896   v = 0;
6897   p = s;
6898   while (p < end) {
6899     c = ONIGENC_MBC_TO_CODE(enc, p, end);
6900     p += ONIGENC_MBC_ENC_LEN(enc, p);
6901     if (c >= '0' && c <= '9') {
6902       d = (long )(c - '0');
6903       if (v > (max - d) / 10)
6904         return ONIGERR_INVALID_CALLOUT_ARG;
6905 
6906       v = v * 10 + d;
6907     }
6908     else if (sign_on != 0 && (c == '-' || c == '+')) {
6909       if (c == '-') flag = -1;
6910     }
6911     else
6912       return ONIGERR_INVALID_CALLOUT_ARG;
6913 
6914     sign_on = 0;
6915   }
6916 
6917   *rl = flag * v;
6918   return ONIG_NORMAL;
6919 }
6920 
6921 static int
parse_callout_args(int skip_mode,int cterm,UChar ** src,UChar * end,int max_arg_num,unsigned int types[],OnigValue vals[],ScanEnv * env)6922 parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end,
6923                    int max_arg_num, unsigned int types[], OnigValue vals[],
6924                    ScanEnv* env)
6925 {
6926 #define MAX_CALLOUT_ARG_BYTE_LENGTH   128
6927 
6928   int r;
6929   int n;
6930   int esc;
6931   int cn;
6932   UChar* s;
6933   UChar* e;
6934   UChar* eesc;
6935   OnigCodePoint c;
6936   UChar* bufend;
6937   UChar buf[MAX_CALLOUT_ARG_BYTE_LENGTH];
6938   OnigEncoding enc = env->enc;
6939   UChar* p = *src;
6940 
6941   if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6942 
6943   c = 0;
6944   n = 0;
6945   while (n < ONIG_CALLOUT_MAX_ARGS_NUM) {
6946     cn  = 0;
6947     esc = 0;
6948     eesc = 0;
6949     bufend = buf;
6950     s = e = p;
6951     while (1) {
6952       if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
6953 
6954       e = p;
6955       PFETCH_S(c);
6956       if (esc != 0) {
6957         esc = 0;
6958         if (c == '\\' || c == cterm || c == ',') {
6959           /* */
6960         }
6961         else {
6962           e = eesc;
6963           cn++;
6964         }
6965         goto add_char;
6966       }
6967       else {
6968         if (c == '\\') {
6969           esc = 1;
6970           eesc = e;
6971         }
6972         else if (c == cterm || c == ',')
6973           break;
6974         else {
6975           size_t clen;
6976 
6977         add_char:
6978           if (skip_mode == FALSE) {
6979             clen = p - e;
6980             if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH)
6981               return ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */
6982 
6983             xmemcpy(bufend, e, clen);
6984             bufend += clen;
6985           }
6986           cn++;
6987         }
6988       }
6989     }
6990 
6991     if (cn != 0) {
6992       if (max_arg_num >= 0 && n >= max_arg_num)
6993         return ONIGERR_INVALID_CALLOUT_ARG;
6994 
6995       if (skip_mode == FALSE) {
6996         if ((types[n] & ONIG_TYPE_LONG) != 0) {
6997           int fixed = 0;
6998           if (cn > 0) {
6999             long rl;
7000             r = parse_long(enc, buf, bufend, 1, LONG_MAX, &rl);
7001             if (r == ONIG_NORMAL) {
7002               vals[n].l = rl;
7003               fixed = 1;
7004               types[n] = ONIG_TYPE_LONG;
7005             }
7006           }
7007 
7008           if (fixed == 0) {
7009             types[n] = (types[n] & ~ONIG_TYPE_LONG);
7010             if (types[n] == ONIG_TYPE_VOID)
7011               return ONIGERR_INVALID_CALLOUT_ARG;
7012           }
7013         }
7014 
7015         switch (types[n]) {
7016         case ONIG_TYPE_LONG:
7017           break;
7018 
7019         case ONIG_TYPE_CHAR:
7020           if (cn != 1) return ONIGERR_INVALID_CALLOUT_ARG;
7021           vals[n].c = ONIGENC_MBC_TO_CODE(enc, buf, bufend);
7022           break;
7023 
7024         case ONIG_TYPE_STRING:
7025           {
7026             UChar* rs = onigenc_strdup(enc, buf, bufend);
7027             CHECK_NULL_RETURN_MEMERR(rs);
7028             vals[n].s.start = rs;
7029             vals[n].s.end   = rs + (e - s);
7030           }
7031           break;
7032 
7033         case ONIG_TYPE_TAG:
7034           if (eesc != 0 || ! is_allowed_callout_tag_name(enc, s, e))
7035             return ONIGERR_INVALID_CALLOUT_TAG_NAME;
7036 
7037           vals[n].s.start = s;
7038           vals[n].s.end   = e;
7039           break;
7040 
7041         case ONIG_TYPE_VOID:
7042         case ONIG_TYPE_POINTER:
7043           return ONIGERR_PARSER_BUG;
7044           break;
7045         }
7046       }
7047 
7048       n++;
7049     }
7050 
7051     if (c == cterm) break;
7052   }
7053 
7054   if (c != cterm) return ONIGERR_INVALID_CALLOUT_PATTERN;
7055 
7056   *src = p;
7057   return n;
7058 }
7059 
7060 /* (*name[TAG]) (*name[TAG]{a,b,..}) */
7061 static int
parse_callout_of_name(Node ** np,int cterm,UChar ** src,UChar * end,ScanEnv * env)7062 parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env)
7063 {
7064   int r;
7065   int i;
7066   int in;
7067   int num;
7068   int name_id;
7069   int arg_num;
7070   int max_arg_num;
7071   int opt_arg_num;
7072   int is_not_single;
7073   OnigCodePoint c;
7074   UChar* name_start;
7075   UChar* name_end;
7076   UChar* tag_start;
7077   UChar* tag_end;
7078   Node*  node;
7079   CalloutListEntry* e;
7080   RegexExt* ext;
7081   unsigned int types[ONIG_CALLOUT_MAX_ARGS_NUM];
7082   OnigValue    vals[ONIG_CALLOUT_MAX_ARGS_NUM];
7083   OnigEncoding enc = env->enc;
7084   UChar* p = *src;
7085 
7086   /* PFETCH_READY; */
7087   if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7088 
7089   node = 0;
7090   name_start = p;
7091   while (1) {
7092     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7093     name_end = p;
7094     PFETCH_S(c);
7095     if (c == cterm || c == '[' || c == '{') break;
7096   }
7097 
7098   if (! is_allowed_callout_name(enc, name_start, name_end))
7099     return ONIGERR_INVALID_CALLOUT_NAME;
7100 
7101   if (c == '[') {
7102     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7103     tag_end = tag_start = p;
7104     while (! PEND) {
7105       if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7106       tag_end = p;
7107       PFETCH_S(c);
7108       if (c == ']') break;
7109     }
7110     if (! is_allowed_callout_tag_name(enc, tag_start, tag_end))
7111       return ONIGERR_INVALID_CALLOUT_TAG_NAME;
7112 
7113     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7114     PFETCH_S(c);
7115   }
7116   else {
7117     tag_start = tag_end = 0;
7118   }
7119 
7120   if (c == '{') {
7121     UChar* save;
7122 
7123     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7124 
7125     /* read for single check only */
7126     save = p;
7127     arg_num = parse_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env);
7128     if (arg_num < 0) return arg_num;
7129 
7130     is_not_single = PPEEK_IS(cterm) ?  0 : 1;
7131     p = save;
7132     r = get_callout_name_id_by_name(enc, is_not_single, name_start, name_end,
7133                                     &name_id);
7134     if (r != ONIG_NORMAL) return r;
7135 
7136     max_arg_num = get_callout_arg_num_by_name_id(name_id);
7137     for (i = 0; i < max_arg_num; i++) {
7138       types[i] = get_callout_arg_type_by_name_id(name_id, i);
7139     }
7140 
7141     arg_num = parse_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env);
7142     if (arg_num < 0) return arg_num;
7143 
7144     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7145     PFETCH_S(c);
7146   }
7147   else {
7148     arg_num = 0;
7149 
7150     is_not_single = 0;
7151     r = get_callout_name_id_by_name(enc, is_not_single, name_start, name_end,
7152                                       &name_id);
7153     if (r != ONIG_NORMAL) return r;
7154 
7155     max_arg_num = get_callout_arg_num_by_name_id(name_id);
7156     for (i = 0; i < max_arg_num; i++) {
7157       types[i] = get_callout_arg_type_by_name_id(name_id, i);
7158     }
7159   }
7160 
7161   in = onig_get_callout_in_by_name_id(name_id);
7162   opt_arg_num = get_callout_opt_arg_num_by_name_id(name_id);
7163   if (arg_num > max_arg_num || arg_num < (max_arg_num - opt_arg_num))
7164     return ONIGERR_INVALID_CALLOUT_ARG;
7165 
7166   if (c != cterm)
7167     return ONIGERR_INVALID_CALLOUT_PATTERN;
7168 
7169   r = reg_callout_list_entry(env, &num);
7170   if (r != 0) return r;
7171 
7172   ext = onig_get_regex_ext(env->reg);
7173   CHECK_NULL_RETURN_MEMERR(ext);
7174   if (IS_NULL(ext->pattern)) {
7175     r = onig_ext_set_pattern(env->reg, env->pattern, env->pattern_end);
7176     if (r != ONIG_NORMAL) return r;
7177   }
7178 
7179   if (tag_start != tag_end) {
7180     r = callout_tag_entry(env, env->reg, tag_start, tag_end, num);
7181     if (r != ONIG_NORMAL) return r;
7182   }
7183 
7184   r = node_new_callout(&node, ONIG_CALLOUT_OF_NAME, num, name_id, env);
7185   if (r != ONIG_NORMAL) return r;
7186 
7187   e = onig_reg_callout_list_at(env->reg, num);
7188   CHECK_NULL_RETURN_MEMERR(e);
7189 
7190   e->of         = ONIG_CALLOUT_OF_NAME;
7191   e->in         = in;
7192   e->name_id    = name_id;
7193   e->type       = onig_get_callout_type_by_name_id(name_id);
7194   e->start_func = onig_get_callout_start_func_by_name_id(name_id);
7195   e->end_func   = onig_get_callout_end_func_by_name_id(name_id);
7196   e->u.arg.num        = max_arg_num;
7197   e->u.arg.passed_num = arg_num;
7198   for (i = 0; i < max_arg_num; i++) {
7199     e->u.arg.types[i] = types[i];
7200     if (i < arg_num)
7201       e->u.arg.vals[i] = vals[i];
7202     else
7203       e->u.arg.vals[i] = get_callout_opt_default_by_name_id(name_id, i);
7204   }
7205 
7206   *np = node;
7207   *src = p;
7208   return 0;
7209 }
7210 #endif
7211 
7212 static int
parse_bag(Node ** np,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)7213 parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
7214           ScanEnv* env)
7215 {
7216   int r, num;
7217   Node *target;
7218   OnigOptionType option;
7219   OnigCodePoint c;
7220   int list_capture;
7221   OnigEncoding enc = env->enc;
7222 
7223   UChar* p = *src;
7224   PFETCH_READY;
7225 
7226   *np = NULL;
7227   if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
7228 
7229   option = env->options;
7230   c = PPEEK;
7231   if (c == '?' && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
7232     PINC;
7233     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7234 
7235     PFETCH(c);
7236     switch (c) {
7237     case ':':   /* (?:...) grouping only */
7238     group:
7239       r = fetch_token(tok, &p, end, env);
7240       if (r < 0) return r;
7241       r = parse_alts(np, tok, term, &p, end, env, FALSE);
7242       if (r < 0) return r;
7243       *src = p;
7244       return 1; /* group */
7245       break;
7246 
7247     case '=':
7248       *np = node_new_anchor(ANCR_PREC_READ);
7249       break;
7250     case '!':  /*         preceding read */
7251       *np = node_new_anchor(ANCR_PREC_READ_NOT);
7252       break;
7253     case '>':            /* (?>...) stop backtrack */
7254       *np = node_new_bag(BAG_STOP_BACKTRACK);
7255       break;
7256 
7257     case '\'':
7258       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
7259         goto named_group1;
7260       }
7261       else
7262         return ONIGERR_UNDEFINED_GROUP_OPTION;
7263       break;
7264 
7265     case '<':   /* look behind (?<=...), (?<!...) */
7266       if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
7267       PFETCH(c);
7268       if (c == '=')
7269         *np = node_new_anchor(ANCR_LOOK_BEHIND);
7270       else if (c == '!')
7271         *np = node_new_anchor(ANCR_LOOK_BEHIND_NOT);
7272       else {
7273         if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
7274           UChar *name;
7275           UChar *name_end;
7276           enum REF_NUM num_type;
7277 
7278           PUNFETCH;
7279           c = '<';
7280 
7281         named_group1:
7282           list_capture = 0;
7283 
7284 #ifdef USE_CAPTURE_HISTORY
7285         named_group2:
7286 #endif
7287           name = p;
7288           r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num,
7289                          &num_type, FALSE);
7290           if (r < 0) return r;
7291 
7292           num = scan_env_add_mem_entry(env);
7293           if (num < 0) return num;
7294           if (list_capture != 0 && num >= (int )MEM_STATUS_BITS_NUM)
7295             return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
7296 
7297           r = name_add(env->reg, name, name_end, num, env);
7298           if (r != 0) return r;
7299           *np = node_new_memory(1);
7300           CHECK_NULL_RETURN_MEMERR(*np);
7301           BAG_(*np)->m.regnum = num;
7302           if (list_capture != 0)
7303             MEM_STATUS_ON_SIMPLE(env->cap_history, num);
7304           env->num_named++;
7305         }
7306         else {
7307           return ONIGERR_UNDEFINED_GROUP_OPTION;
7308         }
7309       }
7310       break;
7311 
7312     case '~':
7313       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP)) {
7314         Node* absent;
7315         Node* expr;
7316         int head_bar;
7317         int is_range_cutter;
7318 
7319         if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7320 
7321         if (PPEEK_IS('|')) { /* (?~|generator|absent) */
7322           PINC;
7323           if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7324 
7325           head_bar = 1;
7326           if (PPEEK_IS(')')) { /* (?~|)  : range clear */
7327             PINC;
7328             r = make_range_clear(np, env);
7329             if (r != 0) return r;
7330             goto end;
7331           }
7332         }
7333         else
7334           head_bar = 0;
7335 
7336         r = fetch_token(tok, &p, end, env);
7337         if (r < 0) return r;
7338         r = parse_alts(&absent, tok, term, &p, end, env, TRUE);
7339         if (r < 0) {
7340           onig_node_free(absent);
7341           return r;
7342         }
7343 
7344         expr = NULL_NODE;
7345         is_range_cutter = 0;
7346         if (head_bar != 0) {
7347           Node* top = absent;
7348           if (NODE_TYPE(top) != NODE_ALT || IS_NULL(NODE_CDR(top))) {
7349             expr = NULL_NODE;
7350             is_range_cutter = 1;
7351             /* return ONIGERR_INVALID_ABSENT_GROUP_GENERATOR_PATTERN; */
7352           }
7353           else {
7354             absent = NODE_CAR(top);
7355             expr   = NODE_CDR(top);
7356             NODE_CAR(top) = NULL_NODE;
7357             NODE_CDR(top) = NULL_NODE;
7358             onig_node_free(top);
7359             if (IS_NULL(NODE_CDR(expr))) {
7360               top = expr;
7361               expr = NODE_CAR(top);
7362               NODE_CAR(top) = NULL_NODE;
7363               onig_node_free(top);
7364             }
7365           }
7366         }
7367 
7368         r = make_absent_tree(np, absent, expr, is_range_cutter, env);
7369         if (r != 0) {
7370           return r;
7371         }
7372         goto end;
7373       }
7374       else {
7375         return ONIGERR_UNDEFINED_GROUP_OPTION;
7376       }
7377       break;
7378 
7379 #ifdef USE_CALLOUT
7380     case '{':
7381       if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS))
7382         return ONIGERR_UNDEFINED_GROUP_OPTION;
7383 
7384       r = parse_callout_of_contents(np, ')', &p, end, env);
7385       if (r != 0) return r;
7386 
7387       goto end;
7388       break;
7389 #endif
7390 
7391     case '(':
7392       /* (?()...) */
7393       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE)) {
7394         UChar *prev;
7395         Node* condition;
7396         int condition_is_checker;
7397 
7398         if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7399         PFETCH(c);
7400         if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7401 
7402         if (IS_CODE_DIGIT_ASCII(enc, c)
7403             || c == '-' || c == '+' || c == '<' || c == '\'') {
7404           UChar* name_end;
7405           int back_num;
7406           int exist_level;
7407           int level;
7408           enum REF_NUM num_type;
7409           int is_enclosed;
7410 
7411           is_enclosed = (c == '<' || c == '\'') ? 1 : 0;
7412           if (! is_enclosed)
7413             PUNFETCH;
7414           prev = p;
7415           exist_level = 0;
7416 #ifdef USE_BACKREF_WITH_LEVEL
7417           name_end = NULL_UCHARP; /* no need. escape gcc warning. */
7418           r = fetch_name_with_level(
7419                     (OnigCodePoint )(is_enclosed != 0 ? c : '('),
7420                     &p, end, &name_end,
7421                     env, &back_num, &level, &num_type);
7422           if (r == 1) exist_level = 1;
7423 #else
7424           r = fetch_name((OnigCodePoint )(is_enclosed != 0 ? c : '('),
7425                          &p, end, &name_end, env, &back_num, &num_type, TRUE);
7426 #endif
7427           if (r < 0) {
7428             if (is_enclosed == 0) {
7429               goto any_condition;
7430             }
7431             else
7432               return r;
7433           }
7434 
7435           condition_is_checker = 1;
7436           if (num_type != IS_NOT_NUM) {
7437             if (num_type == IS_REL_NUM) {
7438               back_num = backref_rel_to_abs(back_num, env);
7439             }
7440             if (back_num <= 0)
7441               return ONIGERR_INVALID_BACKREF;
7442 
7443             if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
7444               if (back_num > env->num_mem ||
7445                   IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node))
7446                 return ONIGERR_INVALID_BACKREF;
7447             }
7448 
7449             condition = node_new_backref_checker(1, &back_num, FALSE,
7450 #ifdef USE_BACKREF_WITH_LEVEL
7451                                                  exist_level, level,
7452 #endif
7453                                                  env);
7454           }
7455           else {
7456             int num;
7457             int* backs;
7458 
7459             num = name_to_group_numbers(env, prev, name_end, &backs);
7460             if (num <= 0) {
7461               return ONIGERR_UNDEFINED_NAME_REFERENCE;
7462             }
7463             if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
7464               int i;
7465               for (i = 0; i < num; i++) {
7466                 if (backs[i] > env->num_mem ||
7467                     IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node))
7468                   return ONIGERR_INVALID_BACKREF;
7469               }
7470             }
7471 
7472             condition = node_new_backref_checker(num, backs, TRUE,
7473 #ifdef USE_BACKREF_WITH_LEVEL
7474                                                  exist_level, level,
7475 #endif
7476                                                  env);
7477           }
7478 
7479           if (is_enclosed != 0) {
7480             if (PEND) goto err_if_else;
7481             PFETCH(c);
7482             if (c != ')') goto err_if_else;
7483           }
7484         }
7485 #ifdef USE_CALLOUT
7486         else if (c == '?') {
7487           if (IS_SYNTAX_OP2(env->syntax,
7488                             ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS)) {
7489             if (! PEND && PPEEK_IS('{')) {
7490               /* condition part is callouts of contents: (?(?{...})THEN|ELSE) */
7491               condition_is_checker = 0;
7492               PFETCH(c);
7493               r = parse_callout_of_contents(&condition, ')', &p, end, env);
7494               if (r != 0) return r;
7495               goto end_condition;
7496             }
7497           }
7498           goto any_condition;
7499         }
7500         else if (c == '*' &&
7501                  IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) {
7502           condition_is_checker = 0;
7503           r = parse_callout_of_name(&condition, ')', &p, end, env);
7504           if (r != 0) return r;
7505           goto end_condition;
7506         }
7507 #endif
7508         else {
7509         any_condition:
7510           PUNFETCH;
7511           condition_is_checker = 0;
7512           r = fetch_token(tok, &p, end, env);
7513           if (r < 0) return r;
7514           r = parse_alts(&condition, tok, term, &p, end, env, FALSE);
7515           if (r < 0) {
7516             onig_node_free(condition);
7517             return r;
7518           }
7519         }
7520 
7521 #ifdef USE_CALLOUT
7522       end_condition:
7523 #endif
7524         CHECK_NULL_RETURN_MEMERR(condition);
7525 
7526         if (PEND) {
7527         err_if_else:
7528           onig_node_free(condition);
7529           return ONIGERR_END_PATTERN_IN_GROUP;
7530         }
7531 
7532         if (PPEEK_IS(')')) { /* case: empty body: make backref checker */
7533           if (condition_is_checker == 0) {
7534             onig_node_free(condition);
7535             return ONIGERR_INVALID_IF_ELSE_SYNTAX;
7536           }
7537           PFETCH(c);
7538           *np = condition;
7539         }
7540         else { /* if-else */
7541           int then_is_empty;
7542           Node *Then, *Else;
7543 
7544           Then = 0;
7545           if (PPEEK_IS('|')) {
7546             PFETCH(c);
7547             then_is_empty = 1;
7548           }
7549           else
7550             then_is_empty = 0;
7551 
7552           r = fetch_token(tok, &p, end, env);
7553           if (r < 0) {
7554             onig_node_free(condition);
7555             return r;
7556           }
7557           r = parse_alts(&target, tok, term, &p, end, env, TRUE);
7558           if (r < 0) {
7559             onig_node_free(condition);
7560             onig_node_free(target);
7561             return r;
7562           }
7563 
7564           if (then_is_empty != 0) {
7565             Else = target;
7566           }
7567           else {
7568             if (NODE_TYPE(target) == NODE_ALT) {
7569               Then = NODE_CAR(target);
7570               if (NODE_CDR(NODE_CDR(target)) == NULL_NODE) {
7571                 Else = NODE_CAR(NODE_CDR(target));
7572                 cons_node_free_alone(NODE_CDR(target));
7573               }
7574               else {
7575                 Else = NODE_CDR(target);
7576               }
7577               cons_node_free_alone(target);
7578             }
7579             else {
7580               Then = target;
7581               Else = 0;
7582             }
7583           }
7584 
7585           *np = node_new_bag_if_else(condition, Then, Else);
7586           if (IS_NULL(*np)) {
7587             onig_node_free(condition);
7588             onig_node_free(Then);
7589             onig_node_free(Else);
7590             return ONIGERR_MEMORY;
7591           }
7592         }
7593         goto end;
7594       }
7595       else {
7596         return ONIGERR_UNDEFINED_GROUP_OPTION;
7597       }
7598       break;
7599 
7600 #ifdef USE_CAPTURE_HISTORY
7601     case '@':
7602       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
7603         if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
7604           PFETCH(c);
7605           if (c == '<' || c == '\'') {
7606             list_capture = 1;
7607             goto named_group2; /* (?@<name>...) */
7608           }
7609           PUNFETCH;
7610         }
7611 
7612         *np = node_new_memory(0);
7613         CHECK_NULL_RETURN_MEMERR(*np);
7614         num = scan_env_add_mem_entry(env);
7615         if (num < 0) {
7616           return num;
7617         }
7618         else if (num >= (int )MEM_STATUS_BITS_NUM) {
7619           return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
7620         }
7621         BAG_(*np)->m.regnum = num;
7622         MEM_STATUS_ON_SIMPLE(env->cap_history, num);
7623       }
7624       else {
7625         return ONIGERR_UNDEFINED_GROUP_OPTION;
7626       }
7627       break;
7628 #endif
7629 
7630 #ifdef USE_POSIXLINE_OPTION
7631     case 'p':
7632 #endif
7633     case '-': case 'i': case 'm': case 's': case 'x':
7634     case 'W': case 'D': case 'S': case 'P':
7635     case 'y':
7636       {
7637         int neg = 0;
7638 
7639         while (1) {
7640           switch (c) {
7641           case ':':
7642           case ')':
7643             break;
7644 
7645           case '-':  neg = 1; break;
7646           case 'x':  OPTION_NEGATE(option, ONIG_OPTION_EXTEND,     neg); break;
7647           case 'i':  OPTION_NEGATE(option, ONIG_OPTION_IGNORECASE, neg); break;
7648           case 's':
7649             if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
7650               OPTION_NEGATE(option, ONIG_OPTION_MULTILINE,  neg);
7651             }
7652             else
7653               return ONIGERR_UNDEFINED_GROUP_OPTION;
7654             break;
7655 
7656           case 'm':
7657             if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
7658               OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? TRUE : FALSE));
7659             }
7660             else if (IS_SYNTAX_OP2(env->syntax,
7661                         ONIG_SYN_OP2_OPTION_ONIGURUMA|ONIG_SYN_OP2_OPTION_RUBY)) {
7662               OPTION_NEGATE(option, ONIG_OPTION_MULTILINE,  neg);
7663             }
7664             else
7665               return ONIGERR_UNDEFINED_GROUP_OPTION;
7666             break;
7667 #ifdef USE_POSIXLINE_OPTION
7668           case 'p':
7669             OPTION_NEGATE(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
7670             break;
7671 #endif
7672           case 'W': OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); break;
7673           case 'D': OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); break;
7674           case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break;
7675           case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break;
7676 
7677           case 'y': /* y{g}, y{w} */
7678             {
7679               if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
7680                 return ONIGERR_UNDEFINED_GROUP_OPTION;
7681 
7682               if (neg != 0) return ONIGERR_UNDEFINED_GROUP_OPTION;
7683 
7684               if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7685               if (! PPEEK_IS('{')) return ONIGERR_UNDEFINED_GROUP_OPTION;
7686               PFETCH(c);
7687               if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7688               PFETCH(c);
7689               switch (c) {
7690               case 'g':
7691                 if (! ONIGENC_IS_UNICODE_ENCODING(enc))
7692                   return ONIGERR_UNDEFINED_GROUP_OPTION;
7693 
7694                 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, FALSE);
7695                 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, TRUE);
7696                 break;
7697 #ifdef USE_UNICODE_WORD_BREAK
7698               case 'w':
7699                 if (! ONIGENC_IS_UNICODE_ENCODING(enc))
7700                   return ONIGERR_UNDEFINED_GROUP_OPTION;
7701 
7702                 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, FALSE);
7703                 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, TRUE);
7704                 break;
7705 #endif
7706               default:
7707                 return ONIGERR_UNDEFINED_GROUP_OPTION;
7708                 break;
7709               }
7710               if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7711               PFETCH(c);
7712               if (c != '}')
7713                 return ONIGERR_UNDEFINED_GROUP_OPTION;
7714               break;
7715             } /* case 'y' */
7716 
7717           default:
7718             return ONIGERR_UNDEFINED_GROUP_OPTION;
7719           }
7720 
7721           if (c == ')') {
7722             *np = node_new_option(option);
7723             CHECK_NULL_RETURN_MEMERR(*np);
7724             *src = p;
7725             return 2; /* option only */
7726           }
7727           else if (c == ':') {
7728             OnigOptionType prev = env->options;
7729 
7730             env->options = option;
7731             r = fetch_token(tok, &p, end, env);
7732             if (r < 0) return r;
7733             r = parse_alts(&target, tok, term, &p, end, env, FALSE);
7734             env->options = prev;
7735             if (r < 0) {
7736               onig_node_free(target);
7737               return r;
7738             }
7739             *np = node_new_option(option);
7740             CHECK_NULL_RETURN_MEMERR(*np);
7741             NODE_BODY(*np) = target;
7742             *src = p;
7743             return 0;
7744           }
7745 
7746           if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7747           PFETCH(c);
7748         } /* while (1) */
7749       }
7750       break;
7751 
7752     default:
7753       return ONIGERR_UNDEFINED_GROUP_OPTION;
7754     }
7755   }
7756 #ifdef USE_CALLOUT
7757   else if (c == '*' &&
7758            IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) {
7759     PINC;
7760     r = parse_callout_of_name(np, ')', &p, end, env);
7761     if (r != 0) return r;
7762 
7763     goto end;
7764   }
7765 #endif
7766   else {
7767     if (OPTON_DONT_CAPTURE_GROUP(env->options))
7768       goto group;
7769 
7770     *np = node_new_memory(0);
7771     CHECK_NULL_RETURN_MEMERR(*np);
7772     num = scan_env_add_mem_entry(env);
7773     if (num < 0) return num;
7774     BAG_(*np)->m.regnum = num;
7775   }
7776 
7777   CHECK_NULL_RETURN_MEMERR(*np);
7778   r = fetch_token(tok, &p, end, env);
7779   if (r < 0) return r;
7780   r = parse_alts(&target, tok, term, &p, end, env, FALSE);
7781   if (r < 0) {
7782     onig_node_free(target);
7783     return r;
7784   }
7785 
7786   NODE_BODY(*np) = target;
7787 
7788   if (NODE_TYPE(*np) == NODE_BAG) {
7789     if (BAG_(*np)->type == BAG_MEMORY) {
7790       /* Don't move this to previous of parse_alts() */
7791       r = scan_env_set_mem_node(env, BAG_(*np)->m.regnum, *np);
7792       if (r != 0) return r;
7793     }
7794   }
7795 
7796  end:
7797   *src = p;
7798   return 0;
7799 }
7800 
/* Printable spellings of the six quantifier kinds that get special
   treatment.  assign_quantifier_body() indexes this table with the value
   returned by quantifier_type_num() when it formats a nested-repeat
   warning. */
static const char* PopularQStr[] = {
  "?",    /* 0 */
  "*",    /* 1 */
  "+",    /* 2 */
  "??",   /* 3 */
  "*?",   /* 4 */
  "+?"    /* 5 */
};

/* Text inserted into the "was replaced with" warning.  The index is the
   ReduceTypeTable entry chosen for the inner/outer quantifier pair; the
   first two slots are placeholders because those entries produce either no
   warning or a warning without replacement text. */
static const char* ReduceQStr[] = {
  "",          /* 0 */
  "",          /* 1 */
  "*",         /* 2 */
  "*?",        /* 3 */
  "??",        /* 4 */
  "+ and ??",  /* 5 */
  "+? and ?"   /* 6 */
};
7808 
7809 static int
assign_quantifier_body(Node * qnode,Node * target,int group,ScanEnv * env)7810 assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env)
7811 {
7812   QuantNode* qn;
7813 
7814   qn = QUANT_(qnode);
7815   if (qn->lower == 1 && qn->upper == 1)
7816     return 1;
7817 
7818   switch (NODE_TYPE(target)) {
7819   case NODE_STRING:
7820     if (group == 0) {
7821       if (str_node_can_be_split(target, env->enc)) {
7822         Node* n = str_node_split_last_char(target, env->enc);
7823         if (IS_NOT_NULL(n)) {
7824           NODE_BODY(qnode) = n;
7825           return 2;
7826         }
7827       }
7828     }
7829     break;
7830 
7831   case NODE_QUANT:
7832     { /* check redundant double repeat. */
7833       /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
7834       QuantNode* qnt   = QUANT_(target);
7835       int nestq_num   = quantifier_type_num(qn);
7836       int targetq_num = quantifier_type_num(qnt);
7837 
7838 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
7839       if (targetq_num >= 0 && nestq_num >= 0 &&
7840           IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
7841         UChar buf[WARN_BUFSIZE];
7842 
7843         switch(ReduceTypeTable[targetq_num][nestq_num]) {
7844         case RQ_ASIS:
7845           break;
7846 
7847         case RQ_DEL:
7848           if (onig_verb_warn != onig_null_warn) {
7849             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
7850                                   env->pattern, env->pattern_end,
7851                                   (UChar* )"redundant nested repeat operator");
7852             (*onig_verb_warn)((char* )buf);
7853           }
7854           goto warn_exit;
7855           break;
7856 
7857         default:
7858           if (onig_verb_warn != onig_null_warn) {
7859             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
7860                                        env->pattern, env->pattern_end,
7861             (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
7862             PopularQStr[targetq_num], PopularQStr[nestq_num],
7863             ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
7864             (*onig_verb_warn)((char* )buf);
7865           }
7866           goto warn_exit;
7867           break;
7868         }
7869       }
7870 
7871     warn_exit:
7872 #endif
7873       if (targetq_num >= 0 && nestq_num < 0) {
7874         if (targetq_num == 1 || targetq_num == 2) { /* * or + */
7875           /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
7876           if (! IS_INFINITE_REPEAT(qn->upper) && qn->upper > 1 && qn->greedy) {
7877             qn->upper = (qn->lower == 0 ? 1 : qn->lower);
7878           }
7879         }
7880       }
7881       else {
7882         int r;
7883 
7884         NODE_BODY(qnode) = target;
7885         r = onig_reduce_nested_quantifier(qnode);
7886         return r;
7887       }
7888     }
7889     break;
7890 
7891   default:
7892     break;
7893   }
7894 
7895   NODE_BODY(qnode) = target;
7896   return 0;
7897 }
7898 
7899 
7900 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
7901 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)7902 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
7903 {
7904   BBuf *tbuf;
7905   int r;
7906 
7907   if (IS_NCCLASS_NOT(cc)) {
7908     bitset_invert(cc->bs);
7909 
7910     if (! ONIGENC_IS_SINGLEBYTE(enc)) {
7911       r = not_code_range_buf(enc, cc->mbuf, &tbuf);
7912       if (r != 0) return r;
7913 
7914       bbuf_free(cc->mbuf);
7915       cc->mbuf = tbuf;
7916     }
7917 
7918     NCCLASS_CLEAR_NOT(cc);
7919   }
7920 
7921   return 0;
7922 }
7923 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
7924 
/*
 * Add the single code point `code` to character class `cc`.
 *
 * The code goes into the bitset only when the encoding's minimum character
 * length is at most 1 and the code itself encodes to exactly one byte;
 * every other code is appended to the multi-byte range buffer as the
 * one-element range [code, code].
 *
 * `code` and `enc` are expanded more than once, so pass expressions without
 * side effects.  Whatever add_code_range_to_buf() reports is discarded.
 */
#define ADD_CODE_INTO_CC(cc, code, enc) do {\
  if (ONIGENC_MBC_MINLEN(enc) <= 1 && ONIGENC_CODE_TO_MBCLEN(enc, code) == 1) {\
    BITSET_SET_BIT((cc)->bs, code);\
  }\
  else {\
    add_code_range_to_buf(&((cc)->mbuf), code, code);\
  }\
} while (0)
7933 
7934 extern int
onig_new_cclass_with_code_list(Node ** rnode,OnigEncoding enc,int n,OnigCodePoint codes[])7935 onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc,
7936                                int n, OnigCodePoint codes[])
7937 {
7938   int i;
7939   Node* node;
7940   CClassNode* cc;
7941 
7942   *rnode = NULL_NODE;
7943 
7944   node = node_new_cclass();
7945   CHECK_NULL_RETURN_MEMERR(node);
7946 
7947   cc = CCLASS_(node);
7948 
7949   for (i = 0; i < n; i++) {
7950     ADD_CODE_INTO_CC(cc, codes[i], enc);
7951   }
7952 
7953   *rnode = node;
7954   return 0;
7955 }
7956 
/* State shared between parse_exp() and the i_apply_case_fold() callback
   while the case-fold expansions of one character class are collected. */
typedef struct {
  ScanEnv*    env;      /* parse environment; the callback reads env->enc */
  CClassNode* cc;       /* class being expanded; single-code folds are added to it */
  Node*       alt_root; /* first ALT node of the multi-code expansions; NULL_NODE if none */
  Node**      ptail;    /* slot that receives the next ALT node (initially &alt_root) */
} IApplyCaseFoldArg;
7963 
7964 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)7965 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)
7966 {
7967   IApplyCaseFoldArg* iarg;
7968   ScanEnv* env;
7969   CClassNode* cc;
7970 
7971   iarg = (IApplyCaseFoldArg* )arg;
7972   env = iarg->env;
7973   cc  = iarg->cc;
7974 
7975   if (to_len == 1) {
7976     int is_in = onig_is_code_in_cc(env->enc, from, cc);
7977 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
7978     if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
7979         (is_in == 0 &&  IS_NCCLASS_NOT(cc))) {
7980       ADD_CODE_INTO_CC(cc, *to, env->enc);
7981     }
7982 #else
7983     if (is_in != 0) {
7984       if (ONIGENC_MBC_MINLEN(env->enc) > 1 ||
7985           ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) {
7986         if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
7987         add_code_range(&(cc->mbuf), env, *to, *to);
7988       }
7989       else {
7990         if (IS_NCCLASS_NOT(cc)) {
7991           BITSET_CLEAR_BIT(cc->bs, *to);
7992         }
7993         else
7994           BITSET_SET_BIT(cc->bs, *to);
7995       }
7996     }
7997 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
7998   }
7999   else {
8000     int r, i, len;
8001     UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
8002 
8003     if (onig_is_code_in_cc(env->enc, from, cc)
8004 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
8005         && !IS_NCCLASS_NOT(cc)
8006 #endif
8007         ) {
8008       int n, j, m, index;
8009       Node* list_node;
8010       Node* ns[3];
8011 
8012       n = 0;
8013       for (i = 0; i < to_len; i++) {
8014         OnigCodePoint code;
8015         Node* csnode;
8016         CClassNode* cs_cc;
8017 
8018         index = onigenc_unicode_fold1_key(&to[i]);
8019         if (index >= 0) {
8020           csnode = node_new_cclass();
8021           cs_cc = CCLASS_(csnode);
8022           if (IS_NULL(csnode)) {
8023           err_free_ns:
8024             for (j = 0; j < n; j++) onig_node_free(ns[j]);
8025             return ONIGERR_MEMORY;
8026           }
8027           m = FOLDS1_UNFOLDS_NUM(index);
8028           for (j = 0; j < m; j++) {
8029             code = FOLDS1_UNFOLDS(index)[j];
8030             ADD_CODE_INTO_CC(cs_cc, code, env->enc);
8031           }
8032           ADD_CODE_INTO_CC(cs_cc, to[i], env->enc);
8033           ns[n++] = csnode;
8034         }
8035         else {
8036           len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
8037           if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) {
8038             csnode = node_new_str(buf, buf + len);
8039             if (IS_NULL(csnode)) goto err_free_ns;
8040 
8041             NODE_STRING_SET_CASE_EXPANDED(csnode);
8042             ns[n++] = csnode;
8043           }
8044           else {
8045             r = onig_node_str_cat(ns[n-1], buf, buf + len);
8046             if (r < 0) goto err_free_ns;
8047           }
8048         }
8049       }
8050 
8051       if (n == 1)
8052         list_node = ns[0];
8053       else
8054         list_node = make_list(n, ns);
8055 
8056       *(iarg->ptail) = onig_node_new_alt(list_node, NULL_NODE);
8057       if (IS_NULL(*(iarg->ptail))) {
8058         onig_node_free(list_node);
8059         return ONIGERR_MEMORY;
8060       }
8061       iarg->ptail = &(NODE_CDR((*(iarg->ptail))));
8062     }
8063   }
8064 
8065   return 0;
8066 }
8067 
8068 static int
parse_exp(Node ** np,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env,int group_head)8069 parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,
8070           ScanEnv* env, int group_head)
8071 {
8072   int r, len, group;
8073   Node* qn;
8074   Node** tp;
8075   unsigned int parse_depth;
8076 
8077  retry:
8078   group = 0;
8079   *np = NULL;
8080   if (tok->type == (enum TokenSyms )term)
8081     goto end_of_token;
8082 
8083   parse_depth = env->parse_depth;
8084 
8085   switch (tok->type) {
8086   case TK_ALT:
8087   case TK_EOT:
8088   end_of_token:
8089     *np = node_new_empty();
8090     CHECK_NULL_RETURN_MEMERR(*np);
8091     return tok->type;
8092   break;
8093 
8094   case TK_SUBEXP_OPEN:
8095     r = parse_bag(np, tok, TK_SUBEXP_CLOSE, src, end, env);
8096     if (r < 0) return r;
8097     if (r == 1) { /* group */
8098       if (group_head == 0)
8099         group = 1;
8100       else {
8101         Node* target = *np;
8102         *np = node_new_group(target);
8103         if (IS_NULL(*np)) {
8104           onig_node_free(target);
8105           return ONIGERR_MEMORY;
8106         }
8107         group = 2;
8108       }
8109     }
8110     else if (r == 2) { /* option only */
8111       if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH)) {
8112         env->options = BAG_(*np)->o.options;
8113         r = fetch_token(tok, src, end, env);
8114         if (r < 0) return r;
8115         onig_node_free(*np);
8116         goto retry;
8117       }
8118       else {
8119         Node* target;
8120         OnigOptionType prev = env->options;
8121 
8122         env->options = BAG_(*np)->o.options;
8123         r = fetch_token(tok, src, end, env);
8124         if (r < 0) return r;
8125         r = parse_alts(&target, tok, term, src, end, env, FALSE);
8126         env->options = prev;
8127         if (r < 0) {
8128           onig_node_free(target);
8129           return r;
8130         }
8131         NODE_BODY(*np) = target;
8132       }
8133       return tok->type;
8134     }
8135     break;
8136 
8137   case TK_SUBEXP_CLOSE:
8138     if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
8139       return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
8140 
8141     if (tok->escaped) goto tk_crude_byte;
8142     else goto tk_byte;
8143     break;
8144 
8145   case TK_STRING:
8146   tk_byte:
8147     {
8148       *np = node_new_str_with_options(tok->backp, *src, env->options);
8149       CHECK_NULL_RETURN_MEMERR(*np);
8150 
8151       while (1) {
8152         r = fetch_token(tok, src, end, env);
8153         if (r < 0) return r;
8154         if (r != TK_STRING) break;
8155 
8156         r = onig_node_str_cat(*np, tok->backp, *src);
8157         if (r < 0) return r;
8158       }
8159 
8160     string_end:
8161       tp = np;
8162       goto repeat;
8163     }
8164     break;
8165 
8166   case TK_CRUDE_BYTE:
8167   tk_crude_byte:
8168     {
8169       *np = node_new_str_crude_char(tok->u.byte, env->options);
8170       CHECK_NULL_RETURN_MEMERR(*np);
8171       len = 1;
8172       while (1) {
8173         if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
8174           if (len == enclen(env->enc, STR_(*np)->s)) {
8175             r = fetch_token(tok, src, end, env);
8176             goto tk_crude_byte_end;
8177           }
8178         }
8179 
8180         r = fetch_token(tok, src, end, env);
8181         if (r < 0) return r;
8182         if (r != TK_CRUDE_BYTE)
8183           return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
8184 
8185         r = node_str_cat_char(*np, tok->u.byte);
8186         if (r < 0) return r;
8187 
8188         len++;
8189       }
8190 
8191     tk_crude_byte_end:
8192       if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end))
8193         return ONIGERR_INVALID_WIDE_CHAR_VALUE;
8194 
8195       NODE_STRING_CLEAR_CRUDE(*np);
8196       goto string_end;
8197     }
8198     break;
8199 
8200   case TK_CODE_POINT:
8201     {
8202       UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
8203       len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
8204       if (len < 0) return len;
8205 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
8206       *np = node_new_str_crude(buf, buf + len, env->options);
8207 #else
8208       *np = node_new_str_with_options(buf, buf + len, env->options);
8209 #endif
8210       CHECK_NULL_RETURN_MEMERR(*np);
8211     }
8212     break;
8213 
8214   case TK_QUOTE_OPEN:
8215     {
8216       OnigCodePoint end_op[2];
8217       UChar *qstart, *qend, *nextp;
8218 
8219       end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
8220       end_op[1] = (OnigCodePoint )'E';
8221       qstart = *src;
8222       qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
8223       if (IS_NULL(qend)) {
8224         nextp = qend = end;
8225       }
8226       *np = node_new_str_with_options(qstart, qend, env->options);
8227       CHECK_NULL_RETURN_MEMERR(*np);
8228       *src = nextp;
8229     }
8230     break;
8231 
8232   case TK_CHAR_TYPE:
8233     {
8234       switch (tok->u.prop.ctype) {
8235       case ONIGENC_CTYPE_WORD:
8236         *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not, env->options);
8237         CHECK_NULL_RETURN_MEMERR(*np);
8238         break;
8239 
8240       case ONIGENC_CTYPE_SPACE:
8241       case ONIGENC_CTYPE_DIGIT:
8242       case ONIGENC_CTYPE_XDIGIT:
8243         {
8244           CClassNode* cc;
8245 
8246           *np = node_new_cclass();
8247           CHECK_NULL_RETURN_MEMERR(*np);
8248           cc = CCLASS_(*np);
8249           add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env);
8250           if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
8251         }
8252         break;
8253 
8254       default:
8255         return ONIGERR_PARSER_BUG;
8256         break;
8257       }
8258     }
8259     break;
8260 
8261   case TK_CHAR_PROPERTY:
8262     r = parse_char_property(np, tok, src, end, env);
8263     if (r != 0) return r;
8264     break;
8265 
8266   case TK_OPEN_CC:
8267     {
8268       CClassNode* cc;
8269 
8270       r = parse_cc(np, tok, src, end, env);
8271       if (r != 0) return r;
8272 
8273       cc = CCLASS_(*np);
8274       if (OPTON_IGNORECASE(env->options)) {
8275         IApplyCaseFoldArg iarg;
8276 
8277         iarg.env      = env;
8278         iarg.cc       = cc;
8279         iarg.alt_root = NULL_NODE;
8280         iarg.ptail    = &(iarg.alt_root);
8281 
8282         r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
8283                                         i_apply_case_fold, &iarg);
8284         if (r != 0) {
8285           onig_node_free(iarg.alt_root);
8286           return r;
8287         }
8288         if (IS_NOT_NULL(iarg.alt_root)) {
8289           Node* work = onig_node_new_alt(*np, iarg.alt_root);
8290           if (IS_NULL(work)) {
8291             onig_node_free(iarg.alt_root);
8292             return ONIGERR_MEMORY;
8293           }
8294           *np = work;
8295         }
8296       }
8297     }
8298     break;
8299 
8300   case TK_ANYCHAR:
8301     *np = node_new_anychar(env->options);
8302     CHECK_NULL_RETURN_MEMERR(*np);
8303     break;
8304 
8305   case TK_ANYCHAR_ANYTIME:
8306     *np = node_new_anychar(env->options);
8307     CHECK_NULL_RETURN_MEMERR(*np);
8308     qn = node_new_quantifier(0, INFINITE_REPEAT, FALSE);
8309     CHECK_NULL_RETURN_MEMERR(qn);
8310     NODE_BODY(qn) = *np;
8311     *np = qn;
8312     break;
8313 
8314   case TK_BACKREF:
8315     len = tok->u.backref.num;
8316     *np = node_new_backref(len,
8317                   (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
8318                   tok->u.backref.by_name,
8319 #ifdef USE_BACKREF_WITH_LEVEL
8320                            tok->u.backref.exist_level,
8321                            tok->u.backref.level,
8322 #endif
8323                            env);
8324     CHECK_NULL_RETURN_MEMERR(*np);
8325     break;
8326 
8327 #ifdef USE_CALL
8328   case TK_CALL:
8329     {
8330       int gnum = tok->u.call.gnum;
8331 
8332       *np = node_new_call(tok->u.call.name, tok->u.call.name_end,
8333                           gnum, tok->u.call.by_number);
8334       CHECK_NULL_RETURN_MEMERR(*np);
8335       env->num_call++;
8336       if (tok->u.call.by_number != 0 && gnum == 0) {
8337         env->has_call_zero = 1;
8338       }
8339     }
8340     break;
8341 #endif
8342 
8343   case TK_ANCHOR:
8344     *np = node_new_anchor_with_options(tok->u.anchor, env->options);
8345     CHECK_NULL_RETURN_MEMERR(*np);
8346     break;
8347 
8348   case TK_REPEAT:
8349   case TK_INTERVAL:
8350     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
8351       if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
8352         return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
8353       else {
8354         *np = node_new_empty();
8355         CHECK_NULL_RETURN_MEMERR(*np);
8356       }
8357     }
8358     else {
8359       goto tk_byte;
8360     }
8361     break;
8362 
8363   case TK_KEEP:
8364     r = node_new_keep(np, env);
8365     if (r < 0) return r;
8366     break;
8367 
8368   case TK_GENERAL_NEWLINE:
8369     r = node_new_general_newline(np, env);
8370     if (r < 0) return r;
8371     break;
8372 
8373   case TK_NO_NEWLINE:
8374     r = node_new_no_newline(np, env);
8375     if (r < 0) return r;
8376     break;
8377 
8378   case TK_TRUE_ANYCHAR:
8379     r = node_new_true_anychar(np);
8380     if (r < 0) return r;
8381     break;
8382 
8383   case TK_TEXT_SEGMENT:
8384     r = make_text_segment(np, env);
8385     if (r < 0) return r;
8386     break;
8387 
8388   default:
8389     return ONIGERR_PARSER_BUG;
8390     break;
8391   }
8392 
8393   {
8394     tp = np;
8395 
8396   re_entry:
8397     r = fetch_token(tok, src, end, env);
8398     if (r < 0) return r;
8399 
8400   repeat:
8401     if (r == TK_REPEAT || r == TK_INTERVAL) {
8402       Node* target;
8403 
8404       if (is_invalid_quantifier_target(*tp))
8405         return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
8406 
8407       INC_PARSE_DEPTH(parse_depth);
8408 
8409       qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
8410                                r == TK_INTERVAL);
8411       CHECK_NULL_RETURN_MEMERR(qn);
8412       QUANT_(qn)->greedy = tok->u.repeat.greedy;
8413       if (group == 2) {
8414         target = node_drop_group(*tp);
8415         *tp = NULL_NODE;
8416       }
8417       else {
8418         target = *tp;
8419       }
8420       r = assign_quantifier_body(qn, target, group, env);
8421       if (r < 0) {
8422         onig_node_free(qn);
8423         *tp = NULL_NODE;
8424         return r;
8425       }
8426 
8427       if (tok->u.repeat.possessive != 0) {
8428         Node* en;
8429         en = node_new_bag(BAG_STOP_BACKTRACK);
8430         if (IS_NULL(en)) {
8431           onig_node_free(qn);
8432           return ONIGERR_MEMORY;
8433         }
8434         NODE_BODY(en) = qn;
8435         qn = en;
8436       }
8437 
8438       if (r == 0) {
8439         *tp = qn;
8440       }
8441       else if (r == 1) { /* x{1,1} ==> x */
8442         onig_node_free(qn);
8443         *tp = target;
8444       }
8445       else if (r == 2) { /* split case: /abc+/ */
8446         Node *tmp;
8447 
8448         *tp = node_new_list(*tp, NULL);
8449         if (IS_NULL(*tp)) {
8450           onig_node_free(qn);
8451           return ONIGERR_MEMORY;
8452         }
8453         tmp = NODE_CDR(*tp) = node_new_list(qn, NULL);
8454         if (IS_NULL(tmp)) {
8455           onig_node_free(qn);
8456           return ONIGERR_MEMORY;
8457         }
8458         tp = &(NODE_CAR(tmp));
8459       }
8460       group = 0;
8461       goto re_entry;
8462     }
8463   }
8464 
8465   return r;
8466 }
8467 
8468 static int
parse_branch(Node ** top,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env,int group_head)8469 parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end,
8470              ScanEnv* env, int group_head)
8471 {
8472   int r;
8473   Node *node, **headp;
8474 
8475   *top = NULL;
8476   INC_PARSE_DEPTH(env->parse_depth);
8477 
8478   r = parse_exp(&node, tok, term, src, end, env, group_head);
8479   if (r < 0) {
8480     onig_node_free(node);
8481     return r;
8482   }
8483 
8484   if (r == TK_EOT || r == term || r == TK_ALT) {
8485     *top = node;
8486   }
8487   else {
8488     *top = node_new_list(node, NULL);
8489     if (IS_NULL(*top)) {
8490       onig_node_free(node);
8491       return ONIGERR_MEMORY;
8492     }
8493 
8494     headp = &(NODE_CDR(*top));
8495     while (r != TK_EOT && r != term && r != TK_ALT) {
8496       r = parse_exp(&node, tok, term, src, end, env, FALSE);
8497       if (r < 0) {
8498         onig_node_free(node);
8499         return r;
8500       }
8501 
8502       if (NODE_TYPE(node) == NODE_LIST) {
8503         *headp = node;
8504         while (IS_NOT_NULL(NODE_CDR(node))) node = NODE_CDR(node);
8505         headp = &(NODE_CDR(node));
8506       }
8507       else {
8508         *headp = node_new_list(node, NULL);
8509         headp = &(NODE_CDR(*headp));
8510       }
8511     }
8512   }
8513 
8514   DEC_PARSE_DEPTH(env->parse_depth);
8515   return r;
8516 }
8517 
8518 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
8519 static int
parse_alts(Node ** top,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env,int group_head)8520 parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end,
8521            ScanEnv* env, int group_head)
8522 {
8523   int r;
8524   Node *node, **headp;
8525   OnigOptionType save_options;
8526 
8527   *top = NULL;
8528   INC_PARSE_DEPTH(env->parse_depth);
8529   save_options = env->options;
8530 
8531   r = parse_branch(&node, tok, term, src, end, env, group_head);
8532   if (r < 0) {
8533     onig_node_free(node);
8534     return r;
8535   }
8536 
8537   if (r == term) {
8538     *top = node;
8539   }
8540   else if (r == TK_ALT) {
8541     *top  = onig_node_new_alt(node, NULL);
8542     if (IS_NULL(*top)) {
8543       onig_node_free(node);
8544       return ONIGERR_MEMORY;
8545     }
8546 
8547     headp = &(NODE_CDR(*top));
8548     while (r == TK_ALT) {
8549       r = fetch_token(tok, src, end, env);
8550       if (r < 0) return r;
8551       r = parse_branch(&node, tok, term, src, end, env, FALSE);
8552       if (r < 0) {
8553         onig_node_free(node);
8554         return r;
8555       }
8556       *headp = onig_node_new_alt(node, NULL);
8557       if (IS_NULL(*headp)) {
8558         onig_node_free(node);
8559         onig_node_free(*top);
8560         return ONIGERR_MEMORY;
8561       }
8562 
8563       headp = &(NODE_CDR(*headp));
8564     }
8565 
8566     if (tok->type != (enum TokenSyms )term)
8567       goto err;
8568   }
8569   else {
8570     onig_node_free(node);
8571   err:
8572     if (term == TK_SUBEXP_CLOSE)
8573       return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
8574     else
8575       return ONIGERR_PARSER_BUG;
8576   }
8577 
8578   env->options = save_options;
8579   DEC_PARSE_DEPTH(env->parse_depth);
8580   return r;
8581 }
8582 
8583 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)8584 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
8585 {
8586   int r;
8587   PToken tok;
8588 
8589   r = fetch_token(&tok, src, end, env);
8590   if (r < 0) return r;
8591   r = parse_alts(top, &tok, TK_EOT, src, end, env, FALSE);
8592   if (r < 0) return r;
8593 
8594   return 0;
8595 }
8596 
#ifdef USE_CALL
/*
 * Wrap `node` as the body of a new unnamed memory node with regnum 0 and
 * register that wrapper via scan_env_set_mem_node(env, 0, ...).
 *
 * On success stores the wrapper in *rnode and returns 0.  If the wrapper
 * cannot be allocated, CHECK_NULL_RETURN_MEMERR returns from the function;
 * if registration fails, the wrapper is freed and the error is returned.
 */
static int
make_call_zero_body(Node* node, ScanEnv* env, Node** rnode)
{
  int status;
  Node* wrapper;

  wrapper = node_new_memory(0 /* 0: is not named */);
  CHECK_NULL_RETURN_MEMERR(wrapper);

  NODE_BODY(wrapper) = node;
  BAG_(wrapper)->m.regnum = 0;

  status = scan_env_set_mem_node(env, 0, wrapper);
  if (status == 0) {
    *rnode = wrapper;
    return 0;
  }

  onig_node_free(wrapper);
  return status;
}
#endif
8618 
8619 extern int
onig_parse_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)8620 onig_parse_tree(Node** root, const UChar* pattern, const UChar* end,
8621                 regex_t* reg, ScanEnv* env)
8622 {
8623   int r;
8624   UChar* p;
8625 #ifdef USE_CALLOUT
8626   RegexExt* ext;
8627 #endif
8628 
8629   names_clear(reg);
8630 
8631   scan_env_clear(env);
8632   env->options        = reg->options;
8633   env->case_fold_flag = reg->case_fold_flag;
8634   env->enc            = reg->enc;
8635   env->syntax         = reg->syntax;
8636   env->pattern        = (UChar* )pattern;
8637   env->pattern_end    = (UChar* )end;
8638   env->reg            = reg;
8639 
8640   *root = NULL;
8641 
8642   if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, pattern, end))
8643     return ONIGERR_INVALID_WIDE_CHAR_VALUE;
8644 
8645   p = (UChar* )pattern;
8646   r = parse_regexp(root, &p, (UChar* )end, env);
8647 
8648 #ifdef USE_CALL
8649   if (r != 0) return r;
8650 
8651   if (env->has_call_zero != 0) {
8652     Node* zero_node;
8653     r = make_call_zero_body(*root, env, &zero_node);
8654     if (r != 0) return r;
8655 
8656     *root = zero_node;
8657   }
8658 #endif
8659 
8660   reg->num_mem = env->num_mem;
8661 
8662 #ifdef USE_CALLOUT
8663   ext = reg->extp;
8664   if (IS_NOT_NULL(ext) && ext->callout_num > 0) {
8665     r = setup_ext_callout_list_values(reg);
8666   }
8667 #endif
8668 
8669   return r;
8670 }
8671 
8672 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)8673 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
8674                                UChar* arg, UChar* arg_end)
8675 {
8676   env->error     = arg;
8677   env->error_end = arg_end;
8678 }
8679