1 /**********************************************************************
2   regparse.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2020  K.Kosako
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #ifdef DEBUG_NODE_FREE
31 #ifndef NEED_TO_INCLUDE_STDIO
32 #define NEED_TO_INCLUDE_STDIO
33 #endif
34 #endif
35 
36 #include "regparse.h"
37 #include "st.h"
38 
/* Sizing constants (NOTE(review): their users lie outside this chunk). */
#define INIT_TAG_NAMES_ALLOC_NUM  5
#define WARN_BUFSIZE              256

/* Compile-time feature switch: defined with no value. */
#define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
44 
/*
 * Predicates for code points allowed in a callout name and in a callout
 * tag name.  Both accept ASCII letters, ASCII digits and '_'.
 *
 * The argument is fully parenthesized so that expression arguments
 * (e.g. a conditional expression) are compared as a unit.  It is still
 * evaluated several times, so it must be free of side effects.
 */
#define IS_ALLOWED_CODE_IN_CALLOUT_NAME(c) \
  (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
   ((c) >= '0' && (c) <= '9') || (c) == '_' /* || (c) == '!' */)
#define IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c) \
  (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
   ((c) >= '0' && (c) <= '9') || (c) == '_')
49 
/*
 * Option probes.  Each macro masks the option word with one or more
 * ONIG_OPTION_* bits and yields the masked value (non-zero when any of
 * the probed bits is set).
 */
#define OPTON_SINGLELINE(opt)         ((opt) & ONIG_OPTION_SINGLELINE)
#define OPTON_MULTILINE(opt)          ((opt) & ONIG_OPTION_MULTILINE)
#define OPTON_IGNORECASE(opt)         ((opt) & ONIG_OPTION_IGNORECASE)
#define OPTON_EXTEND(opt)             ((opt) & ONIG_OPTION_EXTEND)
#define OPTON_POSIX_ASCII(opt)        ((opt) & ONIG_OPTION_POSIX_IS_ASCII)
#define OPTON_TEXT_SEGMENT_WORD(opt)  ((opt) & ONIG_OPTION_TEXT_SEGMENT_WORD)

/* The per-class ASCII probes are also satisfied by the POSIX_IS_ASCII bit. */
#define OPTON_WORD_ASCII(opt) \
  ((opt) & (ONIG_OPTION_WORD_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII))
#define OPTON_DIGIT_ASCII(opt) \
  ((opt) & (ONIG_OPTION_DIGIT_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII))
#define OPTON_SPACE_ASCII(opt) \
  ((opt) & (ONIG_OPTION_SPACE_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII))

/*
 * Non-zero when ctype is non-negative and the options select ASCII
 * mode for it: any ctype below ONIGENC_CTYPE_ASCII under POSIX_IS_ASCII,
 * or WORD / DIGIT / SPACE under their own ASCII probe.
 */
#define OPTON_IS_ASCII_MODE_CTYPE(ctype, options) \
  ((ctype) >= 0 && \
   (((ctype) <  ONIGENC_CTYPE_ASCII && OPTON_POSIX_ASCII(options)) || \
    ((ctype) == ONIGENC_CTYPE_WORD  && OPTON_WORD_ASCII(options))  || \
    ((ctype) == ONIGENC_CTYPE_DIGIT && OPTON_DIGIT_ASCII(options)) || \
    ((ctype) == ONIGENC_CTYPE_SPACE && OPTON_SPACE_ASCII(options))))
69 
70 
71 OnigSyntaxType OnigSyntaxOniguruma = {
72   (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
73      ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
74      ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL |
75      ONIG_SYN_OP_ESC_CONTROL_CHARS |
76      ONIG_SYN_OP_ESC_C_CONTROL )
77    & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
78   , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
79       ONIG_SYN_OP2_OPTION_ONIGURUMA |
80       ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
81       ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE |
82       ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP |
83       ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS |
84       ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME    |
85       ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT |
86       ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE |
87       ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT |
88       ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
89       ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
90       ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
91       ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
92       ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
93       ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
94       ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
95       ONIG_SYN_OP2_ESC_H_XDIGIT | ONIG_SYN_OP2_ESC_U_HEX4 )
96   , ( SYN_GNU_REGEX_BV |
97       ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
98       ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
99       ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND |
100       ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
101       ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
102       ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
103       ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC |
104       ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
105       ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
106   , ONIG_OPTION_NONE
107   ,
108   {
109       (OnigCodePoint )'\\'                       /* esc */
110     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
111     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
112     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
113     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
114     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
115   }
116 };
117 
118 OnigSyntaxType OnigSyntaxRuby = {
119   (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
120      ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
121      ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL |
122      ONIG_SYN_OP_ESC_CONTROL_CHARS |
123      ONIG_SYN_OP_ESC_C_CONTROL )
124    & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
125   , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
126       ONIG_SYN_OP2_OPTION_RUBY |
127       ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
128       ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE |
129       ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP |
130       ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT |
131       ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE |
132       ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
133       ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
134       ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
135       ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
136       ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
137       ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
138       ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
139       ONIG_SYN_OP2_ESC_H_XDIGIT | ONIG_SYN_OP2_ESC_U_HEX4 )
140   , ( SYN_GNU_REGEX_BV |
141       ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
142       ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
143       ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
144       ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
145       ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
146       ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
147       ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
148   , ONIG_OPTION_NONE
149   ,
150   {
151       (OnigCodePoint )'\\'                       /* esc */
152     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
153     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
154     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
155     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
156     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
157   }
158 };
159 
160 OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_ONIGURUMA;
161 
/*
 * State tags, numbered 0..3 in declaration order.
 * NOTE(review): presumably character-class parser states; the users
 * are outside this chunk.
 */
typedef enum {
  CS_VALUE    = 0,
  CS_RANGE    = 1,
  CS_COMPLETE = 2,
  CS_START    = 3
} CSTATE;
168 
/*
 * Value-kind tags, numbered 0..3 in declaration order.
 * NOTE(review): presumably undefined / single-byte / multi-byte /
 * character-property; the users are outside this chunk.
 */
typedef enum {
  CV_UNDEF = 0,
  CV_SB    = 1,
  CV_MB    = 2,
  CV_CPROP = 3
} CVAL;
175 
/* Warning handler that ignores its message and does nothing. */
extern void
onig_null_warn(const char* s ARG_UNUSED)
{
}
177 
178 #ifdef DEFAULT_WARN_FUNCTION
179 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
180 #else
181 static OnigWarnFunc onig_warn = onig_null_warn;
182 #endif
183 
184 #ifdef DEFAULT_VERB_WARN_FUNCTION
185 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
186 #else
187 static OnigWarnFunc onig_verb_warn = onig_null_warn;
188 #endif
189 
/* Install f as the warning handler (stored in onig_warn). */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  onig_warn = f;
}
194 
/* Install f as the verbose warning handler (stored in onig_verb_warn). */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  onig_verb_warn = f;
}
199 
200 extern void
onig_warning(const char * s)201 onig_warning(const char* s)
202 {
203   if (onig_warn == onig_null_warn) return ;
204 
205   (*onig_warn)(s);
206 }
207 
#define DEFAULT_MAX_CAPTURE_NUM   32767

/* Capture-count limit; changed through onig_set_capture_num_limit(). */
static int MaxCaptureNum = DEFAULT_MAX_CAPTURE_NUM;

/*
 * Set the capture-count limit.
 * Returns 0 on success, or -1 (limit unchanged) when num is negative.
 */
extern int
onig_set_capture_num_limit(int num)
{
  if (num >= 0) {
    MaxCaptureNum = num;
    return 0;
  }

  return -1;
}
220 
221 static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
222 
223 extern unsigned int
onig_get_parse_depth_limit(void)224 onig_get_parse_depth_limit(void)
225 {
226   return ParseDepthLimit;
227 }
228 
229 extern int
onig_set_parse_depth_limit(unsigned int depth)230 onig_set_parse_depth_limit(unsigned int depth)
231 {
232   if (depth == 0)
233     ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
234   else
235     ParseDepthLimit = depth;
236   return 0;
237 }
238 
/*
 * INC_PARSE_DEPTH(d): increment the depth counter d and make the
 * enclosing function return ONIGERR_PARSE_DEPTH_LIMIT_OVER when d then
 * exceeds ParseDepthLimit.  With ONIG_DEBUG_PARSE the largest depth
 * seen is also recorded in env->max_parse_depth, so a variable named
 * `env` must be in scope at the point of use.
 *
 * DEC_PARSE_DEPTH(d): decrement the depth counter d.
 *
 * d is expanded more than once; pass a plain lvalue.
 */
#ifdef ONIG_DEBUG_PARSE
#define INC_PARSE_DEPTH(d) do {\
  (d)++;\
  if (env->max_parse_depth < (d)) env->max_parse_depth = (d);\
  if ((d) > ParseDepthLimit) \
    return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\
} while (0)
#else
#define INC_PARSE_DEPTH(d) do {\
  (d)++;\
  if ((d) > ParseDepthLimit) \
    return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\
} while (0)
#endif

#define DEC_PARSE_DEPTH(d)  ((d)--)
255 
256 
257 static int
bbuf_init(BBuf * buf,int size)258 bbuf_init(BBuf* buf, int size)
259 {
260   if (size <= 0) {
261     size   = 0;
262     buf->p = NULL;
263   }
264   else {
265     buf->p = (UChar* )xmalloc(size);
266     if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
267   }
268 
269   buf->alloc = size;
270   buf->used  = 0;
271   return 0;
272 }
273 
274 static void
bbuf_free(BBuf * bbuf)275 bbuf_free(BBuf* bbuf)
276 {
277   if (IS_NOT_NULL(bbuf)) {
278     if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
279     xfree(bbuf);
280   }
281 }
282 
283 static int
bbuf_clone(BBuf ** rto,BBuf * from)284 bbuf_clone(BBuf** rto, BBuf* from)
285 {
286   int r;
287   BBuf *to;
288 
289   *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
290   CHECK_NULL_RETURN_MEMERR(to);
291   r = BB_INIT(to, from->alloc);
292   if (r != 0) {
293     bbuf_free(to);
294     *rto = 0;
295     return r;
296   }
297   to->used = from->used;
298   xmemcpy(to->p, from->p, from->used);
299   return 0;
300 }
301 
302 static int
backref_rel_to_abs(int rel_no,ScanEnv * env)303 backref_rel_to_abs(int rel_no, ScanEnv* env)
304 {
305   if (rel_no > 0) {
306     if (rel_no > ONIG_INT_MAX - env->num_mem)
307       return ONIGERR_INVALID_BACKREF;
308     return env->num_mem + rel_no;
309   }
310   else {
311     return env->num_mem + 1 + rel_no;
312   }
313 }
314 
/*
 * Flag helpers operating in place on the lvalue v.
 *   OPTION_ON     sets the bits f.
 *   OPTION_OFF    clears the bits f.
 *   OPTION_NEGATE clears f when `negative` is non-zero, else sets f.
 * Each expands to a single parenthesized expression, so it is safe to
 * use inside a larger expression.
 */
#define OPTION_ON(v,f)     ((v) |= (f))
#define OPTION_OFF(v,f)    ((v) &= ~(f))

#define OPTION_NEGATE(v,f,negative) \
  ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
319 
/*
 * First code point of the multi-byte range: 0 when the encoding's
 * minimum character length exceeds one byte, otherwise 0x80.
 */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) <= 1 ? 0x80 : 0)

/* Add the range [MBCODE_START_POS(enc), ~0] to the code range buffer. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/*
 * For an encoding that is not single-byte, add the whole multi-byte
 * range to mbuf.  Uses the caller's variable `r` and makes the
 * enclosing function return r when the add fails.
 */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r != 0) return r;\
  }\
} while (0)
332 
333 
/*
 * Store 1 in `empty` when all BITSET_REAL_SIZE words of bs are zero,
 * otherwise 0.  The expansion declares a local `i`.
 */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i = 0;\
  empty = 1;\
  while (i < (int )BITSET_REAL_SIZE) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
    i++;\
  }\
} while (0)
343 
344 static void
bitset_set_range(BitSetRef bs,int from,int to)345 bitset_set_range(BitSetRef bs, int from, int to)
346 {
347   int i;
348   for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
349     BITSET_SET_BIT(bs, i);
350   }
351 }
352 
353 static void
bitset_invert(BitSetRef bs)354 bitset_invert(BitSetRef bs)
355 {
356   int i;
357   for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { bs[i] = ~(bs[i]); }
358 }
359 
360 static void
bitset_invert_to(BitSetRef from,BitSetRef to)361 bitset_invert_to(BitSetRef from, BitSetRef to)
362 {
363   int i;
364   for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { to[i] = ~(from[i]); }
365 }
366 
367 static void
bitset_and(BitSetRef dest,BitSetRef bs)368 bitset_and(BitSetRef dest, BitSetRef bs)
369 {
370   int i;
371   for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] &= bs[i]; }
372 }
373 
374 static void
bitset_or(BitSetRef dest,BitSetRef bs)375 bitset_or(BitSetRef dest, BitSetRef bs)
376 {
377   int i;
378   for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] |= bs[i]; }
379 }
380 
381 static void
bitset_copy(BitSetRef dest,BitSetRef bs)382 bitset_copy(BitSetRef dest, BitSetRef bs)
383 {
384   int i;
385   for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] = bs[i]; }
386 }
387 
388 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)389 onig_strncmp(const UChar* s1, const UChar* s2, int n)
390 {
391   int x;
392 
393   while (n-- > 0) {
394     x = *s2++ - *s1++;
395     if (x) return x;
396   }
397   return 0;
398 }
399 
400 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)401 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
402 {
403   int len = (int )(end - src);
404   if (len > 0) {
405     xmemcpy(dest, src, len);
406     dest[len] = (UChar )0;
407   }
408 }
409 
/* scan pattern methods */
/*
 * These macros read the enclosing function's variables: `p` (current
 * position), `end` (end of pattern) and `enc` (encoding).  PINC, PFETCH
 * and PUNFETCH also use `pfetch_prev`, which PFETCH_READY declares.
 *
 *   PEND         1 when p has reached end, else 0
 *   PFETCH(c)    decode the character at p into c, remember p in
 *                pfetch_prev, then advance p by one character
 *   PINC         as PFETCH, without decoding
 *   PUNFETCH     move p back to pfetch_prev
 *   PFETCH_S/PINC_S  variants that do not record pfetch_prev
 *   PPEEK        code at p without advancing; PEND_VALUE at end
 *   PPEEK_IS(c)  PPEEK compared with c (c is parenthesized before the
 *                cast so expression arguments compare as a unit)
 */
#define PEND_VALUE   0

#define PFETCH_READY  UChar* pfetch_prev
#define PEND         (p < end ?  0 : 1)
#define PUNFETCH     p = pfetch_prev
#define PINC       do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

#define PINC_S     do { \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
#define PFETCH_S(c) do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )(c))
436 
437 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)438 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
439             int capa)
440 {
441   UChar* r;
442   ptrdiff_t dest_delta = dest_end - dest;
443 
444   if (dest)
445     r = (UChar* )xrealloc(dest, capa + 1);
446   else
447     r = (UChar* )xmalloc(capa + 1);
448 
449   CHECK_NULL_RETURN(r);
450   onig_strcpy(r + dest_delta, src, src_end);
451   return r;
452 }
453 
454 /* dest on static area */
455 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)456 strcat_capa_from_static(UChar* dest, UChar* dest_end,
457                         const UChar* src, const UChar* src_end, int capa)
458 {
459   UChar* r;
460 
461   r = (UChar* )xmalloc(capa + 1);
462   CHECK_NULL_RETURN(r);
463   onig_strcpy(r, dest, dest_end);
464   onig_strcpy(r + (dest_end - dest), src, src_end);
465   return r;
466 }
467 
468 
469 #ifdef USE_ST_LIBRARY
470 
471 typedef struct {
472   UChar* s;
473   UChar* end;
474 } st_str_end_key;
475 
476 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)477 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
478 {
479   UChar *p, *q;
480   int c;
481 
482   if ((x->end - x->s) != (y->end - y->s))
483     return 1;
484 
485   p = x->s;
486   q = y->s;
487   while (p < x->end) {
488     c = (int )*p - (int )*q;
489     if (c != 0) return c;
490 
491     p++; q++;
492   }
493 
494   return 0;
495 }
496 
497 static int
str_end_hash(st_str_end_key * x)498 str_end_hash(st_str_end_key* x)
499 {
500   UChar *p;
501   unsigned val = 0;
502 
503   p = x->s;
504   while (p < x->end) {
505     val = val * 997 + (unsigned )*p++;
506   }
507 
508   return (int) (val + (val >> 5));
509 }
510 
511 extern hash_table_type
onig_st_init_strend_table_with_size(int size)512 onig_st_init_strend_table_with_size(int size)
513 {
514   static struct st_hash_type hashType = {
515     str_end_cmp,
516     str_end_hash,
517   };
518 
519   return (hash_table_type )onig_st_init_table_with_size(&hashType, size);
520 }
521 
522 extern int
onig_st_lookup_strend(hash_table_type table,const UChar * str_key,const UChar * end_key,hash_data_type * value)523 onig_st_lookup_strend(hash_table_type table, const UChar* str_key,
524                       const UChar* end_key, hash_data_type *value)
525 {
526   st_str_end_key key;
527 
528   key.s   = (UChar* )str_key;
529   key.end = (UChar* )end_key;
530 
531   return onig_st_lookup(table, (st_data_t )(&key), value);
532 }
533 
534 extern int
onig_st_insert_strend(hash_table_type table,const UChar * str_key,const UChar * end_key,hash_data_type value)535 onig_st_insert_strend(hash_table_type table, const UChar* str_key,
536                       const UChar* end_key, hash_data_type value)
537 {
538   st_str_end_key* key;
539   int result;
540 
541   key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
542   CHECK_NULL_RETURN_MEMERR(key);
543 
544   key->s   = (UChar* )str_key;
545   key->end = (UChar* )end_key;
546   result = onig_st_insert(table, (st_data_t )key, value);
547   if (result) {
548     xfree(key);
549   }
550   return result;
551 }
552 
553 
554 #ifdef USE_CALLOUT
555 
556 typedef struct {
557   OnigEncoding enc;
558   int    type; /* callout type: single or not */
559   UChar* s;
560   UChar* end;
561 } st_callout_name_key;
562 
563 static int
callout_name_table_cmp(st_callout_name_key * x,st_callout_name_key * y)564 callout_name_table_cmp(st_callout_name_key* x, st_callout_name_key* y)
565 {
566   UChar *p, *q;
567   int c;
568 
569   if (x->enc  != y->enc)  return 1;
570   if (x->type != y->type) return 1;
571   if ((x->end - x->s) != (y->end - y->s))
572     return 1;
573 
574   p = x->s;
575   q = y->s;
576   while (p < x->end) {
577     c = (int )*p - (int )*q;
578     if (c != 0) return c;
579 
580     p++; q++;
581   }
582 
583   return 0;
584 }
585 
586 static int
callout_name_table_hash(st_callout_name_key * x)587 callout_name_table_hash(st_callout_name_key* x)
588 {
589   UChar *p;
590   unsigned int val = 0;
591 
592   p = x->s;
593   while (p < x->end) {
594     val = val * 997 + (unsigned int )*p++;
595   }
596 
597   /* use intptr_t for escape warning in Windows */
598   return (int )(val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type);
599 }
600 
601 extern hash_table_type
onig_st_init_callout_name_table_with_size(int size)602 onig_st_init_callout_name_table_with_size(int size)
603 {
604   static struct st_hash_type hashType = {
605     callout_name_table_cmp,
606     callout_name_table_hash,
607   };
608 
609   return (hash_table_type )onig_st_init_table_with_size(&hashType, size);
610 }
611 
612 extern int
onig_st_lookup_callout_name_table(hash_table_type table,OnigEncoding enc,int type,const UChar * str_key,const UChar * end_key,hash_data_type * value)613 onig_st_lookup_callout_name_table(hash_table_type table,
614                                   OnigEncoding enc,
615                                   int type,
616                                   const UChar* str_key,
617                                   const UChar* end_key,
618                                   hash_data_type *value)
619 {
620   st_callout_name_key key;
621 
622   key.enc  = enc;
623   key.type = type;
624   key.s    = (UChar* )str_key;
625   key.end  = (UChar* )end_key;
626 
627   return onig_st_lookup(table, (st_data_t )(&key), value);
628 }
629 
630 static int
st_insert_callout_name_table(hash_table_type table,OnigEncoding enc,int type,UChar * str_key,UChar * end_key,hash_data_type value)631 st_insert_callout_name_table(hash_table_type table,
632                              OnigEncoding enc, int type,
633                              UChar* str_key, UChar* end_key,
634                              hash_data_type value)
635 {
636   st_callout_name_key* key;
637   int result;
638 
639   key = (st_callout_name_key* )xmalloc(sizeof(st_callout_name_key));
640   CHECK_NULL_RETURN_MEMERR(key);
641 
642   /* key->s: don't duplicate, because str_key is duped in callout_name_entry() */
643   key->enc  = enc;
644   key->type = type;
645   key->s    = str_key;
646   key->end  = end_key;
647   result = onig_st_insert(table, (st_data_t )key, value);
648   if (result) {
649     xfree(key);
650   }
651   return result;
652 }
653 #endif
654 
655 #endif /* USE_ST_LIBRARY */
656 
657 
658 #define INIT_NAME_BACKREFS_ALLOC_NUM   8
659 
660 typedef struct {
661   UChar* name;
662   int    name_len;   /* byte length */
663   int    back_num;   /* number of backrefs */
664   int    back_alloc;
665   int    back_ref1;
666   int*   back_refs;
667 } NameEntry;
668 
669 #ifdef USE_ST_LIBRARY
670 
671 #define INIT_NAMES_ALLOC_NUM    5
672 
673 typedef st_table  NameTable;
674 typedef st_data_t HashDataType;   /* 1.6 st.h doesn't define st_data_t type */
675 
676 #define NAMEBUF_SIZE    24
677 #define NAMEBUF_SIZE_1  25
678 
679 #ifdef ONIG_DEBUG
680 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)681 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
682 {
683   int i;
684   FILE* fp = (FILE* )arg;
685 
686   fprintf(fp, "%s: ", e->name);
687   if (e->back_num == 0)
688     fputs("-", fp);
689   else if (e->back_num == 1)
690     fprintf(fp, "%d", e->back_ref1);
691   else {
692     for (i = 0; i < e->back_num; i++) {
693       if (i > 0) fprintf(fp, ", ");
694       fprintf(fp, "%d", e->back_refs[i]);
695     }
696   }
697   fputs("\n", fp);
698   return ST_CONTINUE;
699 }
700 
701 extern int
onig_print_names(FILE * fp,regex_t * reg)702 onig_print_names(FILE* fp, regex_t* reg)
703 {
704   NameTable* t = (NameTable* )reg->name_table;
705 
706   if (IS_NOT_NULL(t)) {
707     fprintf(fp, "name table\n");
708     onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
709     fputs("\n", fp);
710   }
711   return 0;
712 }
713 #endif /* ONIG_DEBUG */
714 
715 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)716 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
717 {
718   xfree(e->name);
719   if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
720   xfree(key);
721   xfree(e);
722   return ST_DELETE;
723 }
724 
725 static int
names_clear(regex_t * reg)726 names_clear(regex_t* reg)
727 {
728   NameTable* t = (NameTable* )reg->name_table;
729 
730   if (IS_NOT_NULL(t)) {
731     onig_st_foreach(t, i_free_name_entry, 0);
732   }
733   return 0;
734 }
735 
736 extern int
onig_names_free(regex_t * reg)737 onig_names_free(regex_t* reg)
738 {
739   int r;
740   NameTable* t;
741 
742   r = names_clear(reg);
743   if (r != 0) return r;
744 
745   t = (NameTable* )reg->name_table;
746   if (IS_NOT_NULL(t)) onig_st_free_table(t);
747   reg->name_table = (void* )NULL;
748   return 0;
749 }
750 
751 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)752 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
753 {
754   NameEntry* e;
755   NameTable* t = (NameTable* )reg->name_table;
756 
757   e = (NameEntry* )NULL;
758   if (IS_NOT_NULL(t)) {
759     onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
760   }
761   return e;
762 }
763 
764 typedef struct {
765   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
766   regex_t* reg;
767   void* arg;
768   int ret;
769   OnigEncoding enc;
770 } INamesArg;
771 
772 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)773 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
774 {
775   int r = (*(arg->func))(e->name,
776                          e->name + e->name_len,
777                          e->back_num,
778                          (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
779                          arg->reg, arg->arg);
780   if (r != 0) {
781     arg->ret = r;
782     return ST_STOP;
783   }
784   return ST_CONTINUE;
785 }
786 
787 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)788 onig_foreach_name(regex_t* reg,
789   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
790 {
791   INamesArg narg;
792   NameTable* t = (NameTable* )reg->name_table;
793 
794   narg.ret = 0;
795   if (IS_NOT_NULL(t)) {
796     narg.func = func;
797     narg.reg  = reg;
798     narg.arg  = arg;
799     narg.enc  = reg->enc; /* should be pattern encoding. */
800     onig_st_foreach(t, i_names, (HashDataType )&narg);
801   }
802   return narg.ret;
803 }
804 
805 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumMap * map)806 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumMap* map)
807 {
808   int i;
809 
810   if (e->back_num > 1) {
811     for (i = 0; i < e->back_num; i++) {
812       e->back_refs[i] = map[e->back_refs[i]].new_val;
813     }
814   }
815   else if (e->back_num == 1) {
816     e->back_ref1 = map[e->back_ref1].new_val;
817   }
818 
819   return ST_CONTINUE;
820 }
821 
822 extern int
onig_renumber_name_table(regex_t * reg,GroupNumMap * map)823 onig_renumber_name_table(regex_t* reg, GroupNumMap* map)
824 {
825   NameTable* t = (NameTable* )reg->name_table;
826 
827   if (IS_NOT_NULL(t)) {
828     onig_st_foreach(t, i_renumber_name, (HashDataType )map);
829   }
830   return 0;
831 }
832 
833 
834 extern int
onig_number_of_names(regex_t * reg)835 onig_number_of_names(regex_t* reg)
836 {
837   NameTable* t = (NameTable* )reg->name_table;
838 
839   if (IS_NOT_NULL(t))
840     return t->num_entries;
841   else
842     return 0;
843 }
844 
845 #else  /* USE_ST_LIBRARY */
846 
847 #define INIT_NAMES_ALLOC_NUM    8
848 
849 typedef struct {
850   NameEntry* e;
851   int        num;
852   int        alloc;
853 } NameTable;
854 
855 #ifdef ONIG_DEBUG
856 extern int
onig_print_names(FILE * fp,regex_t * reg)857 onig_print_names(FILE* fp, regex_t* reg)
858 {
859   int i, j;
860   NameEntry* e;
861   NameTable* t = (NameTable* )reg->name_table;
862 
863   if (IS_NOT_NULL(t) && t->num > 0) {
864     fprintf(fp, "name table\n");
865     for (i = 0; i < t->num; i++) {
866       e = &(t->e[i]);
867       fprintf(fp, "%s: ", e->name);
868       if (e->back_num == 0) {
869         fputs("-", fp);
870       }
871       else if (e->back_num == 1) {
872         fprintf(fp, "%d", e->back_ref1);
873       }
874       else {
875         for (j = 0; j < e->back_num; j++) {
876           if (j > 0) fprintf(fp, ", ");
877           fprintf(fp, "%d", e->back_refs[j]);
878         }
879       }
880       fputs("\n", fp);
881     }
882     fputs("\n", fp);
883   }
884   return 0;
885 }
886 #endif
887 
888 static int
names_clear(regex_t * reg)889 names_clear(regex_t* reg)
890 {
891   int i;
892   NameEntry* e;
893   NameTable* t = (NameTable* )reg->name_table;
894 
895   if (IS_NOT_NULL(t)) {
896     for (i = 0; i < t->num; i++) {
897       e = &(t->e[i]);
898       if (IS_NOT_NULL(e->name)) {
899         xfree(e->name);
900         e->name       = NULL;
901         e->name_len   = 0;
902         e->back_num   = 0;
903         e->back_alloc = 0;
904         if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
905         e->back_refs = (int* )NULL;
906       }
907     }
908     if (IS_NOT_NULL(t->e)) {
909       xfree(t->e);
910       t->e = NULL;
911     }
912     t->num = 0;
913   }
914   return 0;
915 }
916 
917 extern int
onig_names_free(regex_t * reg)918 onig_names_free(regex_t* reg)
919 {
920   int r;
921   NameTable* t;
922 
923   r = names_clear(reg);
924   if (r != 0) return r;
925 
926   t = (NameTable* )reg->name_table;
927   if (IS_NOT_NULL(t)) xfree(t);
928   reg->name_table = NULL;
929   return 0;
930 }
931 
932 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)933 name_find(regex_t* reg, UChar* name, UChar* name_end)
934 {
935   int i, len;
936   NameEntry* e;
937   NameTable* t = (NameTable* )reg->name_table;
938 
939   if (IS_NOT_NULL(t)) {
940     len = name_end - name;
941     for (i = 0; i < t->num; i++) {
942       e = &(t->e[i]);
943       if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
944         return e;
945     }
946   }
947   return (NameEntry* )NULL;
948 }
949 
950 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)951 onig_foreach_name(regex_t* reg,
952   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
953 {
954   int i, r;
955   NameEntry* e;
956   NameTable* t = (NameTable* )reg->name_table;
957 
958   if (IS_NOT_NULL(t)) {
959     for (i = 0; i < t->num; i++) {
960       e = &(t->e[i]);
961       r = (*func)(e->name, e->name + e->name_len, e->back_num,
962                   (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
963                   reg, arg);
964       if (r != 0) return r;
965     }
966   }
967   return 0;
968 }
969 
970 extern int
onig_number_of_names(regex_t * reg)971 onig_number_of_names(regex_t* reg)
972 {
973   NameTable* t = (NameTable* )reg->name_table;
974 
975   if (IS_NOT_NULL(t))
976     return t->num;
977   else
978     return 0;
979 }
980 
981 #endif /* else USE_ST_LIBRARY */
982 
983 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)984 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
985 {
986   int r;
987   int alloc;
988   NameEntry* e;
989   NameTable* t = (NameTable* )reg->name_table;
990 
991   if (name_end - name <= 0)
992     return ONIGERR_EMPTY_GROUP_NAME;
993 
994   e = name_find(reg, name, name_end);
995   if (IS_NULL(e)) {
996 #ifdef USE_ST_LIBRARY
997     if (IS_NULL(t)) {
998       t = onig_st_init_strend_table_with_size(INIT_NAMES_ALLOC_NUM);
999       CHECK_NULL_RETURN_MEMERR(t);
1000       reg->name_table = (void* )t;
1001     }
1002     e = (NameEntry* )xmalloc(sizeof(NameEntry));
1003     CHECK_NULL_RETURN_MEMERR(e);
1004 
1005     e->name = onigenc_strdup(reg->enc, name, name_end);
1006     if (IS_NULL(e->name)) {
1007       xfree(e);  return ONIGERR_MEMORY;
1008     }
1009     r = onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
1010                               (HashDataType )e);
1011     if (r < 0) return r;
1012 
1013     e->name_len   = (int )(name_end - name);
1014     e->back_num   = 0;
1015     e->back_alloc = 0;
1016     e->back_refs  = (int* )NULL;
1017 
1018 #else
1019 
1020     if (IS_NULL(t)) {
1021       alloc = INIT_NAMES_ALLOC_NUM;
1022       t = (NameTable* )xmalloc(sizeof(NameTable));
1023       CHECK_NULL_RETURN_MEMERR(t);
1024       t->e     = NULL;
1025       t->alloc = 0;
1026       t->num   = 0;
1027 
1028       t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
1029       if (IS_NULL(t->e)) {
1030         xfree(t);
1031         return ONIGERR_MEMORY;
1032       }
1033       t->alloc = alloc;
1034       reg->name_table = t;
1035       goto clear;
1036     }
1037     else if (t->num == t->alloc) {
1038       int i;
1039 
1040       alloc = t->alloc * 2;
1041       t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
1042       CHECK_NULL_RETURN_MEMERR(t->e);
1043       t->alloc = alloc;
1044 
1045     clear:
1046       for (i = t->num; i < t->alloc; i++) {
1047         t->e[i].name       = NULL;
1048         t->e[i].name_len   = 0;
1049         t->e[i].back_num   = 0;
1050         t->e[i].back_alloc = 0;
1051         t->e[i].back_refs  = (int* )NULL;
1052       }
1053     }
1054     e = &(t->e[t->num]);
1055     t->num++;
1056     e->name = onigenc_strdup(reg->enc, name, name_end);
1057     if (IS_NULL(e->name)) return ONIGERR_MEMORY;
1058     e->name_len = name_end - name;
1059 #endif
1060   }
1061 
1062   if (e->back_num >= 1 &&
1063       ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
1064     onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
1065                                    name, name_end);
1066     return ONIGERR_MULTIPLEX_DEFINED_NAME;
1067   }
1068 
1069   e->back_num++;
1070   if (e->back_num == 1) {
1071     e->back_ref1 = backref;
1072   }
1073   else {
1074     if (e->back_num == 2) {
1075       alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
1076       e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
1077       CHECK_NULL_RETURN_MEMERR(e->back_refs);
1078       e->back_alloc = alloc;
1079       e->back_refs[0] = e->back_ref1;
1080       e->back_refs[1] = backref;
1081     }
1082     else {
1083       if (e->back_num > e->back_alloc) {
1084         alloc = e->back_alloc * 2;
1085         e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
1086         CHECK_NULL_RETURN_MEMERR(e->back_refs);
1087         e->back_alloc = alloc;
1088       }
1089       e->back_refs[e->back_num - 1] = backref;
1090     }
1091   }
1092 
1093   return 0;
1094 }
1095 
1096 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)1097 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
1098                            const UChar* name_end, int** nums)
1099 {
1100   NameEntry* e = name_find(reg, name, name_end);
1101 
1102   if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
1103 
1104   switch (e->back_num) {
1105   case 0:
1106     break;
1107   case 1:
1108     *nums = &(e->back_ref1);
1109     break;
1110   default:
1111     *nums = e->back_refs;
1112     break;
1113   }
1114   return e->back_num;
1115 }
1116 
1117 static int
name_to_group_numbers(ScanEnv * env,const UChar * name,const UChar * name_end,int ** nums)1118 name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end,
1119                       int** nums)
1120 {
1121   regex_t* reg;
1122   NameEntry* e;
1123 
1124   reg = env->reg;
1125   e = name_find(reg, name, name_end);
1126 
1127   if (IS_NULL(e)) {
1128     onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
1129                                    (UChar* )name, (UChar* )name_end);
1130     return ONIGERR_UNDEFINED_NAME_REFERENCE;
1131   }
1132 
1133   switch (e->back_num) {
1134   case 0:
1135     break;
1136   case 1:
1137     *nums = &(e->back_ref1);
1138     break;
1139   default:
1140     *nums = e->back_refs;
1141     break;
1142   }
1143   return e->back_num;
1144 }
1145 
1146 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)1147 onig_name_to_backref_number(regex_t* reg, const UChar* name,
1148                             const UChar* name_end, OnigRegion *region)
1149 {
1150   int i, n, *nums;
1151 
1152   n = onig_name_to_group_numbers(reg, name, name_end, &nums);
1153   if (n < 0)
1154     return n;
1155   else if (n == 0)
1156     return ONIGERR_PARSER_BUG;
1157   else if (n == 1)
1158     return nums[0];
1159   else {
1160     if (IS_NOT_NULL(region)) {
1161       for (i = n - 1; i >= 0; i--) {
1162         if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
1163           return nums[i];
1164       }
1165     }
1166     return nums[n - 1];
1167   }
1168 }
1169 
1170 extern int
onig_noname_group_capture_is_active(regex_t * reg)1171 onig_noname_group_capture_is_active(regex_t* reg)
1172 {
1173   if (OPTON_DONT_CAPTURE_GROUP(reg->options))
1174     return 0;
1175 
1176   if (onig_number_of_names(reg) > 0 &&
1177       IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
1178       ! OPTON_CAPTURE_GROUP(reg->options)) {
1179     return 0;
1180   }
1181 
1182   return 1;
1183 }
1184 
1185 #ifdef USE_CALLOUT
1186 
1187 typedef struct {
1188   OnigCalloutType type;
1189   int             in;
1190   OnigCalloutFunc start_func;
1191   OnigCalloutFunc end_func;
1192   int             arg_num;
1193   int             opt_arg_num;
1194   unsigned int    arg_types[ONIG_CALLOUT_MAX_ARGS_NUM];
1195   OnigValue       opt_defaults[ONIG_CALLOUT_MAX_ARGS_NUM];
1196   UChar*          name; /* reference to GlobalCalloutNameTable entry: e->name */
1197 } CalloutNameListEntry;
1198 
1199 typedef struct {
1200   int  n;
1201   int  alloc;
1202   CalloutNameListEntry* v;
1203 } CalloutNameListType;
1204 
1205 static CalloutNameListType* GlobalCalloutNameList;
1206 
1207 static int
make_callout_func_list(CalloutNameListType ** rs,int init_size)1208 make_callout_func_list(CalloutNameListType** rs, int init_size)
1209 {
1210   CalloutNameListType* s;
1211   CalloutNameListEntry* v;
1212 
1213   *rs = 0;
1214 
1215   s = xmalloc(sizeof(*s));
1216   if (IS_NULL(s)) return ONIGERR_MEMORY;
1217 
1218   v = (CalloutNameListEntry* )xmalloc(sizeof(CalloutNameListEntry) * init_size);
1219   if (IS_NULL(v)) {
1220     xfree(s);
1221     return ONIGERR_MEMORY;
1222   }
1223 
1224   s->n = 0;
1225   s->alloc = init_size;
1226   s->v = v;
1227 
1228   *rs = s;
1229   return ONIG_NORMAL;
1230 }
1231 
1232 static void
free_callout_func_list(CalloutNameListType * s)1233 free_callout_func_list(CalloutNameListType* s)
1234 {
1235   if (IS_NOT_NULL(s)) {
1236     if (IS_NOT_NULL(s->v)) {
1237       int i, j;
1238 
1239       for (i = 0; i < s->n; i++) {
1240         CalloutNameListEntry* e = s->v + i;
1241         for (j = e->arg_num - e->opt_arg_num; j < e->arg_num; j++) {
1242           if (e->arg_types[j] == ONIG_TYPE_STRING) {
1243             UChar* p = e->opt_defaults[j].s.start;
1244             if (IS_NOT_NULL(p)) xfree(p);
1245           }
1246         }
1247       }
1248       xfree(s->v);
1249     }
1250     xfree(s);
1251   }
1252 }
1253 
1254 static int
callout_func_list_add(CalloutNameListType * s,int * rid)1255 callout_func_list_add(CalloutNameListType* s, int* rid)
1256 {
1257   if (s->n >= s->alloc) {
1258     int new_size = s->alloc * 2;
1259     CalloutNameListEntry* nv = (CalloutNameListEntry* )
1260       xrealloc(s->v, sizeof(CalloutNameListEntry) * new_size);
1261     if (IS_NULL(nv)) return ONIGERR_MEMORY;
1262 
1263     s->alloc = new_size;
1264     s->v = nv;
1265   }
1266 
1267   *rid = s->n;
1268 
1269   xmemset(&(s->v[s->n]), 0, sizeof(*(s->v)));
1270   s->n++;
1271   return ONIG_NORMAL;
1272 }
1273 
1274 
1275 typedef struct {
1276   UChar* name;
1277   int    name_len;   /* byte length */
1278   int    id;
1279 } CalloutNameEntry;
1280 
1281 #ifdef USE_ST_LIBRARY
1282 typedef st_table  CalloutNameTable;
1283 #else
1284 typedef struct {
1285   CalloutNameEntry* e;
1286   int               num;
1287   int               alloc;
1288 } CalloutNameTable;
1289 #endif
1290 
1291 static CalloutNameTable* GlobalCalloutNameTable;
1292 static int CalloutNameIDCounter;
1293 
1294 #ifdef USE_ST_LIBRARY
1295 
1296 static int
i_free_callout_name_entry(st_callout_name_key * key,CalloutNameEntry * e,void * arg ARG_UNUSED)1297 i_free_callout_name_entry(st_callout_name_key* key, CalloutNameEntry* e,
1298                           void* arg ARG_UNUSED)
1299 {
1300   if (IS_NOT_NULL(e)) {
1301     xfree(e->name);
1302   }
1303   /*xfree(key->s); */ /* is same as e->name */
1304   xfree(key);
1305   xfree(e);
1306   return ST_DELETE;
1307 }
1308 
1309 static int
callout_name_table_clear(CalloutNameTable * t)1310 callout_name_table_clear(CalloutNameTable* t)
1311 {
1312   if (IS_NOT_NULL(t)) {
1313     onig_st_foreach(t, i_free_callout_name_entry, 0);
1314   }
1315   return 0;
1316 }
1317 
1318 static int
global_callout_name_table_free(void)1319 global_callout_name_table_free(void)
1320 {
1321   if (IS_NOT_NULL(GlobalCalloutNameTable)) {
1322     int r = callout_name_table_clear(GlobalCalloutNameTable);
1323     if (r != 0) return r;
1324 
1325     onig_st_free_table(GlobalCalloutNameTable);
1326     GlobalCalloutNameTable = 0;
1327     CalloutNameIDCounter = 0;
1328   }
1329 
1330   return 0;
1331 }
1332 
1333 static CalloutNameEntry*
callout_name_find(OnigEncoding enc,int is_not_single,const UChar * name,const UChar * name_end)1334 callout_name_find(OnigEncoding enc, int is_not_single,
1335                   const UChar* name, const UChar* name_end)
1336 {
1337   int r;
1338   CalloutNameEntry* e;
1339   CalloutNameTable* t = GlobalCalloutNameTable;
1340 
1341   e = (CalloutNameEntry* )NULL;
1342   if (IS_NOT_NULL(t)) {
1343     r = onig_st_lookup_callout_name_table(t, enc, is_not_single, name, name_end,
1344                                           (HashDataType* )((void* )(&e)));
1345     if (r == 0) { /* not found */
1346       if (enc != ONIG_ENCODING_ASCII &&
1347           ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) {
1348         enc = ONIG_ENCODING_ASCII;
1349         onig_st_lookup_callout_name_table(t, enc, is_not_single, name, name_end,
1350                                           (HashDataType* )((void* )(&e)));
1351       }
1352     }
1353   }
1354   return e;
1355 }
1356 
1357 #else
1358 
1359 static int
callout_name_table_clear(CalloutNameTable * t)1360 callout_name_table_clear(CalloutNameTable* t)
1361 {
1362   int i;
1363   CalloutNameEntry* e;
1364 
1365   if (IS_NOT_NULL(t)) {
1366     for (i = 0; i < t->num; i++) {
1367       e = &(t->e[i]);
1368       if (IS_NOT_NULL(e->name)) {
1369         xfree(e->name);
1370         e->name     = NULL;
1371         e->name_len = 0;
1372         e->id       = 0;
1373         e->func     = 0;
1374       }
1375     }
1376     if (IS_NOT_NULL(t->e)) {
1377       xfree(t->e);
1378       t->e = NULL;
1379     }
1380     t->num = 0;
1381   }
1382   return 0;
1383 }
1384 
1385 static int
global_callout_name_table_free(void)1386 global_callout_name_table_free(void)
1387 {
1388   if (IS_NOT_NULL(GlobalCalloutNameTable)) {
1389     int r = callout_name_table_clear(GlobalCalloutNameTable);
1390     if (r != 0) return r;
1391 
1392     xfree(GlobalCalloutNameTable);
1393     GlobalCalloutNameTable = 0;
1394     CalloutNameIDCounter = 0;
1395   }
1396   return 0;
1397 }
1398 
1399 static CalloutNameEntry*
callout_name_find(UChar * name,UChar * name_end)1400 callout_name_find(UChar* name, UChar* name_end)
1401 {
1402   int i, len;
1403   CalloutNameEntry* e;
1404   CalloutNameTable* t = Calloutnames;
1405 
1406   if (IS_NOT_NULL(t)) {
1407     len = name_end - name;
1408     for (i = 0; i < t->num; i++) {
1409       e = &(t->e[i]);
1410       if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
1411         return e;
1412     }
1413   }
1414   return (CalloutNameEntry* )NULL;
1415 }
1416 
1417 #endif
1418 
1419 /* name string must be single byte char string. */
1420 static int
callout_name_entry(CalloutNameEntry ** rentry,OnigEncoding enc,int is_not_single,UChar * name,UChar * name_end)1421 callout_name_entry(CalloutNameEntry** rentry, OnigEncoding enc,
1422                    int is_not_single, UChar* name, UChar* name_end)
1423 {
1424   int r;
1425   CalloutNameEntry* e;
1426   CalloutNameTable* t = GlobalCalloutNameTable;
1427 
1428   *rentry = 0;
1429   if (name_end - name <= 0)
1430     return ONIGERR_INVALID_CALLOUT_NAME;
1431 
1432   e = callout_name_find(enc, is_not_single, name, name_end);
1433   if (IS_NULL(e)) {
1434 #ifdef USE_ST_LIBRARY
1435     if (IS_NULL(t)) {
1436       t = onig_st_init_callout_name_table_with_size(INIT_NAMES_ALLOC_NUM);
1437       CHECK_NULL_RETURN_MEMERR(t);
1438       GlobalCalloutNameTable = t;
1439     }
1440     e = (CalloutNameEntry* )xmalloc(sizeof(CalloutNameEntry));
1441     CHECK_NULL_RETURN_MEMERR(e);
1442 
1443     e->name = onigenc_strdup(enc, name, name_end);
1444     if (IS_NULL(e->name)) {
1445       xfree(e);  return ONIGERR_MEMORY;
1446     }
1447 
1448     r = st_insert_callout_name_table(t, enc, is_not_single,
1449                                      e->name, (e->name + (name_end - name)),
1450                                      (HashDataType )e);
1451     if (r < 0) return r;
1452 
1453 #else
1454 
1455     int alloc;
1456 
1457     if (IS_NULL(t)) {
1458       alloc = INIT_NAMES_ALLOC_NUM;
1459       t = (CalloutNameTable* )xmalloc(sizeof(CalloutNameTable));
1460       CHECK_NULL_RETURN_MEMERR(t);
1461       t->e     = NULL;
1462       t->alloc = 0;
1463       t->num   = 0;
1464 
1465       t->e = (CalloutNameEntry* )xmalloc(sizeof(CalloutNameEntry) * alloc);
1466       if (IS_NULL(t->e)) {
1467         xfree(t);
1468         return ONIGERR_MEMORY;
1469       }
1470       t->alloc = alloc;
1471       GlobalCalloutNameTable = t;
1472       goto clear;
1473     }
1474     else if (t->num == t->alloc) {
1475       int i;
1476 
1477       alloc = t->alloc * 2;
1478       t->e = (CalloutNameEntry* )xrealloc(t->e, sizeof(CalloutNameEntry) * alloc);
1479       CHECK_NULL_RETURN_MEMERR(t->e);
1480       t->alloc = alloc;
1481 
1482     clear:
1483       for (i = t->num; i < t->alloc; i++) {
1484         t->e[i].name       = NULL;
1485         t->e[i].name_len   = 0;
1486         t->e[i].id         = 0;
1487       }
1488     }
1489     e = &(t->e[t->num]);
1490     t->num++;
1491     e->name = onigenc_strdup(enc, name, name_end);
1492     if (IS_NULL(e->name)) return ONIGERR_MEMORY;
1493 #endif
1494 
1495     CalloutNameIDCounter++;
1496     e->id = CalloutNameIDCounter;
1497     e->name_len = (int )(name_end - name);
1498   }
1499 
1500   *rentry = e;
1501   return e->id;
1502 }
1503 
1504 static int
is_allowed_callout_name(OnigEncoding enc,UChar * name,UChar * name_end)1505 is_allowed_callout_name(OnigEncoding enc, UChar* name, UChar* name_end)
1506 {
1507   UChar* p;
1508   OnigCodePoint c;
1509 
1510   if (name >= name_end) return 0;
1511 
1512   p = name;
1513   while (p < name_end) {
1514     c = ONIGENC_MBC_TO_CODE(enc, p, name_end);
1515     if (! IS_ALLOWED_CODE_IN_CALLOUT_NAME(c))
1516       return 0;
1517 
1518     if (p == name) {
1519       if (c >= '0' && c <= '9') return 0;
1520     }
1521 
1522     p += ONIGENC_MBC_ENC_LEN(enc, p);
1523   }
1524 
1525   return 1;
1526 }
1527 
1528 static int
is_allowed_callout_tag_name(OnigEncoding enc,UChar * name,UChar * name_end)1529 is_allowed_callout_tag_name(OnigEncoding enc, UChar* name, UChar* name_end)
1530 {
1531   UChar* p;
1532   OnigCodePoint c;
1533 
1534   if (name >= name_end) return 0;
1535 
1536   p = name;
1537   while (p < name_end) {
1538     c = ONIGENC_MBC_TO_CODE(enc, p, name_end);
1539     if (! IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c))
1540       return 0;
1541 
1542     if (p == name) {
1543       if (c >= '0' && c <= '9') return 0;
1544     }
1545 
1546     p += ONIGENC_MBC_ENC_LEN(enc, p);
1547   }
1548 
1549   return 1;
1550 }
1551 
1552 extern int
onig_set_callout_of_name(OnigEncoding enc,OnigCalloutType callout_type,UChar * name,UChar * name_end,int in,OnigCalloutFunc start_func,OnigCalloutFunc end_func,int arg_num,unsigned int arg_types[],int opt_arg_num,OnigValue opt_defaults[])1553 onig_set_callout_of_name(OnigEncoding enc, OnigCalloutType callout_type,
1554                          UChar* name, UChar* name_end, int in,
1555                          OnigCalloutFunc start_func,
1556                          OnigCalloutFunc end_func,
1557                          int arg_num, unsigned int arg_types[],
1558                          int opt_arg_num, OnigValue opt_defaults[])
1559 {
1560   int r;
1561   int i;
1562   int j;
1563   int id;
1564   int is_not_single;
1565   CalloutNameEntry* e;
1566   CalloutNameListEntry* fe;
1567 
1568   if (callout_type != ONIG_CALLOUT_TYPE_SINGLE)
1569     return ONIGERR_INVALID_ARGUMENT;
1570 
1571   if (arg_num < 0 || arg_num > ONIG_CALLOUT_MAX_ARGS_NUM)
1572     return ONIGERR_INVALID_CALLOUT_ARG;
1573 
1574   if (opt_arg_num < 0 || opt_arg_num > arg_num)
1575     return ONIGERR_INVALID_CALLOUT_ARG;
1576 
1577   if (start_func == 0 && end_func == 0)
1578     return ONIGERR_INVALID_CALLOUT_ARG;
1579 
1580   if ((in & ONIG_CALLOUT_IN_PROGRESS) == 0 && (in & ONIG_CALLOUT_IN_RETRACTION) == 0)
1581     return ONIGERR_INVALID_CALLOUT_ARG;
1582 
1583   for (i = 0; i < arg_num; i++) {
1584     unsigned int t = arg_types[i];
1585     if (t == ONIG_TYPE_VOID)
1586       return ONIGERR_INVALID_CALLOUT_ARG;
1587     else {
1588       if (i >= arg_num - opt_arg_num) {
1589         if (t != ONIG_TYPE_LONG && t != ONIG_TYPE_CHAR && t != ONIG_TYPE_STRING &&
1590             t != ONIG_TYPE_TAG)
1591           return ONIGERR_INVALID_CALLOUT_ARG;
1592       }
1593       else {
1594         if (t != ONIG_TYPE_LONG) {
1595           t = t & ~ONIG_TYPE_LONG;
1596           if (t != ONIG_TYPE_CHAR && t != ONIG_TYPE_STRING && t != ONIG_TYPE_TAG)
1597             return ONIGERR_INVALID_CALLOUT_ARG;
1598         }
1599       }
1600     }
1601   }
1602 
1603   if (! is_allowed_callout_name(enc, name, name_end)) {
1604     return ONIGERR_INVALID_CALLOUT_NAME;
1605   }
1606 
1607   is_not_single = (callout_type != ONIG_CALLOUT_TYPE_SINGLE);
1608   id = callout_name_entry(&e, enc, is_not_single, name, name_end);
1609   if (id < 0) return id;
1610 
1611   r = ONIG_NORMAL;
1612   if (IS_NULL(GlobalCalloutNameList)) {
1613     r = make_callout_func_list(&GlobalCalloutNameList, 10);
1614     if (r != ONIG_NORMAL) return r;
1615   }
1616 
1617   while (id >= GlobalCalloutNameList->n) {
1618     int rid;
1619     r = callout_func_list_add(GlobalCalloutNameList, &rid);
1620     if (r != ONIG_NORMAL) return r;
1621   }
1622 
1623   fe = GlobalCalloutNameList->v + id;
1624   fe->type         = callout_type;
1625   fe->in           = in;
1626   fe->start_func   = start_func;
1627   fe->end_func     = end_func;
1628   fe->arg_num      = arg_num;
1629   fe->opt_arg_num  = opt_arg_num;
1630   fe->name         = e->name;
1631 
1632   for (i = 0; i < arg_num; i++) {
1633     fe->arg_types[i] = arg_types[i];
1634   }
1635   for (i = arg_num - opt_arg_num, j = 0; i < arg_num; i++, j++) {
1636     if (IS_NULL(opt_defaults)) return ONIGERR_INVALID_ARGUMENT;
1637     if (fe->arg_types[i] == ONIG_TYPE_STRING) {
1638       OnigValue* val;
1639       UChar* ds;
1640 
1641       val = opt_defaults + j;
1642       ds = onigenc_strdup(enc, val->s.start, val->s.end);
1643       CHECK_NULL_RETURN_MEMERR(ds);
1644 
1645       fe->opt_defaults[i].s.start = ds;
1646       fe->opt_defaults[i].s.end   = ds + (val->s.end - val->s.start);
1647     }
1648     else {
1649       fe->opt_defaults[i] = opt_defaults[j];
1650     }
1651   }
1652 
1653   r = id;
1654   return r;
1655 }
1656 
1657 static int
get_callout_name_id_by_name(OnigEncoding enc,int is_not_single,UChar * name,UChar * name_end,int * rid)1658 get_callout_name_id_by_name(OnigEncoding enc, int is_not_single,
1659                             UChar* name, UChar* name_end, int* rid)
1660 {
1661   int r;
1662   CalloutNameEntry* e;
1663 
1664   if (! is_allowed_callout_name(enc, name, name_end)) {
1665     return ONIGERR_INVALID_CALLOUT_NAME;
1666   }
1667 
1668   e = callout_name_find(enc, is_not_single, name, name_end);
1669   if (IS_NULL(e)) {
1670     return ONIGERR_UNDEFINED_CALLOUT_NAME;
1671   }
1672 
1673   r = ONIG_NORMAL;
1674   *rid = e->id;
1675 
1676   return r;
1677 }
1678 
1679 extern OnigCalloutFunc
onig_get_callout_start_func(regex_t * reg,int callout_num)1680 onig_get_callout_start_func(regex_t* reg, int callout_num)
1681 {
1682   /* If used for callouts of contents, return 0. */
1683   CalloutListEntry* e;
1684 
1685   e = onig_reg_callout_list_at(reg, callout_num);
1686   CHECK_NULL_RETURN(e);
1687   return e->start_func;
1688 }
1689 
1690 extern const UChar*
onig_get_callout_tag_start(regex_t * reg,int callout_num)1691 onig_get_callout_tag_start(regex_t* reg, int callout_num)
1692 {
1693   CalloutListEntry* e = onig_reg_callout_list_at(reg, callout_num);
1694   CHECK_NULL_RETURN(e);
1695   return e->tag_start;
1696 }
1697 
1698 extern const UChar*
onig_get_callout_tag_end(regex_t * reg,int callout_num)1699 onig_get_callout_tag_end(regex_t* reg, int callout_num)
1700 {
1701   CalloutListEntry* e = onig_reg_callout_list_at(reg, callout_num);
1702   CHECK_NULL_RETURN(e);
1703   return e->tag_end;
1704 }
1705 
1706 
1707 extern OnigCalloutType
onig_get_callout_type_by_name_id(int name_id)1708 onig_get_callout_type_by_name_id(int name_id)
1709 {
1710   if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1711     return 0;
1712 
1713   return GlobalCalloutNameList->v[name_id].type;
1714 }
1715 
1716 extern OnigCalloutFunc
onig_get_callout_start_func_by_name_id(int name_id)1717 onig_get_callout_start_func_by_name_id(int name_id)
1718 {
1719   if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1720     return 0;
1721 
1722   return GlobalCalloutNameList->v[name_id].start_func;
1723 }
1724 
1725 extern OnigCalloutFunc
onig_get_callout_end_func_by_name_id(int name_id)1726 onig_get_callout_end_func_by_name_id(int name_id)
1727 {
1728   if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1729     return 0;
1730 
1731   return GlobalCalloutNameList->v[name_id].end_func;
1732 }
1733 
1734 extern int
onig_get_callout_in_by_name_id(int name_id)1735 onig_get_callout_in_by_name_id(int name_id)
1736 {
1737   if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1738     return 0;
1739 
1740   return GlobalCalloutNameList->v[name_id].in;
1741 }
1742 
1743 static int
get_callout_arg_num_by_name_id(int name_id)1744 get_callout_arg_num_by_name_id(int name_id)
1745 {
1746   return GlobalCalloutNameList->v[name_id].arg_num;
1747 }
1748 
1749 static int
get_callout_opt_arg_num_by_name_id(int name_id)1750 get_callout_opt_arg_num_by_name_id(int name_id)
1751 {
1752   return GlobalCalloutNameList->v[name_id].opt_arg_num;
1753 }
1754 
1755 static unsigned int
get_callout_arg_type_by_name_id(int name_id,int index)1756 get_callout_arg_type_by_name_id(int name_id, int index)
1757 {
1758   return GlobalCalloutNameList->v[name_id].arg_types[index];
1759 }
1760 
1761 static OnigValue
get_callout_opt_default_by_name_id(int name_id,int index)1762 get_callout_opt_default_by_name_id(int name_id, int index)
1763 {
1764   return GlobalCalloutNameList->v[name_id].opt_defaults[index];
1765 }
1766 
1767 extern UChar*
onig_get_callout_name_by_name_id(int name_id)1768 onig_get_callout_name_by_name_id(int name_id)
1769 {
1770   if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1771     return 0;
1772 
1773   return GlobalCalloutNameList->v[name_id].name;
1774 }
1775 
1776 extern int
onig_global_callout_names_free(void)1777 onig_global_callout_names_free(void)
1778 {
1779   free_callout_func_list(GlobalCalloutNameList);
1780   GlobalCalloutNameList = 0;
1781 
1782   global_callout_name_table_free();
1783   return ONIG_NORMAL;
1784 }
1785 
1786 
1787 typedef st_table   CalloutTagTable;
1788 typedef intptr_t   CalloutTagVal;
1789 
1790 #define CALLOUT_TAG_LIST_FLAG_TAG_EXIST     (1<<0)
1791 
1792 static int
i_callout_callout_list_set(UChar * key,CalloutTagVal e,void * arg)1793 i_callout_callout_list_set(UChar* key, CalloutTagVal e, void* arg)
1794 {
1795   int num;
1796   RegexExt* ext = (RegexExt* )arg;
1797 
1798   num = (int )e - 1;
1799   ext->callout_list[num].flag |= CALLOUT_TAG_LIST_FLAG_TAG_EXIST;
1800   return ST_CONTINUE;
1801 }
1802 
1803 static int
setup_ext_callout_list_values(regex_t * reg)1804 setup_ext_callout_list_values(regex_t* reg)
1805 {
1806   int i, j;
1807   RegexExt* ext;
1808 
1809   ext = reg->extp;
1810   if (IS_NOT_NULL(ext->tag_table)) {
1811     onig_st_foreach((CalloutTagTable *)ext->tag_table, i_callout_callout_list_set,
1812                     (st_data_t )ext);
1813   }
1814 
1815   for (i = 0; i < ext->callout_num; i++) {
1816     CalloutListEntry* e = ext->callout_list + i;
1817     if (e->of == ONIG_CALLOUT_OF_NAME) {
1818       for (j = 0; j < e->u.arg.num; j++) {
1819         if (e->u.arg.types[j] == ONIG_TYPE_TAG) {
1820           UChar* start;
1821           UChar* end;
1822           int num;
1823           start = e->u.arg.vals[j].s.start;
1824           end   = e->u.arg.vals[j].s.end;
1825           num = onig_get_callout_num_by_tag(reg, start, end);
1826           if (num < 0) return num;
1827           e->u.arg.vals[j].tag = num;
1828         }
1829       }
1830     }
1831   }
1832 
1833   return ONIG_NORMAL;
1834 }
1835 
1836 extern int
onig_callout_tag_is_exist_at_callout_num(regex_t * reg,int callout_num)1837 onig_callout_tag_is_exist_at_callout_num(regex_t* reg, int callout_num)
1838 {
1839   RegexExt* ext = reg->extp;
1840 
1841   if (IS_NULL(ext) || IS_NULL(ext->callout_list)) return 0;
1842   if (callout_num > ext->callout_num) return 0;
1843 
1844   return (ext->callout_list[callout_num].flag &
1845           CALLOUT_TAG_LIST_FLAG_TAG_EXIST) != 0;
1846 }
1847 
1848 static int
i_free_callout_tag_entry(UChar * key,CalloutTagVal e,void * arg ARG_UNUSED)1849 i_free_callout_tag_entry(UChar* key, CalloutTagVal e, void* arg ARG_UNUSED)
1850 {
1851   xfree(key);
1852   return ST_DELETE;
1853 }
1854 
1855 static int
callout_tag_table_clear(CalloutTagTable * t)1856 callout_tag_table_clear(CalloutTagTable* t)
1857 {
1858   if (IS_NOT_NULL(t)) {
1859     onig_st_foreach(t, i_free_callout_tag_entry, 0);
1860   }
1861   return 0;
1862 }
1863 
1864 extern int
onig_callout_tag_table_free(void * table)1865 onig_callout_tag_table_free(void* table)
1866 {
1867   CalloutTagTable* t = (CalloutTagTable* )table;
1868 
1869   if (IS_NOT_NULL(t)) {
1870     int r = callout_tag_table_clear(t);
1871     if (r != 0) return r;
1872 
1873     onig_st_free_table(t);
1874   }
1875 
1876   return 0;
1877 }
1878 
1879 extern int
onig_get_callout_num_by_tag(regex_t * reg,const UChar * tag,const UChar * tag_end)1880 onig_get_callout_num_by_tag(regex_t* reg,
1881                             const UChar* tag, const UChar* tag_end)
1882 {
1883   int r;
1884   RegexExt* ext;
1885   CalloutTagVal e;
1886 
1887   ext = reg->extp;
1888   if (IS_NULL(ext) || IS_NULL(ext->tag_table))
1889     return ONIGERR_INVALID_CALLOUT_TAG_NAME;
1890 
1891   r = onig_st_lookup_strend(ext->tag_table, tag, tag_end,
1892                             (HashDataType* )((void* )(&e)));
1893   if (r == 0) return ONIGERR_INVALID_CALLOUT_TAG_NAME;
1894   return (int )e;
1895 }
1896 
1897 static CalloutTagVal
callout_tag_find(CalloutTagTable * t,const UChar * name,const UChar * name_end)1898 callout_tag_find(CalloutTagTable* t, const UChar* name, const UChar* name_end)
1899 {
1900   CalloutTagVal e;
1901 
1902   e = -1;
1903   if (IS_NOT_NULL(t)) {
1904     onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
1905   }
1906   return e;
1907 }
1908 
1909 static int
callout_tag_table_new(CalloutTagTable ** rt)1910 callout_tag_table_new(CalloutTagTable** rt)
1911 {
1912   CalloutTagTable* t;
1913 
1914   *rt = 0;
1915   t = onig_st_init_strend_table_with_size(INIT_TAG_NAMES_ALLOC_NUM);
1916   CHECK_NULL_RETURN_MEMERR(t);
1917 
1918   *rt = t;
1919   return ONIG_NORMAL;
1920 }
1921 
1922 static int
callout_tag_entry_raw(ScanEnv * env,CalloutTagTable * t,UChar * name,UChar * name_end,CalloutTagVal entry_val)1923 callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name,
1924                       UChar* name_end, CalloutTagVal entry_val)
1925 {
1926   int r;
1927   CalloutTagVal val;
1928 
1929   if (name_end - name <= 0)
1930     return ONIGERR_INVALID_CALLOUT_TAG_NAME;
1931 
1932   val = callout_tag_find(t, name, name_end);
1933   if (val >= 0) {
1934     onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
1935                                    name, name_end);
1936     return ONIGERR_MULTIPLEX_DEFINED_NAME;
1937   }
1938 
1939   r = onig_st_insert_strend(t, name, name_end, (HashDataType )entry_val);
1940   if (r < 0) return r;
1941 
1942   return ONIG_NORMAL;
1943 }
1944 
1945 static int
ext_ensure_tag_table(regex_t * reg)1946 ext_ensure_tag_table(regex_t* reg)
1947 {
1948   int r;
1949   RegexExt* ext;
1950   CalloutTagTable* t;
1951 
1952   ext = onig_get_regex_ext(reg);
1953   CHECK_NULL_RETURN_MEMERR(ext);
1954 
1955   if (IS_NULL(ext->tag_table)) {
1956     r = callout_tag_table_new(&t);
1957     if (r != ONIG_NORMAL) return r;
1958 
1959     ext->tag_table = t;
1960   }
1961 
1962   return ONIG_NORMAL;
1963 }
1964 
1965 static int
callout_tag_entry(ScanEnv * env,regex_t * reg,UChar * name,UChar * name_end,CalloutTagVal entry_val)1966 callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end,
1967                   CalloutTagVal entry_val)
1968 {
1969   int r;
1970   RegexExt* ext;
1971   CalloutListEntry* e;
1972 
1973   r = ext_ensure_tag_table(reg);
1974   if (r != ONIG_NORMAL) return r;
1975 
1976   ext = onig_get_regex_ext(reg);
1977   CHECK_NULL_RETURN_MEMERR(ext);
1978   r = callout_tag_entry_raw(env, ext->tag_table, name, name_end, entry_val);
1979 
1980   e = onig_reg_callout_list_at(reg, (int )entry_val);
1981   CHECK_NULL_RETURN_MEMERR(e);
1982   e->tag_start = name;
1983   e->tag_end   = name_end;
1984 
1985   return r;
1986 }
1987 
1988 #endif /* USE_CALLOUT */
1989 
1990 
/* Element count of the first heap-allocated MemEnv array that
   scan_env_add_mem_entry() creates. */
#define INIT_SCANENV_MEMENV_ALLOC_SIZE   16
1992 
1993 static void
scan_env_clear(ScanEnv * env)1994 scan_env_clear(ScanEnv* env)
1995 {
1996   MEM_STATUS_CLEAR(env->cap_history);
1997   MEM_STATUS_CLEAR(env->backtrack_mem);
1998   MEM_STATUS_CLEAR(env->backrefed_mem);
1999   env->error      = (UChar* )NULL;
2000   env->error_end  = (UChar* )NULL;
2001   env->num_call   = 0;
2002 
2003 #ifdef USE_CALL
2004   env->unset_addr_list = NULL;
2005   env->has_call_zero   = 0;
2006 #endif
2007 
2008   env->num_mem    = 0;
2009   env->num_named  = 0;
2010   env->mem_alloc  = 0;
2011   env->mem_env_dynamic = (MemEnv* )NULL;
2012 
2013   xmemset(env->mem_env_static, 0, sizeof(env->mem_env_static));
2014 
2015   env->parse_depth      = 0;
2016 #ifdef ONIG_DEBUG_PARSE
2017   env->max_parse_depth  = 0;
2018 #endif
2019   env->backref_num      = 0;
2020   env->keep_num         = 0;
2021   env->id_num           = 0;
2022   env->save_alloc_num   = 0;
2023   env->saves            = 0;
2024 }
2025 
2026 static int
scan_env_add_mem_entry(ScanEnv * env)2027 scan_env_add_mem_entry(ScanEnv* env)
2028 {
2029   int i, need, alloc;
2030   MemEnv* p;
2031 
2032   need = env->num_mem + 1;
2033   if (need > MaxCaptureNum && MaxCaptureNum != 0)
2034     return ONIGERR_TOO_MANY_CAPTURES;
2035 
2036   if (need >= SCANENV_MEMENV_SIZE) {
2037     if (env->mem_alloc <= need) {
2038       if (IS_NULL(env->mem_env_dynamic)) {
2039         alloc = INIT_SCANENV_MEMENV_ALLOC_SIZE;
2040         p = (MemEnv* )xmalloc(sizeof(MemEnv) * alloc);
2041         CHECK_NULL_RETURN_MEMERR(p);
2042         xmemcpy(p, env->mem_env_static, sizeof(env->mem_env_static));
2043       }
2044       else {
2045         alloc = env->mem_alloc * 2;
2046         p = (MemEnv* )xrealloc(env->mem_env_dynamic, sizeof(MemEnv) * alloc);
2047         CHECK_NULL_RETURN_MEMERR(p);
2048       }
2049 
2050       for (i = env->num_mem + 1; i < alloc; i++) {
2051         p[i].mem_node = NULL_NODE;
2052         p[i].empty_repeat_node = NULL_NODE;
2053       }
2054 
2055       env->mem_env_dynamic = p;
2056       env->mem_alloc = alloc;
2057     }
2058   }
2059 
2060   env->num_mem++;
2061   return env->num_mem;
2062 }
2063 
2064 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)2065 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
2066 {
2067   if (env->num_mem >= num)
2068     SCANENV_MEMENV(env)[num].mem_node = node;
2069   else
2070     return ONIGERR_PARSER_BUG;
2071   return 0;
2072 }
2073 
2074 static void
node_free_body(Node * node)2075 node_free_body(Node* node)
2076 {
2077   if (IS_NULL(node)) return ;
2078 
2079   switch (NODE_TYPE(node)) {
2080   case NODE_STRING:
2081     if (STR_(node)->capacity != 0 &&
2082         IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != STR_(node)->buf) {
2083       xfree(STR_(node)->s);
2084     }
2085     break;
2086 
2087   case NODE_LIST:
2088   case NODE_ALT:
2089     onig_node_free(NODE_CAR(node));
2090     node = NODE_CDR(node);
2091     while (IS_NOT_NULL(node)) {
2092       Node* next = NODE_CDR(node);
2093       onig_node_free(NODE_CAR(node));
2094       xfree(node);
2095       node = next;
2096     }
2097     break;
2098 
2099   case NODE_CCLASS:
2100     {
2101       CClassNode* cc = CCLASS_(node);
2102 
2103       if (cc->mbuf)
2104         bbuf_free(cc->mbuf);
2105     }
2106     break;
2107 
2108   case NODE_BACKREF:
2109     if (IS_NOT_NULL(BACKREF_(node)->back_dynamic))
2110       xfree(BACKREF_(node)->back_dynamic);
2111     break;
2112 
2113   case NODE_BAG:
2114     if (NODE_BODY(node))
2115       onig_node_free(NODE_BODY(node));
2116 
2117     {
2118       BagNode* en = BAG_(node);
2119       if (en->type == BAG_IF_ELSE) {
2120         onig_node_free(en->te.Then);
2121         onig_node_free(en->te.Else);
2122       }
2123     }
2124     break;
2125 
2126   case NODE_QUANT:
2127     if (NODE_BODY(node))
2128       onig_node_free(NODE_BODY(node));
2129     break;
2130 
2131   case NODE_ANCHOR:
2132     if (NODE_BODY(node))
2133       onig_node_free(NODE_BODY(node));
2134     if (IS_NOT_NULL(ANCHOR_(node)->lead_node))
2135       onig_node_free(ANCHOR_(node)->lead_node);
2136     break;
2137 
2138   case NODE_CTYPE:
2139   case NODE_CALL:
2140   case NODE_GIMMICK:
2141     break;
2142   }
2143 }
2144 
2145 extern void
onig_node_free(Node * node)2146 onig_node_free(Node* node)
2147 {
2148   if (IS_NULL(node)) return ;
2149 
2150 #ifdef DEBUG_NODE_FREE
2151   fprintf(stderr, "onig_node_free: %p\n", node);
2152 #endif
2153 
2154   node_free_body(node);
2155   xfree(node);
2156 }
2157 
2158 static void
cons_node_free_alone(Node * node)2159 cons_node_free_alone(Node* node)
2160 {
2161   NODE_CAR(node) = 0;
2162   NODE_CDR(node) = 0;
2163   onig_node_free(node);
2164 }
2165 
2166 static Node*
node_new(void)2167 node_new(void)
2168 {
2169   Node* node;
2170 
2171   node = (Node* )xmalloc(sizeof(Node));
2172   CHECK_NULL_RETURN(node);
2173   xmemset(node, 0, sizeof(*node));
2174 
2175 #ifdef DEBUG_NODE_FREE
2176   fprintf(stderr, "node_new: %p\n", node);
2177 #endif
2178   return node;
2179 }
2180 
2181 extern int
onig_node_copy(Node ** rcopy,Node * from)2182 onig_node_copy(Node** rcopy, Node* from)
2183 {
2184   int r;
2185   Node* copy;
2186 
2187   *rcopy = NULL_NODE;
2188 
2189   switch (NODE_TYPE(from)) {
2190   case NODE_LIST:
2191   case NODE_ALT:
2192   case NODE_ANCHOR:
2193     /* These node's link to other nodes are processed by caller. */
2194     break;
2195   case NODE_STRING:
2196   case NODE_CCLASS:
2197   case NODE_CTYPE:
2198     /* Fixed contents after copy. */
2199     break;
2200   default:
2201     /* Not supported yet. */
2202     return ONIGERR_TYPE_BUG;
2203     break;
2204   }
2205 
2206   copy = node_new();
2207   CHECK_NULL_RETURN_MEMERR(copy);
2208   xmemcpy(copy, from, sizeof(*copy));
2209 
2210   switch (NODE_TYPE(copy)) {
2211   case NODE_STRING:
2212     r = onig_node_str_set(copy, STR_(from)->s, STR_(from)->end, FALSE);
2213     if (r != 0) {
2214     err:
2215       onig_node_free(copy);
2216       return r;
2217     }
2218     break;
2219 
2220   case NODE_CCLASS:
2221     {
2222       CClassNode *fcc, *tcc;
2223 
2224       fcc = CCLASS_(from);
2225       tcc = CCLASS_(copy);
2226       if (IS_NOT_NULL(fcc->mbuf)) {
2227         r = bbuf_clone(&(tcc->mbuf), fcc->mbuf);
2228         if (r != 0) goto err;
2229       }
2230     }
2231     break;
2232 
2233   default:
2234     break;
2235   }
2236 
2237   *rcopy = copy;
2238   return ONIG_NORMAL;
2239 }
2240 
2241 
2242 static void
initialize_cclass(CClassNode * cc)2243 initialize_cclass(CClassNode* cc)
2244 {
2245   BITSET_CLEAR(cc->bs);
2246   cc->flags = 0;
2247   cc->mbuf  = NULL;
2248 }
2249 
2250 static Node*
node_new_cclass(void)2251 node_new_cclass(void)
2252 {
2253   Node* node = node_new();
2254   CHECK_NULL_RETURN(node);
2255 
2256   NODE_SET_TYPE(node, NODE_CCLASS);
2257   initialize_cclass(CCLASS_(node));
2258   return node;
2259 }
2260 
2261 static Node*
node_new_ctype(int type,int not,OnigOptionType options)2262 node_new_ctype(int type, int not, OnigOptionType options)
2263 {
2264   Node* node = node_new();
2265   CHECK_NULL_RETURN(node);
2266 
2267   NODE_SET_TYPE(node, NODE_CTYPE);
2268   CTYPE_(node)->ctype   = type;
2269   CTYPE_(node)->not     = not;
2270   CTYPE_(node)->ascii_mode = OPTON_IS_ASCII_MODE_CTYPE(type, options);
2271   return node;
2272 }
2273 
2274 static Node*
node_new_anychar(OnigOptionType options)2275 node_new_anychar(OnigOptionType options)
2276 {
2277   Node* node;
2278 
2279   node = node_new_ctype(CTYPE_ANYCHAR, FALSE, options);
2280   CHECK_NULL_RETURN(node);
2281 
2282   if (OPTON_MULTILINE(options))
2283     NODE_STATUS_ADD(node, MULTILINE);
2284   return node;
2285 }
2286 
2287 static int
node_new_no_newline(Node ** node,ScanEnv * env)2288 node_new_no_newline(Node** node, ScanEnv* env)
2289 {
2290   Node* n;
2291 
2292   n = node_new_anychar(ONIG_OPTION_NONE);
2293   CHECK_NULL_RETURN_MEMERR(n);
2294   *node = n;
2295   return 0;
2296 }
2297 
2298 static int
node_new_true_anychar(Node ** node)2299 node_new_true_anychar(Node** node)
2300 {
2301   Node* n;
2302 
2303   n = node_new_anychar(ONIG_OPTION_MULTILINE);
2304   CHECK_NULL_RETURN_MEMERR(n);
2305   *node = n;
2306   return 0;
2307 }
2308 
2309 static Node*
node_new_list(Node * left,Node * right)2310 node_new_list(Node* left, Node* right)
2311 {
2312   Node* node = node_new();
2313   CHECK_NULL_RETURN(node);
2314 
2315   NODE_SET_TYPE(node, NODE_LIST);
2316   NODE_CAR(node)  = left;
2317   NODE_CDR(node) = right;
2318   return node;
2319 }
2320 
2321 extern Node*
onig_node_new_list(Node * left,Node * right)2322 onig_node_new_list(Node* left, Node* right)
2323 {
2324   return node_new_list(left, right);
2325 }
2326 
2327 extern Node*
onig_node_new_alt(Node * left,Node * right)2328 onig_node_new_alt(Node* left, Node* right)
2329 {
2330   Node* node = node_new();
2331   CHECK_NULL_RETURN(node);
2332 
2333   NODE_SET_TYPE(node, NODE_ALT);
2334   NODE_CAR(node)  = left;
2335   NODE_CDR(node) = right;
2336   return node;
2337 }
2338 
2339 static Node*
make_list_or_alt(NodeType type,int n,Node * ns[])2340 make_list_or_alt(NodeType type, int n, Node* ns[])
2341 {
2342   Node* r;
2343 
2344   if (n <= 0) return NULL_NODE;
2345 
2346   if (n == 1) {
2347     r = node_new();
2348     CHECK_NULL_RETURN(r);
2349     NODE_SET_TYPE(r, type);
2350     NODE_CAR(r) = ns[0];
2351     NODE_CDR(r) = NULL_NODE;
2352   }
2353   else {
2354     Node* right;
2355 
2356     r = node_new();
2357     CHECK_NULL_RETURN(r);
2358 
2359     right = make_list_or_alt(type, n - 1, ns + 1);
2360     if (IS_NULL(right)) {
2361       onig_node_free(r);
2362       return NULL_NODE;
2363     }
2364 
2365     NODE_SET_TYPE(r, type);
2366     NODE_CAR(r) = ns[0];
2367     NODE_CDR(r) = right;
2368   }
2369 
2370   return r;
2371 }
2372 
2373 static Node*
make_list(int n,Node * ns[])2374 make_list(int n, Node* ns[])
2375 {
2376   return make_list_or_alt(NODE_LIST, n, ns);
2377 }
2378 
2379 static Node*
make_alt(int n,Node * ns[])2380 make_alt(int n, Node* ns[])
2381 {
2382   return make_list_or_alt(NODE_ALT, n, ns);
2383 }
2384 
2385 static Node*
node_new_anchor(int type)2386 node_new_anchor(int type)
2387 {
2388   Node* node;
2389 
2390   node = node_new();
2391   CHECK_NULL_RETURN(node);
2392 
2393   NODE_SET_TYPE(node, NODE_ANCHOR);
2394   ANCHOR_(node)->type       = type;
2395   ANCHOR_(node)->char_min_len = 0;
2396   ANCHOR_(node)->char_max_len = INFINITE_LEN;
2397   ANCHOR_(node)->ascii_mode = 0;
2398   ANCHOR_(node)->lead_node  = NULL_NODE;
2399   return node;
2400 }
2401 
2402 static Node*
node_new_anchor_with_options(int type,OnigOptionType options)2403 node_new_anchor_with_options(int type, OnigOptionType options)
2404 {
2405   int ascii_mode;
2406   Node* node;
2407 
2408   node = node_new_anchor(type);
2409   CHECK_NULL_RETURN(node);
2410 
2411   ascii_mode = OPTON_WORD_ASCII(options) && IS_WORD_ANCHOR_TYPE(type) ? 1 : 0;
2412   ANCHOR_(node)->ascii_mode = ascii_mode;
2413 
2414   if (type == ANCR_TEXT_SEGMENT_BOUNDARY ||
2415       type == ANCR_NO_TEXT_SEGMENT_BOUNDARY) {
2416     if (OPTON_TEXT_SEGMENT_WORD(options))
2417       NODE_STATUS_ADD(node, TEXT_SEGMENT_WORD);
2418   }
2419 
2420   return node;
2421 }
2422 
2423 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)2424 node_new_backref(int back_num, int* backrefs, int by_name,
2425 #ifdef USE_BACKREF_WITH_LEVEL
2426                  int exist_level, int nest_level,
2427 #endif
2428                  ScanEnv* env)
2429 {
2430   int i;
2431   Node* node;
2432 
2433   node = node_new();
2434   CHECK_NULL_RETURN(node);
2435 
2436   NODE_SET_TYPE(node, NODE_BACKREF);
2437   BACKREF_(node)->back_num = back_num;
2438   BACKREF_(node)->back_dynamic = (int* )NULL;
2439   if (by_name != 0)
2440     NODE_STATUS_ADD(node, BY_NAME);
2441 
2442   if (OPTON_IGNORECASE(env->options))
2443     NODE_STATUS_ADD(node, IGNORECASE);
2444 
2445 #ifdef USE_BACKREF_WITH_LEVEL
2446   if (exist_level != 0) {
2447     NODE_STATUS_ADD(node, NEST_LEVEL);
2448     BACKREF_(node)->nest_level  = nest_level;
2449   }
2450 #endif
2451 
2452   for (i = 0; i < back_num; i++) {
2453     if (backrefs[i] <= env->num_mem &&
2454         IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) {
2455       NODE_STATUS_ADD(node, RECURSION);   /* /...(\1).../ */
2456       break;
2457     }
2458   }
2459 
2460   if (back_num <= NODE_BACKREFS_SIZE) {
2461     for (i = 0; i < back_num; i++)
2462       BACKREF_(node)->back_static[i] = backrefs[i];
2463   }
2464   else {
2465     int* p = (int* )xmalloc(sizeof(int) * back_num);
2466     if (IS_NULL(p)) {
2467       onig_node_free(node);
2468       return NULL;
2469     }
2470     BACKREF_(node)->back_dynamic = p;
2471     for (i = 0; i < back_num; i++)
2472       p[i] = backrefs[i];
2473   }
2474 
2475   env->backref_num++;
2476   return node;
2477 }
2478 
2479 static Node*
node_new_backref_checker(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)2480 node_new_backref_checker(int back_num, int* backrefs, int by_name,
2481 #ifdef USE_BACKREF_WITH_LEVEL
2482                          int exist_level, int nest_level,
2483 #endif
2484                          ScanEnv* env)
2485 {
2486   Node* node;
2487 
2488   node = node_new_backref(back_num, backrefs, by_name,
2489 #ifdef USE_BACKREF_WITH_LEVEL
2490                           exist_level, nest_level,
2491 #endif
2492                           env);
2493   CHECK_NULL_RETURN(node);
2494 
2495   NODE_STATUS_ADD(node, CHECKER);
2496   return node;
2497 }
2498 
2499 #ifdef USE_CALL
2500 static Node*
node_new_call(UChar * name,UChar * name_end,int gnum,int by_number)2501 node_new_call(UChar* name, UChar* name_end, int gnum, int by_number)
2502 {
2503   Node* node = node_new();
2504   CHECK_NULL_RETURN(node);
2505 
2506   NODE_SET_TYPE(node, NODE_CALL);
2507   CALL_(node)->by_number   = by_number;
2508   CALL_(node)->name        = name;
2509   CALL_(node)->name_end    = name_end;
2510   CALL_(node)->called_gnum = gnum;
2511   CALL_(node)->entry_count = 1;
2512   return node;
2513 }
2514 #endif
2515 
2516 static Node*
node_new_quantifier(int lower,int upper,int by_number)2517 node_new_quantifier(int lower, int upper, int by_number)
2518 {
2519   Node* node = node_new();
2520   CHECK_NULL_RETURN(node);
2521 
2522   NODE_SET_TYPE(node, NODE_QUANT);
2523   QUANT_(node)->lower            = lower;
2524   QUANT_(node)->upper            = upper;
2525   QUANT_(node)->greedy           = 1;
2526   QUANT_(node)->emptiness        = BODY_IS_NOT_EMPTY;
2527   QUANT_(node)->head_exact       = NULL_NODE;
2528   QUANT_(node)->next_head_exact  = NULL_NODE;
2529   QUANT_(node)->include_referred = 0;
2530   if (by_number != 0)
2531     NODE_STATUS_ADD(node, BY_NUMBER);
2532 
2533   return node;
2534 }
2535 
2536 static Node*
node_new_bag(enum BagType type)2537 node_new_bag(enum BagType type)
2538 {
2539   Node* node = node_new();
2540   CHECK_NULL_RETURN(node);
2541 
2542   NODE_SET_TYPE(node, NODE_BAG);
2543   BAG_(node)->type = type;
2544 
2545   switch (type) {
2546   case BAG_MEMORY:
2547     BAG_(node)->m.regnum       =  0;
2548     BAG_(node)->m.called_addr  = -1;
2549     BAG_(node)->m.entry_count  =  1;
2550     BAG_(node)->m.called_state =  0;
2551     break;
2552 
2553   case BAG_OPTION:
2554     BAG_(node)->o.options =  0;
2555     break;
2556 
2557   case BAG_STOP_BACKTRACK:
2558     break;
2559 
2560   case BAG_IF_ELSE:
2561     BAG_(node)->te.Then = 0;
2562     BAG_(node)->te.Else = 0;
2563     break;
2564   }
2565 
2566   BAG_(node)->opt_count = 0;
2567   return node;
2568 }
2569 
2570 extern Node*
onig_node_new_bag(enum BagType type)2571 onig_node_new_bag(enum BagType type)
2572 {
2573   return node_new_bag(type);
2574 }
2575 
2576 static Node*
node_new_bag_if_else(Node * cond,Node * Then,Node * Else)2577 node_new_bag_if_else(Node* cond, Node* Then, Node* Else)
2578 {
2579   Node* n;
2580   n = node_new_bag(BAG_IF_ELSE);
2581   CHECK_NULL_RETURN(n);
2582 
2583   NODE_BODY(n) = cond;
2584   BAG_(n)->te.Then = Then;
2585   BAG_(n)->te.Else = Else;
2586   return n;
2587 }
2588 
2589 static Node*
node_new_memory(int is_named)2590 node_new_memory(int is_named)
2591 {
2592   Node* node = node_new_bag(BAG_MEMORY);
2593   CHECK_NULL_RETURN(node);
2594   if (is_named != 0)
2595     NODE_STATUS_ADD(node, NAMED_GROUP);
2596 
2597   return node;
2598 }
2599 
2600 static Node*
node_new_option(OnigOptionType option)2601 node_new_option(OnigOptionType option)
2602 {
2603   Node* node = node_new_bag(BAG_OPTION);
2604   CHECK_NULL_RETURN(node);
2605   BAG_(node)->o.options = option;
2606   return node;
2607 }
2608 
2609 static Node*
node_new_group(Node * content)2610 node_new_group(Node* content)
2611 {
2612   Node* node;
2613 
2614   node = node_new();
2615   CHECK_NULL_RETURN(node);
2616   NODE_SET_TYPE(node, NODE_LIST);
2617   NODE_CAR(node) = content;
2618   NODE_CDR(node) = NULL_NODE;
2619 
2620   return node;
2621 }
2622 
2623 static Node*
node_drop_group(Node * group)2624 node_drop_group(Node* group)
2625 {
2626   Node* content;
2627 
2628   content = NODE_CAR(group);
2629   NODE_CAR(group) = NULL_NODE;
2630   onig_node_free(group);
2631   return content;
2632 }
2633 
2634 static int
node_set_fail(Node * node)2635 node_set_fail(Node* node)
2636 {
2637   NODE_SET_TYPE(node, NODE_GIMMICK);
2638   GIMMICK_(node)->type = GIMMICK_FAIL;
2639   return ONIG_NORMAL;
2640 }
2641 
2642 static int
node_new_fail(Node ** node,ScanEnv * env)2643 node_new_fail(Node** node, ScanEnv* env)
2644 {
2645   *node = node_new();
2646   CHECK_NULL_RETURN_MEMERR(*node);
2647 
2648   return node_set_fail(*node);
2649 }
2650 
2651 extern int
onig_node_reset_fail(Node * node)2652 onig_node_reset_fail(Node* node)
2653 {
2654   node_free_body(node);
2655   return node_set_fail(node);
2656 }
2657 
2658 static int
node_new_save_gimmick(Node ** node,enum SaveType save_type,ScanEnv * env)2659 node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env)
2660 {
2661   int id;
2662 
2663   ID_ENTRY(env, id);
2664 
2665   *node = node_new();
2666   CHECK_NULL_RETURN_MEMERR(*node);
2667 
2668   NODE_SET_TYPE(*node, NODE_GIMMICK);
2669   GIMMICK_(*node)->id   = id;
2670   GIMMICK_(*node)->type = GIMMICK_SAVE;
2671   GIMMICK_(*node)->detail_type = (int )save_type;
2672 
2673   return ONIG_NORMAL;
2674 }
2675 
2676 static int
node_new_update_var_gimmick(Node ** node,enum UpdateVarType update_var_type,int id,ScanEnv * env)2677 node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type,
2678                             int id, ScanEnv* env)
2679 {
2680   *node = node_new();
2681   CHECK_NULL_RETURN_MEMERR(*node);
2682 
2683   NODE_SET_TYPE(*node, NODE_GIMMICK);
2684   GIMMICK_(*node)->id   = id;
2685   GIMMICK_(*node)->type = GIMMICK_UPDATE_VAR;
2686   GIMMICK_(*node)->detail_type = (int )update_var_type;
2687 
2688   return ONIG_NORMAL;
2689 }
2690 
2691 static int
node_new_keep(Node ** node,ScanEnv * env)2692 node_new_keep(Node** node, ScanEnv* env)
2693 {
2694   int r;
2695 
2696   r = node_new_save_gimmick(node, SAVE_KEEP, env);
2697   if (r != 0) return r;
2698 
2699   env->keep_num++;
2700   return ONIG_NORMAL;
2701 }
2702 
2703 #ifdef USE_CALLOUT
2704 
2705 extern void
onig_free_reg_callout_list(int n,CalloutListEntry * list)2706 onig_free_reg_callout_list(int n, CalloutListEntry* list)
2707 {
2708   int i;
2709   int j;
2710 
2711   if (IS_NULL(list)) return ;
2712 
2713   for (i = 0; i < n; i++) {
2714     if (list[i].of == ONIG_CALLOUT_OF_NAME) {
2715       for (j = 0; j < list[i].u.arg.passed_num; j++) {
2716         if (list[i].u.arg.types[j] == ONIG_TYPE_STRING) {
2717           if (IS_NOT_NULL(list[i].u.arg.vals[j].s.start))
2718             xfree(list[i].u.arg.vals[j].s.start);
2719         }
2720       }
2721     }
2722     else { /* ONIG_CALLOUT_OF_CONTENTS */
2723       if (IS_NOT_NULL(list[i].u.content.start)) {
2724         xfree((void* )list[i].u.content.start);
2725       }
2726     }
2727   }
2728 
2729   xfree(list);
2730 }
2731 
2732 extern CalloutListEntry*
onig_reg_callout_list_at(regex_t * reg,int num)2733 onig_reg_callout_list_at(regex_t* reg, int num)
2734 {
2735   RegexExt* ext = reg->extp;
2736   CHECK_NULL_RETURN(ext);
2737 
2738   if (num <= 0 || num > ext->callout_num)
2739     return 0;
2740 
2741   num--;
2742   return ext->callout_list + num;
2743 }
2744 
2745 static int
reg_callout_list_entry(ScanEnv * env,int * rnum)2746 reg_callout_list_entry(ScanEnv* env, int* rnum)
2747 {
2748 #define INIT_CALLOUT_LIST_NUM  3
2749 
2750   int num;
2751   CalloutListEntry* list;
2752   CalloutListEntry* e;
2753   RegexExt* ext;
2754 
2755   ext = onig_get_regex_ext(env->reg);
2756   CHECK_NULL_RETURN_MEMERR(ext);
2757 
2758   if (IS_NULL(ext->callout_list)) {
2759     list = (CalloutListEntry* )xmalloc(sizeof(*list) * INIT_CALLOUT_LIST_NUM);
2760     CHECK_NULL_RETURN_MEMERR(list);
2761 
2762     ext->callout_list = list;
2763     ext->callout_list_alloc = INIT_CALLOUT_LIST_NUM;
2764     ext->callout_num = 0;
2765   }
2766 
2767   num = ext->callout_num + 1;
2768   if (num > ext->callout_list_alloc) {
2769     int alloc = ext->callout_list_alloc * 2;
2770     list = (CalloutListEntry* )xrealloc(ext->callout_list,
2771                                         sizeof(CalloutListEntry) * alloc);
2772     CHECK_NULL_RETURN_MEMERR(list);
2773 
2774     ext->callout_list       = list;
2775     ext->callout_list_alloc = alloc;
2776   }
2777 
2778   e = ext->callout_list + (num - 1);
2779 
2780   e->flag             = 0;
2781   e->of               = 0;
2782   e->in               = ONIG_CALLOUT_OF_CONTENTS;
2783   e->type             = 0;
2784   e->tag_start        = 0;
2785   e->tag_end          = 0;
2786   e->start_func       = 0;
2787   e->end_func         = 0;
2788   e->u.arg.num        = 0;
2789   e->u.arg.passed_num = 0;
2790 
2791   ext->callout_num = num;
2792   *rnum = num;
2793   return ONIG_NORMAL;
2794 }
2795 
2796 static int
node_new_callout(Node ** node,OnigCalloutOf callout_of,int num,int id,ScanEnv * env)2797 node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id,
2798                  ScanEnv* env)
2799 {
2800   *node = node_new();
2801   CHECK_NULL_RETURN_MEMERR(*node);
2802 
2803   NODE_SET_TYPE(*node, NODE_GIMMICK);
2804   GIMMICK_(*node)->id          = id;
2805   GIMMICK_(*node)->num         = num;
2806   GIMMICK_(*node)->type        = GIMMICK_CALLOUT;
2807   GIMMICK_(*node)->detail_type = (int )callout_of;
2808 
2809   return ONIG_NORMAL;
2810 }
2811 #endif
2812 
2813 static int
make_text_segment(Node ** node,ScanEnv * env)2814 make_text_segment(Node** node, ScanEnv* env)
2815 {
2816   int r;
2817   int i;
2818   Node* x;
2819   Node* ns[2];
2820 
2821   /* \X == (?>\O(?:\Y\O)*) */
2822 
2823   ns[1] = NULL_NODE;
2824 
2825   r = ONIGERR_MEMORY;
2826   ns[0] = node_new_anchor_with_options(ANCR_NO_TEXT_SEGMENT_BOUNDARY, env->options);
2827   if (IS_NULL(ns[0])) goto err;
2828 
2829   r = node_new_true_anychar(&ns[1]);
2830   if (r != 0) goto err1;
2831 
2832   x = make_list(2, ns);
2833   if (IS_NULL(x)) goto err;
2834   ns[0] = x;
2835   ns[1] = NULL_NODE;
2836 
2837   x = node_new_quantifier(0, INFINITE_REPEAT, TRUE);
2838   if (IS_NULL(x)) goto err;
2839 
2840   NODE_BODY(x) = ns[0];
2841   ns[0] = NULL_NODE;
2842   ns[1] = x;
2843 
2844   r = node_new_true_anychar(&ns[0]);
2845   if (r != 0) goto err1;
2846 
2847   x = make_list(2, ns);
2848   if (IS_NULL(x)) goto err;
2849 
2850   ns[0] = x;
2851   ns[1] = NULL_NODE;
2852 
2853   x = node_new_bag(BAG_STOP_BACKTRACK);
2854   if (IS_NULL(x)) goto err;
2855 
2856   NODE_BODY(x) = ns[0];
2857 
2858   *node = x;
2859   return ONIG_NORMAL;
2860 
2861  err:
2862   r = ONIGERR_MEMORY;
2863  err1:
2864   for (i = 0; i < 2; i++) onig_node_free(ns[i]);
2865   return r;
2866 }
2867 
2868 static int
make_absent_engine(Node ** node,int pre_save_right_id,Node * absent,Node * step_one,int lower,int upper,int possessive,int is_range_cutter,ScanEnv * env)2869 make_absent_engine(Node** node, int pre_save_right_id, Node* absent,
2870                    Node* step_one, int lower, int upper, int possessive,
2871                    int is_range_cutter, ScanEnv* env)
2872 {
2873   int r;
2874   int i;
2875   int id;
2876   Node* x;
2877   Node* ns[4];
2878 
2879   for (i = 0; i < 4; i++) ns[i] = NULL_NODE;
2880 
2881   ns[1] = absent;
2882   ns[3] = step_one; /* for err */
2883   r = node_new_save_gimmick(&ns[0], SAVE_S, env);
2884   if (r != 0) goto err;
2885 
2886   id = GIMMICK_(ns[0])->id;
2887   r = node_new_update_var_gimmick(&ns[2], UPDATE_VAR_RIGHT_RANGE_FROM_S_STACK,
2888                                   id, env);
2889   if (r != 0) goto err;
2890 
2891   if (is_range_cutter != 0)
2892     NODE_STATUS_ADD(ns[2], ABSENT_WITH_SIDE_EFFECTS);
2893 
2894   r = node_new_fail(&ns[3], env);
2895   if (r != 0) goto err;
2896 
2897   x = make_list(4, ns);
2898   if (IS_NULL(x)) goto err0;
2899 
2900   ns[0] = x;
2901   ns[1] = step_one;
2902   ns[2] = ns[3] = NULL_NODE;
2903 
2904   x = make_alt(2, ns);
2905   if (IS_NULL(x)) goto err0;
2906 
2907   ns[0] = x;
2908 
2909   x = node_new_quantifier(lower, upper, FALSE);
2910   if (IS_NULL(x)) goto err0;
2911 
2912   NODE_BODY(x) = ns[0];
2913   ns[0] = x;
2914 
2915   if (possessive != 0) {
2916     x = node_new_bag(BAG_STOP_BACKTRACK);
2917     if (IS_NULL(x)) goto err0;
2918 
2919     NODE_BODY(x) = ns[0];
2920     ns[0] = x;
2921   }
2922 
2923   r = node_new_update_var_gimmick(&ns[1], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2924                                   pre_save_right_id, env);
2925   if (r != 0) goto err;
2926 
2927   r = node_new_fail(&ns[2], env);
2928   if (r != 0) goto err;
2929 
2930   x = make_list(2, ns + 1);
2931   if (IS_NULL(x)) goto err0;
2932 
2933   ns[1] = x; ns[2] = NULL_NODE;
2934 
2935   x = make_alt(2, ns);
2936   if (IS_NULL(x)) goto err0;
2937 
2938   if (is_range_cutter != FALSE)
2939     NODE_STATUS_ADD(x, SUPER);
2940 
2941   *node = x;
2942   return ONIG_NORMAL;
2943 
2944  err0:
2945   r = ONIGERR_MEMORY;
2946  err:
2947   for (i = 0; i < 4; i++) onig_node_free(ns[i]);
2948   return r;
2949 }
2950 
2951 static int
make_absent_tail(Node ** node1,Node ** node2,int pre_save_right_id,ScanEnv * env)2952 make_absent_tail(Node** node1, Node** node2, int pre_save_right_id,
2953                  ScanEnv* env)
2954 {
2955   int r;
2956   int id;
2957   Node* save;
2958   Node* x;
2959   Node* ns[2];
2960 
2961   *node1 = *node2 = NULL_NODE;
2962   save = ns[0] = ns[1] = NULL_NODE;
2963 
2964   r = node_new_save_gimmick(&save, SAVE_RIGHT_RANGE, env);
2965   if (r != 0) goto err;
2966 
2967   id = GIMMICK_(save)->id;
2968   r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2969                                   id, env);
2970   if (r != 0) goto err;
2971 
2972   r = node_new_fail(&ns[1], env);
2973   if (r != 0) goto err;
2974 
2975   x = make_list(2, ns);
2976   if (IS_NULL(x)) goto err0;
2977 
2978   ns[0] = NULL_NODE; ns[1] = x;
2979 
2980   r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2981                                   pre_save_right_id, env);
2982   if (r != 0) goto err;
2983 
2984   x = make_alt(2, ns);
2985   if (IS_NULL(x)) goto err0;
2986 
2987   *node1 = save;
2988   *node2 = x;
2989   return ONIG_NORMAL;
2990 
2991  err0:
2992   r = ONIGERR_MEMORY;
2993  err:
2994   onig_node_free(save);
2995   onig_node_free(ns[0]);
2996   onig_node_free(ns[1]);
2997   return r;
2998 }
2999 
3000 static int
make_range_clear(Node ** node,ScanEnv * env)3001 make_range_clear(Node** node, ScanEnv* env)
3002 {
3003   int r;
3004   int id;
3005   Node* save;
3006   Node* x;
3007   Node* ns[2];
3008 
3009   *node = NULL_NODE;
3010   save = ns[0] = ns[1] = NULL_NODE;
3011 
3012   r = node_new_save_gimmick(&save, SAVE_RIGHT_RANGE, env);
3013   if (r != 0) goto err;
3014 
3015   id = GIMMICK_(save)->id;
3016   r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
3017                                   id, env);
3018   if (r != 0) goto err;
3019 
3020   r = node_new_fail(&ns[1], env);
3021   if (r != 0) goto err;
3022 
3023   x = make_list(2, ns);
3024   if (IS_NULL(x)) goto err0;
3025 
3026   ns[0] = NULL_NODE; ns[1] = x;
3027 
3028 #define ID_NOT_USED_DONT_CARE_ME   0
3029 
3030   r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT,
3031                                   ID_NOT_USED_DONT_CARE_ME, env);
3032   if (r != 0) goto err;
3033   NODE_STATUS_ADD(ns[0], ABSENT_WITH_SIDE_EFFECTS);
3034 
3035   x = make_alt(2, ns);
3036   if (IS_NULL(x)) goto err0;
3037 
3038   NODE_STATUS_ADD(x, SUPER);
3039 
3040   ns[0] = save;
3041   ns[1] = x;
3042   save = NULL_NODE;
3043   x = make_list(2, ns);
3044   if (IS_NULL(x)) goto err0;
3045 
3046   *node = x;
3047   return ONIG_NORMAL;
3048 
3049  err0:
3050   r = ONIGERR_MEMORY;
3051  err:
3052   onig_node_free(save);
3053   onig_node_free(ns[0]);
3054   onig_node_free(ns[1]);
3055   return r;
3056 }
3057 
3058 static int
is_simple_one_char_repeat(Node * node,Node ** rquant,Node ** rbody,int * is_possessive,ScanEnv * env)3059 is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody,
3060                           int* is_possessive, ScanEnv* env)
3061 {
3062   Node* quant;
3063   Node* body;
3064 
3065   *rquant = *rbody = 0;
3066   *is_possessive = 0;
3067 
3068   if (NODE_TYPE(node) == NODE_QUANT) {
3069     quant = node;
3070   }
3071   else {
3072     if (NODE_TYPE(node) == NODE_BAG) {
3073       BagNode* en = BAG_(node);
3074       if (en->type == BAG_STOP_BACKTRACK) {
3075         *is_possessive = 1;
3076         quant = NODE_BAG_BODY(en);
3077         if (NODE_TYPE(quant) != NODE_QUANT)
3078           return 0;
3079       }
3080       else
3081         return 0;
3082     }
3083     else
3084       return 0;
3085   }
3086 
3087   if (QUANT_(quant)->greedy == 0)
3088     return 0;
3089 
3090   body = NODE_BODY(quant);
3091   switch (NODE_TYPE(body)) {
3092   case NODE_STRING:
3093     {
3094       int len;
3095       StrNode* sn = STR_(body);
3096       UChar *s = sn->s;
3097 
3098       len = 0;
3099       while (s < sn->end) {
3100         s += enclen(env->enc, s);
3101         len++;
3102       }
3103       if (len != 1)
3104         return 0;
3105     }
3106 
3107   case NODE_CCLASS:
3108     break;
3109 
3110   default:
3111     return 0;
3112     break;
3113   }
3114 
3115   if (node != quant) {
3116     NODE_BODY(node) = 0;
3117     onig_node_free(node);
3118   }
3119   NODE_BODY(quant) = NULL_NODE;
3120   *rquant = quant;
3121   *rbody  = body;
3122   return 1;
3123 }
3124 
3125 static int
make_absent_tree_for_simple_one_char_repeat(Node ** node,Node * absent,Node * quant,Node * body,int possessive,ScanEnv * env)3126 make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* quant,
3127                                             Node* body, int possessive, ScanEnv* env)
3128 {
3129   int r;
3130   int i;
3131   int id1;
3132   int lower, upper;
3133   Node* x;
3134   Node* ns[4];
3135 
3136   *node = NULL_NODE;
3137   r = ONIGERR_MEMORY;
3138   ns[0] = ns[1] = NULL_NODE;
3139   ns[2] = body, ns[3] = absent;
3140 
3141   lower = QUANT_(quant)->lower;
3142   upper = QUANT_(quant)->upper;
3143 
3144   r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env);
3145   if (r != 0) goto err;
3146 
3147   id1 = GIMMICK_(ns[0])->id;
3148 
3149   r = make_absent_engine(&ns[1], id1, absent, body, lower, upper, possessive,
3150                          FALSE, env);
3151   if (r != 0) goto err;
3152 
3153   ns[2] = ns[3] = NULL_NODE;
3154 
3155   r = node_new_update_var_gimmick(&ns[2], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
3156                                   id1, env);
3157   if (r != 0) goto err;
3158 
3159   x = make_list(3, ns);
3160   if (IS_NULL(x)) goto err0;
3161 
3162   *node = x;
3163   return ONIG_NORMAL;
3164 
3165  err0:
3166   r = ONIGERR_MEMORY;
3167  err:
3168   for (i = 0; i < 4; i++) onig_node_free(ns[i]);
3169   return r;
3170 }
3171 
3172 static int
make_absent_tree(Node ** node,Node * absent,Node * expr,int is_range_cutter,ScanEnv * env)3173 make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter,
3174                  ScanEnv* env)
3175 {
3176   int r;
3177   int i;
3178   int id1, id2;
3179   int possessive;
3180   Node* x;
3181   Node* ns[7];
3182 
3183   r = ONIGERR_MEMORY;
3184   for (i = 0; i < 7; i++) ns[i] = NULL_NODE;
3185   ns[4] = expr; ns[5] = absent;
3186 
3187   if (is_range_cutter == 0) {
3188     Node* quant;
3189     Node* body;
3190 
3191     if (expr == NULL_NODE) {
3192       /* default expr \O* */
3193       quant = node_new_quantifier(0, INFINITE_REPEAT, FALSE);
3194       if (IS_NULL(quant)) goto err0;
3195 
3196       r = node_new_true_anychar(&body);
3197       if (r != 0) {
3198         onig_node_free(quant);
3199         goto err;
3200       }
3201       possessive = 0;
3202       goto simple;
3203     }
3204     else {
3205       if (is_simple_one_char_repeat(expr, &quant, &body, &possessive, env)) {
3206       simple:
3207         r = make_absent_tree_for_simple_one_char_repeat(node, absent, quant,
3208                                                         body, possessive, env);
3209         onig_node_free(quant);
3210         if (r != 0) {
3211           ns[4] = NULL_NODE;
3212           onig_node_free(body);
3213           goto err;
3214         }
3215 
3216         return ONIG_NORMAL;
3217       }
3218     }
3219   }
3220 
3221   r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env);
3222   if (r != 0) goto err;
3223 
3224   id1 = GIMMICK_(ns[0])->id;
3225 
3226   r = node_new_save_gimmick(&ns[1], SAVE_S, env);
3227   if (r != 0) goto err;
3228 
3229   id2 = GIMMICK_(ns[1])->id;
3230 
3231   r = node_new_true_anychar(&ns[3]);
3232   if (r != 0) goto err;
3233 
3234   possessive = 1;
3235   r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, INFINITE_REPEAT,
3236                          possessive, is_range_cutter, env);
3237   if (r != 0) goto err;
3238 
3239   ns[3] = NULL_NODE;
3240   ns[5] = NULL_NODE;
3241 
3242   r = node_new_update_var_gimmick(&ns[3], UPDATE_VAR_S_FROM_STACK, id2, env);
3243   if (r != 0) goto err;
3244 
3245   if (is_range_cutter != 0) {
3246     x = make_list(4, ns);
3247     if (IS_NULL(x)) goto err0;
3248   }
3249   else {
3250     r = make_absent_tail(&ns[5], &ns[6], id1, env);
3251     if (r != 0) goto err;
3252 
3253     x = make_list(7, ns);
3254     if (IS_NULL(x)) goto err0;
3255   }
3256 
3257   *node = x;
3258   return ONIG_NORMAL;
3259 
3260  err0:
3261   r = ONIGERR_MEMORY;
3262  err:
3263   for (i = 0; i < 7; i++) onig_node_free(ns[i]);
3264   return r;
3265 }
3266 
3267 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)3268 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
3269 {
3270   int addlen = (int )(end - s);
3271 
3272   if (addlen > 0) {
3273     int len  = (int )(STR_(node)->end - STR_(node)->s);
3274 
3275     if (STR_(node)->capacity > 0 || (len + addlen > NODE_STRING_BUF_SIZE - 1)) {
3276       UChar* p;
3277       int capa = len + addlen + NODE_STRING_MARGIN;
3278 
3279       if (capa <= STR_(node)->capacity) {
3280         onig_strcpy(STR_(node)->s + len, s, end);
3281       }
3282       else {
3283         if (STR_(node)->s == STR_(node)->buf)
3284           p = strcat_capa_from_static(STR_(node)->s, STR_(node)->end,
3285                                       s, end, capa);
3286         else
3287           p = strcat_capa(STR_(node)->s, STR_(node)->end, s, end, capa);
3288 
3289         CHECK_NULL_RETURN_MEMERR(p);
3290         STR_(node)->s        = p;
3291         STR_(node)->capacity = capa;
3292       }
3293     }
3294     else {
3295       onig_strcpy(STR_(node)->s + len, s, end);
3296     }
3297     STR_(node)->end = STR_(node)->s + len + addlen;
3298   }
3299 
3300   return 0;
3301 }
3302 
3303 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end,int need_free)3304 onig_node_str_set(Node* node, const UChar* s, const UChar* end, int need_free)
3305 {
3306   onig_node_str_clear(node, need_free);
3307   return onig_node_str_cat(node, s, end);
3308 }
3309 
3310 static int
node_str_cat_char(Node * node,UChar c)3311 node_str_cat_char(Node* node, UChar c)
3312 {
3313   UChar s[1];
3314 
3315   s[0] = c;
3316   return onig_node_str_cat(node, s, s + 1);
3317 }
3318 
3319 extern void
onig_node_str_clear(Node * node,int need_free)3320 onig_node_str_clear(Node* node, int need_free)
3321 {
3322   if (need_free != 0 &&
3323       STR_(node)->capacity != 0 &&
3324       IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != STR_(node)->buf) {
3325     xfree(STR_(node)->s);
3326   }
3327 
3328   STR_(node)->flag     = 0;
3329   STR_(node)->s        = STR_(node)->buf;
3330   STR_(node)->end      = STR_(node)->buf;
3331   STR_(node)->capacity = 0;
3332 }
3333 
3334 static int
node_set_str(Node * node,const UChar * s,const UChar * end)3335 node_set_str(Node* node, const UChar* s, const UChar* end)
3336 {
3337   int r;
3338 
3339   NODE_SET_TYPE(node, NODE_STRING);
3340   STR_(node)->flag     = 0;
3341   STR_(node)->s        = STR_(node)->buf;
3342   STR_(node)->end      = STR_(node)->buf;
3343   STR_(node)->capacity = 0;
3344 
3345   r = onig_node_str_cat(node, s, end);
3346   return r;
3347 }
3348 
3349 static Node*
node_new_str(const UChar * s,const UChar * end)3350 node_new_str(const UChar* s, const UChar* end)
3351 {
3352   int r;
3353   Node* node = node_new();
3354   CHECK_NULL_RETURN(node);
3355 
3356   r = node_set_str(node, s, end);
3357   if (r != 0) {
3358     onig_node_free(node);
3359     return NULL;
3360   }
3361 
3362   return node;
3363 }
3364 
3365 static int
node_reset_str(Node * node,const UChar * s,const UChar * end)3366 node_reset_str(Node* node, const UChar* s, const UChar* end)
3367 {
3368   node_free_body(node);
3369   return node_set_str(node, s, end);
3370 }
3371 
3372 extern int
onig_node_reset_empty(Node * node)3373 onig_node_reset_empty(Node* node)
3374 {
3375   return node_reset_str(node, NULL, NULL);
3376 }
3377 
3378 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)3379 onig_node_new_str(const UChar* s, const UChar* end)
3380 {
3381   return node_new_str(s, end);
3382 }
3383 
3384 static Node*
node_new_str_with_options(const UChar * s,const UChar * end,OnigOptionType options)3385 node_new_str_with_options(const UChar* s, const UChar* end,
3386                           OnigOptionType options)
3387 {
3388   Node* node;
3389   node = node_new_str(s, end);
3390 
3391   if (OPTON_IGNORECASE(options))
3392     NODE_STATUS_ADD(node, IGNORECASE);
3393 
3394   return node;
3395 }
3396 
3397 static Node*
node_new_str_crude(UChar * s,UChar * end,OnigOptionType options)3398 node_new_str_crude(UChar* s, UChar* end, OnigOptionType options)
3399 {
3400   Node* node = node_new_str_with_options(s, end, options);
3401   CHECK_NULL_RETURN(node);
3402   NODE_STRING_SET_CRUDE(node);
3403   return node;
3404 }
3405 
3406 static Node*
node_new_empty(void)3407 node_new_empty(void)
3408 {
3409   return node_new_str(NULL, NULL);
3410 }
3411 
3412 static Node*
node_new_str_crude_char(UChar c,OnigOptionType options)3413 node_new_str_crude_char(UChar c, OnigOptionType options)
3414 {
3415   int i;
3416   UChar p[1];
3417   Node* node;
3418 
3419   p[0] = c;
3420   node = node_new_str_crude(p, p + 1, options);
3421 
3422   /* clear buf tail */
3423   for (i = 1; i < NODE_STRING_BUF_SIZE; i++)
3424     STR_(node)->buf[i] = '\0';
3425 
3426   return node;
3427 }
3428 
3429 static Node*
str_node_split_last_char(Node * node,OnigEncoding enc)3430 str_node_split_last_char(Node* node, OnigEncoding enc)
3431 {
3432   const UChar *p;
3433   Node* rn;
3434   StrNode* sn;
3435 
3436   sn = STR_(node);
3437   rn = NULL_NODE;
3438   if (sn->end > sn->s) {
3439     p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
3440     if (p && p > sn->s) { /* can be split. */
3441       rn = node_new_str(p, sn->end);
3442       CHECK_NULL_RETURN(rn);
3443 
3444       sn->end = (UChar* )p;
3445       STR_(rn)->flag = sn->flag;
3446       NODE_STATUS(rn) = NODE_STATUS(node);
3447     }
3448   }
3449 
3450   return rn;
3451 }
3452 
3453 static int
str_node_can_be_split(Node * node,OnigEncoding enc)3454 str_node_can_be_split(Node* node, OnigEncoding enc)
3455 {
3456   StrNode* sn = STR_(node);
3457   if (sn->end > sn->s) {
3458     return ((enclen(enc, sn->s) < sn->end - sn->s)  ?  1 : 0);
3459   }
3460   return 0;
3461 }
3462 
3463 static int
scan_number(UChar ** src,const UChar * end,OnigEncoding enc)3464 scan_number(UChar** src, const UChar* end, OnigEncoding enc)
3465 {
3466   int num, val;
3467   OnigCodePoint c;
3468   UChar* p = *src;
3469   PFETCH_READY;
3470 
3471   num = 0;
3472   while (! PEND) {
3473     PFETCH(c);
3474     if (IS_CODE_DIGIT_ASCII(enc, c)) {
3475       val = (int )DIGITVAL(c);
3476       if ((ONIG_INT_MAX - val) / 10 < num)
3477         return -1;  /* overflow */
3478 
3479       num = num * 10 + val;
3480     }
3481     else {
3482       PUNFETCH;
3483       break;
3484     }
3485   }
3486   *src = p;
3487   return num;
3488 }
3489 
3490 static int
scan_hexadecimal_number(UChar ** src,UChar * end,int minlen,int maxlen,OnigEncoding enc,OnigCodePoint * rcode)3491 scan_hexadecimal_number(UChar** src, UChar* end, int minlen, int maxlen,
3492                         OnigEncoding enc, OnigCodePoint* rcode)
3493 {
3494   OnigCodePoint code;
3495   OnigCodePoint c;
3496   unsigned int val;
3497   int n;
3498   UChar* p = *src;
3499   PFETCH_READY;
3500 
3501   code = 0;
3502   n = 0;
3503   while (! PEND && n < maxlen) {
3504     PFETCH(c);
3505     if (IS_CODE_XDIGIT_ASCII(enc, c)) {
3506       n++;
3507       val = (unsigned int )XDIGITVAL(enc, c);
3508       if ((UINT_MAX - val) / 16UL < code)
3509         return ONIGERR_TOO_BIG_NUMBER; /* overflow */
3510 
3511       code = (code << 4) + val;
3512     }
3513     else {
3514       PUNFETCH;
3515       break;
3516     }
3517   }
3518 
3519   if (n < minlen)
3520     return ONIGERR_INVALID_CODE_POINT_VALUE;
3521 
3522   *rcode = code;
3523   *src = p;
3524   return ONIG_NORMAL;
3525 }
3526 
3527 static int
scan_octal_number(UChar ** src,UChar * end,int minlen,int maxlen,OnigEncoding enc,OnigCodePoint * rcode)3528 scan_octal_number(UChar** src, UChar* end, int minlen, int maxlen,
3529                   OnigEncoding enc, OnigCodePoint* rcode)
3530 {
3531   OnigCodePoint code;
3532   OnigCodePoint c;
3533   unsigned int val;
3534   int n;
3535   UChar* p = *src;
3536   PFETCH_READY;
3537 
3538   code = 0;
3539   n = 0;
3540   while (! PEND && n < maxlen) {
3541     PFETCH(c);
3542     if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8') {
3543       n++;
3544       val = (unsigned int )ODIGITVAL(c);
3545       if ((UINT_MAX - val) / 8UL < code)
3546         return ONIGERR_TOO_BIG_NUMBER; /* overflow */
3547 
3548       code = (code << 3) + val;
3549     }
3550     else {
3551       PUNFETCH;
3552       break;
3553     }
3554   }
3555 
3556   if (n < minlen)
3557     return ONIGERR_INVALID_CODE_POINT_VALUE;
3558 
3559   *rcode = code;
3560   *src = p;
3561   return ONIG_NORMAL;
3562 }
3563 
3564 static int
scan_number_of_base(UChar ** src,UChar * end,int minlen,OnigEncoding enc,OnigCodePoint * rcode,int base)3565 scan_number_of_base(UChar** src, UChar* end, int minlen,
3566                     OnigEncoding enc, OnigCodePoint* rcode, int base)
3567 {
3568   int r;
3569 
3570   if (base == 16)
3571     r = scan_hexadecimal_number(src, end, minlen, 8, enc, rcode);
3572   else if (base == 8)
3573     r = scan_octal_number(src, end, minlen, 11, enc, rcode);
3574   else
3575     r = ONIGERR_INVALID_CODE_POINT_VALUE;
3576 
3577   return r;
3578 }
3579 
/* True for the characters that separate values in a code point sequence:
   a space or a newline. */
#define IS_CODE_POINT_DIVIDE(c) \
  ((c) == ' ' || (c) == '\n')
3581 
/* Parse state used by check_code_point_sequence_cc(). */
enum CPS_STATE {
  CPS_EMPTY = 0,  /* set after the number that closes a range          */
  CPS_START,      /* (1) set after a number that is not a range end     */
  CPS_RANGE       /* (2) set after '-', a range end is still expected   */
};
3587 
3588 static int
check_code_point_sequence_cc(UChar * p,UChar * end,int base,OnigEncoding enc,int state)3589 check_code_point_sequence_cc(UChar* p, UChar* end, int base,
3590                              OnigEncoding enc, int state)
3591 {
3592   int r;
3593   int n;
3594   int end_digit;
3595   OnigCodePoint code;
3596   OnigCodePoint c;
3597   PFETCH_READY;
3598 
3599   end_digit = FALSE;
3600   n = 0;
3601   while (! PEND) {
3602   start:
3603     PFETCH(c);
3604     if (c == '}') {
3605     end_char:
3606       if (state == CPS_RANGE) return ONIGERR_INVALID_CODE_POINT_VALUE;
3607       return n;
3608     }
3609 
3610     if (IS_CODE_POINT_DIVIDE(c)) {
3611       while (! PEND) {
3612         PFETCH(c);
3613         if (! IS_CODE_POINT_DIVIDE(c)) break;
3614       }
3615       if (IS_CODE_POINT_DIVIDE(c))
3616         return ONIGERR_INVALID_CODE_POINT_VALUE;
3617     }
3618     else if (c == '-') {
3619     range:
3620       if (state != CPS_START) return ONIGERR_INVALID_CODE_POINT_VALUE;
3621       if (PEND) return ONIGERR_INVALID_CODE_POINT_VALUE;
3622       end_digit = FALSE;
3623       state = CPS_RANGE;
3624       goto start;
3625     }
3626     else if (end_digit == TRUE) {
3627       if (base == 16) {
3628         if (IS_CODE_XDIGIT_ASCII(enc, c))
3629           return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3630       }
3631       else if (base == 8) {
3632         if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8')
3633           return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3634       }
3635 
3636       return ONIGERR_INVALID_CODE_POINT_VALUE;
3637     }
3638 
3639     if (c == '}') goto end_char;
3640     if (c == '-') goto range;
3641 
3642     PUNFETCH;
3643     r = scan_number_of_base(&p, end, 1, enc, &code, base);
3644     if (r != 0) return r;
3645     n++;
3646     end_digit = TRUE;
3647     state = (state == CPS_RANGE) ? CPS_EMPTY : CPS_START;
3648   }
3649 
3650   return ONIGERR_INVALID_CODE_POINT_VALUE;
3651 }
3652 
3653 static int
check_code_point_sequence(UChar * p,UChar * end,int base,OnigEncoding enc)3654 check_code_point_sequence(UChar* p, UChar* end, int base, OnigEncoding enc)
3655 {
3656   int r;
3657   int n;
3658   int end_digit;
3659   OnigCodePoint code;
3660   OnigCodePoint c;
3661   PFETCH_READY;
3662 
3663   end_digit = FALSE;
3664   n = 0;
3665   while (! PEND) {
3666     PFETCH(c);
3667     if (c == '}') {
3668     end_char:
3669       return n;
3670     }
3671 
3672     if (IS_CODE_POINT_DIVIDE(c)) {
3673       while (! PEND) {
3674         PFETCH(c);
3675         if (! IS_CODE_POINT_DIVIDE(c)) break;
3676       }
3677       if (IS_CODE_POINT_DIVIDE(c))
3678         return ONIGERR_INVALID_CODE_POINT_VALUE;
3679     }
3680     else if (end_digit == TRUE) {
3681       if (base == 16) {
3682         if (IS_CODE_XDIGIT_ASCII(enc, c))
3683           return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3684       }
3685       else if (base == 8) {
3686         if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8')
3687           return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3688       }
3689 
3690       return ONIGERR_INVALID_CODE_POINT_VALUE;
3691     }
3692 
3693     if (c == '}') goto end_char;
3694 
3695     PUNFETCH;
3696     r = scan_number_of_base(&p, end, 1, enc, &code, base);
3697     if (r != 0) return r;
3698     n++;
3699     end_digit = TRUE;
3700   }
3701 
3702   return ONIGERR_INVALID_CODE_POINT_VALUE;
3703 }
3704 
3705 static int
get_next_code_point(UChar ** src,UChar * end,int base,OnigEncoding enc,int in_cc,OnigCodePoint * rcode)3706 get_next_code_point(UChar** src, UChar* end, int base, OnigEncoding enc, int in_cc, OnigCodePoint* rcode)
3707 {
3708   int r;
3709   OnigCodePoint c;
3710   UChar* p = *src;
3711   PFETCH_READY;
3712 
3713   while (! PEND) {
3714     PFETCH(c);
3715     if (! IS_CODE_POINT_DIVIDE(c)) {
3716       if (c == '}') {
3717         *src = p;
3718         return 1; /* end of sequence */
3719       }
3720       else if (c == '-' && in_cc == TRUE) {
3721         *src = p;
3722         return 2; /* range */
3723       }
3724       PUNFETCH;
3725       break;
3726     }
3727     else {
3728       if (PEND)
3729         return ONIGERR_INVALID_CODE_POINT_VALUE;
3730     }
3731   }
3732 
3733   r = scan_number_of_base(&p, end, 1, enc, rcode, base);
3734   if (r != 0) return r;
3735 
3736   *src = p;
3737   return ONIG_NORMAL;
3738 }
3739 
3740 
/* Store one code point (SIZE_CODE_POINT bytes, taken from the lvalue
   `code`) at byte offset `pos` of `bbuf` via BB_WRITE. */
#define BB_WRITE_CODE_POINT(bbuf,pos,code)  BB_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
3743 
3744 /* data format:
3745      [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
3746      (all data size is OnigCodePoint)
3747  */
3748 static int
new_code_range(BBuf ** pbuf)3749 new_code_range(BBuf** pbuf)
3750 {
3751 #define INIT_MULTI_BYTE_RANGE_SIZE  (SIZE_CODE_POINT * 5)
3752   int r;
3753   OnigCodePoint n;
3754   BBuf* bbuf;
3755 
3756   bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
3757   CHECK_NULL_RETURN_MEMERR(bbuf);
3758   r = BB_INIT(bbuf, INIT_MULTI_BYTE_RANGE_SIZE);
3759   if (r != 0) {
3760     xfree(bbuf);
3761     *pbuf = 0;
3762     return r;
3763   }
3764 
3765   n = 0;
3766   BB_WRITE_CODE_POINT(bbuf, 0, n);
3767   return 0;
3768 }
3769 
3770 static int
add_code_range_to_buf(BBuf ** pbuf,OnigCodePoint from,OnigCodePoint to)3771 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
3772 {
3773   int r, inc_n, pos;
3774   int low, high, bound, x;
3775   OnigCodePoint n, *data;
3776   BBuf* bbuf;
3777 
3778   if (from > to) {
3779     n = from; from = to; to = n;
3780   }
3781 
3782   if (IS_NULL(*pbuf)) {
3783     r = new_code_range(pbuf);
3784     if (r != 0) return r;
3785     bbuf = *pbuf;
3786     n = 0;
3787   }
3788   else {
3789     bbuf = *pbuf;
3790     GET_CODE_POINT(n, bbuf->p);
3791   }
3792   data = (OnigCodePoint* )(bbuf->p);
3793   data++;
3794 
3795   for (low = 0, bound = n; low < bound; ) {
3796     x = (low + bound) >> 1;
3797     if (from > data[x*2 + 1])
3798       low = x + 1;
3799     else
3800       bound = x;
3801   }
3802 
3803   high = (to == ~((OnigCodePoint )0)) ? n : low;
3804   for (bound = n; high < bound; ) {
3805     x = (high + bound) >> 1;
3806     if (to + 1 >= data[x*2])
3807       high = x + 1;
3808     else
3809       bound = x;
3810   }
3811 
3812   inc_n = low + 1 - high;
3813   if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
3814     return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
3815 
3816   if (inc_n != 1) {
3817     if (from > data[low*2])
3818       from = data[low*2];
3819     if (to < data[(high - 1)*2 + 1])
3820       to = data[(high - 1)*2 + 1];
3821   }
3822 
3823   if (inc_n != 0 && (OnigCodePoint )high < n) {
3824     int from_pos = SIZE_CODE_POINT * (1 + high * 2);
3825     int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);
3826     int size = (n - high) * 2 * SIZE_CODE_POINT;
3827 
3828     if (inc_n > 0) {
3829       BB_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
3830     }
3831     else {
3832       BB_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
3833     }
3834   }
3835 
3836   pos = SIZE_CODE_POINT * (1 + low * 2);
3837   BB_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
3838   BB_WRITE_CODE_POINT(bbuf, pos, from);
3839   BB_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
3840   n += inc_n;
3841   BB_WRITE_CODE_POINT(bbuf, 0, n);
3842 
3843   return 0;
3844 }
3845 
3846 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)3847 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
3848 {
3849   if (from > to) {
3850     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
3851       return 0;
3852     else
3853       return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
3854   }
3855 
3856   return add_code_range_to_buf(pbuf, from, to);
3857 }
3858 
3859 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)3860 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
3861 {
3862   int r, i, n;
3863   OnigCodePoint pre, from, *data, to = 0;
3864 
3865   *pbuf = (BBuf* )NULL;
3866   if (IS_NULL(bbuf)) {
3867   set_all:
3868     return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
3869   }
3870 
3871   data = (OnigCodePoint* )(bbuf->p);
3872   GET_CODE_POINT(n, data);
3873   data++;
3874   if (n <= 0) goto set_all;
3875 
3876   r = 0;
3877   pre = MBCODE_START_POS(enc);
3878   for (i = 0; i < n; i++) {
3879     from = data[i*2];
3880     to   = data[i*2+1];
3881     if (pre <= from - 1) {
3882       r = add_code_range_to_buf(pbuf, pre, from - 1);
3883       if (r != 0) {
3884         bbuf_free(*pbuf);
3885         return r;
3886       }
3887     }
3888     if (to == ~((OnigCodePoint )0)) break;
3889     pre = to + 1;
3890   }
3891   if (to < ~((OnigCodePoint )0)) {
3892     r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
3893     if (r != 0) bbuf_free(*pbuf);
3894   }
3895   return r;
3896 }
3897 
/* Exchange two (code range buffer, NOT flag) pairs in place.  All four
   arguments must be lvalues. */
#define SWAP_BB_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *swap_buf_; \
  int  swap_not_; \
  swap_buf_ = bbuf1; bbuf1 = bbuf2; bbuf2 = swap_buf_; \
  swap_not_ = not1;  not1  = not2;  not2  = swap_not_; \
} while (0)
3904 
3905 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)3906 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
3907                   BBuf* bbuf2, int not2, BBuf** pbuf)
3908 {
3909   int r;
3910   OnigCodePoint i, n1, *data1;
3911   OnigCodePoint from, to;
3912 
3913   *pbuf = (BBuf* )NULL;
3914   if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
3915     if (not1 != 0 || not2 != 0)
3916       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
3917     return 0;
3918   }
3919 
3920   r = 0;
3921   if (IS_NULL(bbuf2))
3922     SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);
3923 
3924   if (IS_NULL(bbuf1)) {
3925     if (not1 != 0) {
3926       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
3927     }
3928     else {
3929       if (not2 == 0) {
3930         return bbuf_clone(pbuf, bbuf2);
3931       }
3932       else {
3933         return not_code_range_buf(enc, bbuf2, pbuf);
3934       }
3935     }
3936   }
3937 
3938   if (not1 != 0)
3939     SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);
3940 
3941   data1 = (OnigCodePoint* )(bbuf1->p);
3942   GET_CODE_POINT(n1, data1);
3943   data1++;
3944 
3945   if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
3946     r = bbuf_clone(pbuf, bbuf2);
3947   }
3948   else if (not1 == 0) { /* 1 OR (not 2) */
3949     r = not_code_range_buf(enc, bbuf2, pbuf);
3950   }
3951   if (r != 0) return r;
3952 
3953   for (i = 0; i < n1; i++) {
3954     from = data1[i*2];
3955     to   = data1[i*2+1];
3956     r = add_code_range_to_buf(pbuf, from, to);
3957     if (r != 0) return r;
3958   }
3959   return 0;
3960 }
3961 
3962 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)3963 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
3964                 OnigCodePoint* data, int n)
3965 {
3966   int i, r;
3967   OnigCodePoint from2, to2;
3968 
3969   for (i = 0; i < n; i++) {
3970     from2 = data[i*2];
3971     to2   = data[i*2+1];
3972     if (from2 < from1) {
3973       if (to2 < from1) continue;
3974       else {
3975         from1 = to2 + 1;
3976       }
3977     }
3978     else if (from2 <= to1) {
3979       if (to2 < to1) {
3980         if (from1 <= from2 - 1) {
3981           r = add_code_range_to_buf(pbuf, from1, from2-1);
3982           if (r != 0) return r;
3983         }
3984         from1 = to2 + 1;
3985       }
3986       else {
3987         to1 = from2 - 1;
3988       }
3989     }
3990     else {
3991       from1 = from2;
3992     }
3993     if (from1 > to1) break;
3994   }
3995   if (from1 <= to1) {
3996     r = add_code_range_to_buf(pbuf, from1, to1);
3997     if (r != 0) return r;
3998   }
3999   return 0;
4000 }
4001 
4002 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)4003 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
4004 {
4005   int r;
4006   OnigCodePoint i, j, n1, n2, *data1, *data2;
4007   OnigCodePoint from, to, from1, to1, from2, to2;
4008 
4009   *pbuf = (BBuf* )NULL;
4010   if (IS_NULL(bbuf1)) {
4011     if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
4012       return bbuf_clone(pbuf, bbuf2);
4013     return 0;
4014   }
4015   else if (IS_NULL(bbuf2)) {
4016     if (not2 != 0)
4017       return bbuf_clone(pbuf, bbuf1);
4018     return 0;
4019   }
4020 
4021   if (not1 != 0)
4022     SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);
4023 
4024   data1 = (OnigCodePoint* )(bbuf1->p);
4025   data2 = (OnigCodePoint* )(bbuf2->p);
4026   GET_CODE_POINT(n1, data1);
4027   GET_CODE_POINT(n2, data2);
4028   data1++;
4029   data2++;
4030 
4031   if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
4032     for (i = 0; i < n1; i++) {
4033       from1 = data1[i*2];
4034       to1   = data1[i*2+1];
4035       for (j = 0; j < n2; j++) {
4036         from2 = data2[j*2];
4037         to2   = data2[j*2+1];
4038         if (from2 > to1) break;
4039         if (to2 < from1) continue;
4040         from = MAX(from1, from2);
4041         to   = MIN(to1, to2);
4042         r = add_code_range_to_buf(pbuf, from, to);
4043         if (r != 0) return r;
4044       }
4045     }
4046   }
4047   else if (not1 == 0) { /* 1 AND (not 2) */
4048     for (i = 0; i < n1; i++) {
4049       from1 = data1[i*2];
4050       to1   = data1[i*2+1];
4051       r = and_code_range1(pbuf, from1, to1, data2, n2);
4052       if (r != 0) return r;
4053     }
4054   }
4055 
4056   return 0;
4057 }
4058 
4059 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)4060 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
4061 {
4062   int r, not1, not2;
4063   BBuf *buf1, *buf2, *pbuf;
4064   BitSetRef bsr1, bsr2;
4065   BitSet bs1, bs2;
4066 
4067   not1 = IS_NCCLASS_NOT(dest);
4068   bsr1 = dest->bs;
4069   buf1 = dest->mbuf;
4070   not2 = IS_NCCLASS_NOT(cc);
4071   bsr2 = cc->bs;
4072   buf2 = cc->mbuf;
4073 
4074   if (not1 != 0) {
4075     bitset_invert_to(bsr1, bs1);
4076     bsr1 = bs1;
4077   }
4078   if (not2 != 0) {
4079     bitset_invert_to(bsr2, bs2);
4080     bsr2 = bs2;
4081   }
4082   bitset_and(bsr1, bsr2);
4083   if (bsr1 != dest->bs) {
4084     bitset_copy(dest->bs, bsr1);
4085   }
4086   if (not1 != 0) {
4087     bitset_invert(dest->bs);
4088   }
4089 
4090   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4091     if (not1 != 0 && not2 != 0) {
4092       r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
4093     }
4094     else {
4095       r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
4096       if (r == 0 && not1 != 0) {
4097         BBuf *tbuf;
4098         r = not_code_range_buf(enc, pbuf, &tbuf);
4099         if (r != 0) {
4100           bbuf_free(pbuf);
4101           return r;
4102         }
4103         bbuf_free(pbuf);
4104         pbuf = tbuf;
4105       }
4106     }
4107     if (r != 0) return r;
4108 
4109     dest->mbuf = pbuf;
4110     bbuf_free(buf1);
4111     return r;
4112   }
4113   return 0;
4114 }
4115 
4116 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)4117 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
4118 {
4119   int r, not1, not2;
4120   BBuf *buf1, *buf2, *pbuf;
4121   BitSetRef bsr1, bsr2;
4122   BitSet bs1, bs2;
4123 
4124   not1 = IS_NCCLASS_NOT(dest);
4125   bsr1 = dest->bs;
4126   buf1 = dest->mbuf;
4127   not2 = IS_NCCLASS_NOT(cc);
4128   bsr2 = cc->bs;
4129   buf2 = cc->mbuf;
4130 
4131   if (not1 != 0) {
4132     bitset_invert_to(bsr1, bs1);
4133     bsr1 = bs1;
4134   }
4135   if (not2 != 0) {
4136     bitset_invert_to(bsr2, bs2);
4137     bsr2 = bs2;
4138   }
4139   bitset_or(bsr1, bsr2);
4140   if (bsr1 != dest->bs) {
4141     bitset_copy(dest->bs, bsr1);
4142   }
4143   if (not1 != 0) {
4144     bitset_invert(dest->bs);
4145   }
4146 
4147   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4148     if (not1 != 0 && not2 != 0) {
4149       r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
4150     }
4151     else {
4152       r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
4153       if (r == 0 && not1 != 0) {
4154         BBuf *tbuf;
4155         r = not_code_range_buf(enc, pbuf, &tbuf);
4156         if (r != 0) {
4157           bbuf_free(pbuf);
4158           return r;
4159         }
4160         bbuf_free(pbuf);
4161         pbuf = tbuf;
4162       }
4163     }
4164     if (r != 0) return r;
4165 
4166     dest->mbuf = pbuf;
4167     bbuf_free(buf1);
4168     return r;
4169   }
4170   else
4171     return 0;
4172 }
4173 
4174 static OnigCodePoint
conv_backslash_value(OnigCodePoint c,ScanEnv * env)4175 conv_backslash_value(OnigCodePoint c, ScanEnv* env)
4176 {
4177   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
4178     switch (c) {
4179     case 'n': return '\n';
4180     case 't': return '\t';
4181     case 'r': return '\r';
4182     case 'f': return '\f';
4183     case 'a': return '\007';
4184     case 'b': return '\010';
4185     case 'e': return '\033';
4186     case 'v':
4187       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
4188         return '\v';
4189       break;
4190 
4191     default:
4192       break;
4193     }
4194   }
4195   return c;
4196 }
4197 
4198 static int
is_invalid_quantifier_target(Node * node)4199 is_invalid_quantifier_target(Node* node)
4200 {
4201   switch (NODE_TYPE(node)) {
4202   case NODE_ANCHOR:
4203   case NODE_GIMMICK:
4204     return 1;
4205     break;
4206 
4207   case NODE_BAG:
4208     /* allow enclosed elements */
4209     /* return is_invalid_quantifier_target(NODE_BODY(node)); */
4210     break;
4211 
4212   case NODE_LIST:
4213     do {
4214       if (! is_invalid_quantifier_target(NODE_CAR(node))) return 0;
4215     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4216     return 0;
4217     break;
4218 
4219   case NODE_ALT:
4220     do {
4221       if (is_invalid_quantifier_target(NODE_CAR(node))) return 1;
4222     } while (IS_NOT_NULL(node = NODE_CDR(node)));
4223     break;
4224 
4225   default:
4226     break;
4227   }
4228   return 0;
4229 }
4230 
4231 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
4232 static int
quantifier_type_num(QuantNode * q)4233 quantifier_type_num(QuantNode* q)
4234 {
4235   if (q->greedy) {
4236     if (q->lower == 0) {
4237       if (q->upper == 1) return 0;
4238       else if (IS_INFINITE_REPEAT(q->upper)) return 1;
4239     }
4240     else if (q->lower == 1) {
4241       if (IS_INFINITE_REPEAT(q->upper)) return 2;
4242     }
4243   }
4244   else {
4245     if (q->lower == 0) {
4246       if (q->upper == 1) return 3;
4247       else if (IS_INFINITE_REPEAT(q->upper)) return 4;
4248     }
4249     else if (q->lower == 1) {
4250       if (IS_INFINITE_REPEAT(q->upper)) return 5;
4251     }
4252   }
4253   return -1;
4254 }
4255 
4256 
/* Actions for reducing a quantifier nested directly in another one. */
enum ReduceType {
  RQ_ASIS = 0,   /* as is */
  RQ_DEL  = 1,   /* delete parent */
  RQ_A    = 2,   /* to '*'    */
  RQ_AQ   = 3,   /* to '*?'   */
  RQ_QQ   = 4,   /* to '??'   */
  RQ_P_QQ = 5,   /* to '+)??' */
  RQ_PQ_Q = 6    /* to '+?)?' */
};
4266 
4267 static enum ReduceType ReduceTypeTable[6][6] = {
4268   {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
4269   {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
4270   {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
4271   {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
4272   {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
4273   {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
4274 };
4275 
4276 extern int
onig_reduce_nested_quantifier(Node * pnode)4277 onig_reduce_nested_quantifier(Node* pnode)
4278 {
4279   int pnum, cnum;
4280   QuantNode *p, *c;
4281   Node* cnode;
4282 
4283   cnode = NODE_BODY(pnode);
4284 
4285   p = QUANT_(pnode);
4286   c = QUANT_(cnode);
4287   pnum = quantifier_type_num(p);
4288   cnum = quantifier_type_num(c);
4289   if (pnum < 0 || cnum < 0) {
4290     if (p->lower == p->upper && c->lower == c->upper) {
4291       int n = onig_positive_int_multiply(p->lower, c->lower);
4292       if (n < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
4293 
4294       p->lower = p->upper = n;
4295       NODE_BODY(pnode) = NODE_BODY(cnode);
4296       goto remove_cnode;
4297     }
4298 
4299     return 0;
4300   }
4301 
4302   switch(ReduceTypeTable[cnum][pnum]) {
4303   case RQ_DEL:
4304     *pnode = *cnode;
4305     goto remove_cnode;
4306     break;
4307   case RQ_A:
4308     NODE_BODY(pnode) = NODE_BODY(cnode);
4309     p->lower  = 0;  p->upper = INFINITE_REPEAT;  p->greedy = 1;
4310     goto remove_cnode;
4311     break;
4312   case RQ_AQ:
4313     NODE_BODY(pnode) = NODE_BODY(cnode);
4314     p->lower  = 0;  p->upper = INFINITE_REPEAT;  p->greedy = 0;
4315     goto remove_cnode;
4316     break;
4317   case RQ_QQ:
4318     NODE_BODY(pnode) = NODE_BODY(cnode);
4319     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
4320     goto remove_cnode;
4321     break;
4322   case RQ_P_QQ:
4323     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
4324     c->lower  = 1;  c->upper = INFINITE_REPEAT;  c->greedy = 1;
4325     break;
4326   case RQ_PQ_Q:
4327     p->lower  = 0;  p->upper = 1;  p->greedy = 1;
4328     c->lower  = 1;  c->upper = INFINITE_REPEAT;  c->greedy = 0;
4329     break;
4330   case RQ_ASIS:
4331     break;
4332   }
4333 
4334   return 0;
4335 
4336  remove_cnode:
4337   NODE_BODY(cnode) = NULL_NODE;
4338   onig_node_free(cnode);
4339   return 0;
4340 }
4341 
4342 static int
node_new_general_newline(Node ** node,ScanEnv * env)4343 node_new_general_newline(Node** node, ScanEnv* env)
4344 {
4345   int r;
4346   int dlen, alen;
4347   UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
4348   Node* crnl;
4349   Node* ncc;
4350   Node* x;
4351   CClassNode* cc;
4352 
4353   dlen = ONIGENC_CODE_TO_MBC(env->enc, 0x0d, buf);
4354   if (dlen < 0) return dlen;
4355   alen = ONIGENC_CODE_TO_MBC(env->enc, NEWLINE_CODE, buf + dlen);
4356   if (alen < 0) return alen;
4357 
4358   crnl = node_new_str_crude(buf, buf + dlen + alen, ONIG_OPTION_NONE);
4359   CHECK_NULL_RETURN_MEMERR(crnl);
4360 
4361   ncc = node_new_cclass();
4362   if (IS_NULL(ncc)) goto err2;
4363 
4364   cc = CCLASS_(ncc);
4365   if (dlen == 1) {
4366     bitset_set_range(cc->bs, NEWLINE_CODE, 0x0d);
4367   }
4368   else {
4369     r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, 0x0d);
4370     if (r != 0) {
4371     err1:
4372       onig_node_free(ncc);
4373     err2:
4374       onig_node_free(crnl);
4375       return ONIGERR_MEMORY;
4376     }
4377   }
4378 
4379   if (ONIGENC_IS_UNICODE_ENCODING(env->enc)) {
4380     r = add_code_range(&(cc->mbuf), env, 0x85, 0x85);
4381     if (r != 0) goto err1;
4382     r = add_code_range(&(cc->mbuf), env, 0x2028, 0x2029);
4383     if (r != 0) goto err1;
4384   }
4385 
4386   x = node_new_bag_if_else(crnl, NULL_NODE, ncc);
4387   if (IS_NULL(x)) goto err1;
4388 
4389   *node = x;
4390   return 0;
4391 }
4392 
/*
  Token kinds stored in PToken.type.  The enumerators are numbered
  consecutively from 0 in declaration order.
*/
enum TokenSyms {
  TK_EOT = 0,               /* end of token */
  TK_CRUDE_BYTE,            /* single byte given as \xHH or an octal escape */
  TK_CHAR,                  /* initial token type inside a character class */
  TK_STRING,                /* initial token type outside a character class */
  TK_CODE_POINT,            /* code point, e.g. \x{...}, \o{...}, \uHHHH */
  TK_ANYCHAR,
  TK_CHAR_TYPE,             /* \w \W \d \D \s \S \h \H */
  TK_BACKREF,
  TK_CALL,
  TK_ANCHOR,                /* \b \B \y \Y ... */
  TK_REPEAT,                /* quantifier described by u.repeat */
  TK_INTERVAL,              /* interval quantifier {n,m} */
  TK_ANYCHAR_ANYTIME,       /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_OPEN_CC,
  TK_QUOTE_OPEN,
  TK_CHAR_PROPERTY,         /* \p{...}, \P{...} */
  TK_KEEP,                  /* \K */
  TK_GENERAL_NEWLINE,       /* \R */
  TK_NO_NEWLINE,            /* \N */
  TK_TRUE_ANYCHAR,          /* \O */
  TK_TEXT_SEGMENT,          /* \X */

  /* in cc */
  TK_CC_CLOSE,              /* ] */
  TK_CC_RANGE,              /* - */
  TK_CC_POSIX_BRACKET_OPEN, /* [: with a matching :] ahead */
  TK_CC_AND,                /* && */
  TK_CC_OPEN_CC             /* [ */
};
4426 
/*
  A token produced by fetch_token() / fetch_token_cc().
  Which member of the union `u` is meaningful depends on `type`.
*/
typedef struct {
  enum TokenSyms type;
  /* non-zero while the tokenizers are still reading further code points of a
     brace sequence; checked at the start of the next fetch call */
  int code_point_continue;
  int escaped;    /* set to 1 when the token was introduced by the escape character */
  int base_num;   /* is number: 8, 16 (used in [....]) */
  UChar* backp;   /* pattern position recorded by the tokenizers */
  union {
    UChar* s;
    UChar byte;           /* TK_CRUDE_BYTE */
    OnigCodePoint code;   /* TK_CHAR, TK_CODE_POINT */
    int   anchor;         /* TK_ANCHOR: one of the ANCR_* values */
    int   subtype;
    struct {
      int lower;
      int upper;          /* may be INFINITE_REPEAT */
      int greedy;
      int possessive;
    } repeat;             /* TK_REPEAT, TK_INTERVAL */
    struct {
      int  num;
      int  ref1;
      int* refs;
      int  by_name;
#ifdef USE_BACKREF_WITH_LEVEL
      int  exist_level;
      int  level;   /* \k<name+n> */
#endif
    } backref;
    struct {
      UChar* name;
      UChar* name_end;
      int    gnum;
      int    by_number;
    } call;
    struct {
      int ctype;          /* ONIGENC_CTYPE_* value */
      int not;            /* non-zero for the negated form (\W, \D, \P{...}, ...) */
    } prop;               /* TK_CHAR_TYPE, TK_CHAR_PROPERTY */
  } u;
} PToken;
4467 
/*
  Prepare a token for the first fetch call.
  Only code_point_continue is cleared: it is the one field the fetch functions
  read before writing.  All other fields are left untouched here.
*/
static void
ptoken_init(PToken* tok)
{
  tok->code_point_continue = 0;
}
4473 
4474 static int
fetch_interval(UChar ** src,UChar * end,PToken * tok,ScanEnv * env)4475 fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env)
4476 {
4477   int low, up, syn_allow, non_low = 0;
4478   int r = 0;
4479   OnigCodePoint c;
4480   OnigEncoding enc = env->enc;
4481   UChar* p = *src;
4482   PFETCH_READY;
4483 
4484   syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
4485 
4486   if (PEND) {
4487     if (syn_allow)
4488       return 1;  /* "....{" : OK! */
4489     else
4490       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
4491   }
4492 
4493   if (! syn_allow) {
4494     c = PPEEK;
4495     if (c == ')' || c == '(' || c == '|') {
4496       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
4497     }
4498   }
4499 
4500   low = scan_number(&p, end, env->enc);
4501   if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
4502   if (low > ONIG_MAX_REPEAT_NUM)
4503     return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
4504 
4505   if (p == *src) { /* can't read low */
4506     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
4507       /* allow {,n} as {0,n} */
4508       low = 0;
4509       non_low = 1;
4510     }
4511     else
4512       goto invalid;
4513   }
4514 
4515   if (PEND) goto invalid;
4516   PFETCH(c);
4517   if (c == ',') {
4518     UChar* prev = p;
4519     up = scan_number(&p, end, env->enc);
4520     if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
4521     if (up > ONIG_MAX_REPEAT_NUM)
4522       return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
4523 
4524     if (p == prev) {
4525       if (non_low != 0)
4526         goto invalid;
4527       up = INFINITE_REPEAT;  /* {n,} : {n,infinite} */
4528     }
4529   }
4530   else {
4531     if (non_low != 0)
4532       goto invalid;
4533 
4534     PUNFETCH;
4535     up = low;  /* {n} : exact n times */
4536     r = 2;     /* fixed */
4537   }
4538 
4539   if (PEND) goto invalid;
4540   PFETCH(c);
4541   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
4542     if (c != MC_ESC(env->syntax) || PEND) goto invalid;
4543     PFETCH(c);
4544   }
4545   if (c != '}') goto invalid;
4546 
4547   if (!IS_INFINITE_REPEAT(up) && low > up) {
4548     /* {n,m}+ supported case */
4549     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL))
4550       return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
4551 
4552     tok->u.repeat.possessive = 1;
4553     {
4554       int tmp;
4555       tmp = low; low = up; up = tmp;
4556     }
4557   }
4558   else
4559     tok->u.repeat.possessive = 0;
4560 
4561   tok->type = TK_INTERVAL;
4562   tok->u.repeat.lower = low;
4563   tok->u.repeat.upper = up;
4564   *src = p;
4565   return r; /* 0: normal {n,m}, 2: fixed {n} */
4566 
4567  invalid:
4568   if (syn_allow) {
4569     /* *src = p; */ /* !!! Don't do this line !!! */
4570     return 1;  /* OK */
4571   }
4572   else
4573     return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
4574 }
4575 
4576 /* \M-, \C-, \c, or \... */
4577 static int
fetch_escaped_value_raw(UChar ** src,UChar * end,ScanEnv * env,OnigCodePoint * val)4578 fetch_escaped_value_raw(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
4579 {
4580   int v;
4581   OnigCodePoint c;
4582   OnigEncoding enc = env->enc;
4583   UChar* p = *src;
4584 
4585   if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
4586 
4587   PFETCH_S(c);
4588   switch (c) {
4589   case 'M':
4590     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
4591       if (PEND) return ONIGERR_END_PATTERN_AT_META;
4592       PFETCH_S(c);
4593       if (c != '-') return ONIGERR_META_CODE_SYNTAX;
4594       if (PEND) return ONIGERR_END_PATTERN_AT_META;
4595       PFETCH_S(c);
4596       if (c == MC_ESC(env->syntax)) {
4597         v = fetch_escaped_value_raw(&p, end, env, &c);
4598         if (v < 0) return v;
4599       }
4600       c = ((c & 0xff) | 0x80);
4601     }
4602     else
4603       goto backslash;
4604     break;
4605 
4606   case 'C':
4607     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
4608       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
4609       PFETCH_S(c);
4610       if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
4611       goto control;
4612     }
4613     else
4614       goto backslash;
4615 
4616   case 'c':
4617     if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
4618     control:
4619       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
4620       PFETCH_S(c);
4621       if (c == '?') {
4622         c = 0177;
4623       }
4624       else {
4625         if (c == MC_ESC(env->syntax)) {
4626           v = fetch_escaped_value_raw(&p, end, env, &c);
4627           if (v < 0) return v;
4628         }
4629         c &= 0x9f;
4630       }
4631       break;
4632     }
4633     /* fall through */
4634 
4635   default:
4636     {
4637     backslash:
4638       c = conv_backslash_value(c, env);
4639     }
4640     break;
4641   }
4642 
4643   *src = p;
4644   *val = c;
4645   return 0;
4646 }
4647 
4648 static int
fetch_escaped_value(UChar ** src,UChar * end,ScanEnv * env,OnigCodePoint * val)4649 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
4650 {
4651   int r;
4652   int len;
4653 
4654   r = fetch_escaped_value_raw(src, end, env, val);
4655   if (r != 0) return r;
4656 
4657   len = ONIGENC_CODE_TO_MBCLEN(env->enc, *val);
4658   if (len < 0) return len;
4659 
4660   return 0;
4661 }
4662 
/* forward declaration; fetch_token() is defined further down in this file */
static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env);
4664 
4665 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)4666 get_name_end_code_point(OnigCodePoint start)
4667 {
4668   switch (start) {
4669   case '<':  return (OnigCodePoint )'>';  break;
4670   case '\'': return (OnigCodePoint )'\''; break;
4671   case '(':  return (OnigCodePoint )')';  break;
4672   default:
4673     break;
4674   }
4675 
4676   return (OnigCodePoint )0;
4677 }
4678 
/* How the text between name delimiters was classified by the name parsers. */
enum REF_NUM {
  IS_NOT_NUM = 0,  /* a name */
  IS_ABS_NUM,      /* starts with a digit */
  IS_REL_NUM       /* starts with '+' or '-' */
};
4684 
#ifdef USE_BACKREF_WITH_LEVEL
/*
   \k<name+n>, \k<name-n>
   \k<num+n>,  \k<num-n>
   \k<-num+n>, \k<-num-n>
   \k<+num+n>, \k<+num-n>

   Parse a group name or number that may be followed by a "+n" / "-n" level.

   start_code : opening delimiter; the closing one comes from
                get_name_end_code_point().
   src        : in: first character after the opening delimiter;
                out (success only): position after the closing delimiter.
   rname_end  : out (success only): end of the name/number part.
   rback_num  : out: signed number for numeric references, otherwise 0.
   rlevel     : out: signed level; written only when a level is present.
   num_type   : out: IS_NOT_NUM, IS_ABS_NUM or IS_REL_NUM.

   Returns 1 when a level was given, 0 when not, otherwise an ONIGERR_* code.
   Except for the direct returns (empty name, too big number) the error text
   is recorded with onig_scan_env_set_error_string().
*/
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
                      UChar** rname_end, ScanEnv* env,
                      int* rback_num, int* rlevel, enum REF_NUM* num_type)
{
  int r, sign, exist_level;
  int digit_count;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;
  PFETCH_READY;

  *rback_num = 0;
  exist_level = 0;
  *num_type = IS_NOT_NUM;
  sign = 1;
  pnum_head = *src;

  end_code = get_name_end_code_point(start_code);

  digit_count = 0;
  name_end = end;
  r = 0;
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    /* the first character decides between name, absolute and relative number */
    PFETCH(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (IS_CODE_DIGIT_ASCII(enc, c)) {
      *num_type = IS_ABS_NUM;
      digit_count++;
    }
    else if (c == '-') {
      *num_type = IS_REL_NUM;
      sign = -1;
      pnum_head = p;   /* digits start after the sign */
    }
    else if (c == '+') {
      *num_type = IS_REL_NUM;
      sign = 1;
      pnum_head = p;   /* digits start after the sign */
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  /* scan up to the closing delimiter, ')' or the sign that starts a level;
     errors are remembered in r while scanning continues */
  while (!PEND) {
    name_end = p;
    PFETCH(c);
    if (c == end_code || c == ')' || c == '+' || c == '-') {
      if (*num_type != IS_NOT_NUM && digit_count == 0)
        r = ONIGERR_INVALID_GROUP_NAME;
      break;
    }

    if (*num_type != IS_NOT_NUM) {
      if (IS_CODE_DIGIT_ASCII(enc, c)) {
        digit_count++;
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        *num_type = IS_NOT_NUM;
      }
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0 && c != end_code) {
    if (c == '+' || c == '-') {
      int level;
      int flag = (c == '-' ? -1 : 1);

      if (PEND) {
        r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
        goto end;
      }
      PFETCH(c);
      if (! IS_CODE_DIGIT_ASCII(enc, c)) goto err;
      PUNFETCH;
      level = scan_number(&p, end, enc);
      if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
      *rlevel = (level * flag);
      exist_level = 1;

      /* the level must be followed directly by the closing delimiter */
      if (!PEND) {
        PFETCH(c);
        if (c == end_code)
          goto end;
      }
    }

    /* reached when the name is not closed by end_code */
  err:
    name_end = end;
  err2:
    r = ONIGERR_INVALID_GROUP_NAME;
  }

 end:
  if (r == 0) {
    if (*num_type != IS_NOT_NUM) {
      *rback_num = scan_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) {
        /* a relative number 0 is rejected: jump back to set r, then control
           falls through to "end" again and takes the error branch below */
        if (*num_type == IS_REL_NUM)
          goto err2;
      }

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return (exist_level ? 1 : 0);
  }
  else {
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
#endif /* USE_BACKREF_WITH_LEVEL */
4821 
4822 /*
4823   ref: 0 -> define name    (don't allow number name)
4824        1 -> reference name (allow number name)
4825 */
4826 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,enum REF_NUM * num_type,int is_ref)4827 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
4828            UChar** rname_end, ScanEnv* env, int* rback_num,
4829            enum REF_NUM* num_type, int is_ref)
4830 {
4831   int r, sign;
4832   int digit_count;
4833   OnigCodePoint end_code;
4834   OnigCodePoint c = 0;
4835   OnigEncoding enc = env->enc;
4836   UChar *name_end;
4837   UChar *pnum_head;
4838   UChar *p = *src;
4839 
4840   *rback_num = 0;
4841 
4842   end_code = get_name_end_code_point(start_code);
4843 
4844   digit_count = 0;
4845   name_end = end;
4846   pnum_head = *src;
4847   r = 0;
4848   *num_type = IS_NOT_NUM;
4849   sign = 1;
4850   if (PEND) {
4851     return ONIGERR_EMPTY_GROUP_NAME;
4852   }
4853   else {
4854     PFETCH_S(c);
4855     if (c == end_code)
4856       return ONIGERR_EMPTY_GROUP_NAME;
4857 
4858     if (IS_CODE_DIGIT_ASCII(enc, c)) {
4859       if (is_ref == TRUE)
4860         *num_type = IS_ABS_NUM;
4861       else {
4862         r = ONIGERR_INVALID_GROUP_NAME;
4863       }
4864       digit_count++;
4865     }
4866     else if (c == '-') {
4867       if (is_ref == TRUE) {
4868         *num_type = IS_REL_NUM;
4869         sign = -1;
4870         pnum_head = p;
4871       }
4872       else {
4873         r = ONIGERR_INVALID_GROUP_NAME;
4874       }
4875     }
4876     else if (c == '+') {
4877       if (is_ref == TRUE) {
4878         *num_type = IS_REL_NUM;
4879         sign = 1;
4880         pnum_head = p;
4881       }
4882       else {
4883         r = ONIGERR_INVALID_GROUP_NAME;
4884       }
4885     }
4886     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
4887       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
4888     }
4889   }
4890 
4891   if (r == 0) {
4892     while (!PEND) {
4893       name_end = p;
4894       PFETCH_S(c);
4895       if (c == end_code || c == ')') {
4896         if (*num_type != IS_NOT_NUM && digit_count == 0)
4897           r = ONIGERR_INVALID_GROUP_NAME;
4898         break;
4899       }
4900 
4901       if (*num_type != IS_NOT_NUM) {
4902         if (IS_CODE_DIGIT_ASCII(enc, c)) {
4903           digit_count++;
4904         }
4905         else {
4906           if (!ONIGENC_IS_CODE_WORD(enc, c))
4907             r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
4908           else
4909             r = ONIGERR_INVALID_GROUP_NAME;
4910 
4911           *num_type = IS_NOT_NUM;
4912         }
4913       }
4914       else {
4915         if (!ONIGENC_IS_CODE_WORD(enc, c)) {
4916           r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
4917         }
4918       }
4919     }
4920 
4921     if (c != end_code) {
4922       r = ONIGERR_INVALID_GROUP_NAME;
4923       goto err;
4924     }
4925 
4926     if (*num_type != IS_NOT_NUM) {
4927       *rback_num = scan_number(&pnum_head, name_end, enc);
4928       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
4929       else if (*rback_num == 0) {
4930         if (*num_type == IS_REL_NUM) {
4931           r = ONIGERR_INVALID_GROUP_NAME;
4932           goto err;
4933         }
4934       }
4935 
4936       *rback_num *= sign;
4937     }
4938 
4939     *rname_end = name_end;
4940     *src = p;
4941     return 0;
4942   }
4943   else {
4944     while (!PEND) {
4945       name_end = p;
4946       PFETCH_S(c);
4947       if (c == end_code || c == ')')
4948         break;
4949     }
4950     if (PEND)
4951       name_end = end;
4952 
4953   err:
4954     onig_scan_env_set_error_string(env, r, *src, name_end);
4955     return r;
4956   }
4957 }
4958 
4959 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)4960 CC_ESC_WARN(ScanEnv* env, UChar *c)
4961 {
4962   if (onig_warn == onig_null_warn) return ;
4963 
4964   if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
4965       IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
4966     UChar buf[WARN_BUFSIZE];
4967     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4968                                env->pattern, env->pattern_end,
4969                                (UChar* )"character class has '%s' without escape",
4970                                c);
4971     (*onig_warn)((char* )buf);
4972   }
4973 }
4974 
4975 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)4976 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
4977 {
4978   if (onig_warn == onig_null_warn) return ;
4979 
4980   if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
4981     UChar buf[WARN_BUFSIZE];
4982     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
4983                          (env)->pattern, (env)->pattern_end,
4984                          (UChar* )"regular expression has '%s' without escape", c);
4985     (*onig_warn)((char* )buf);
4986   }
4987 }
4988 
4989 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)4990 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
4991                   UChar **next, OnigEncoding enc)
4992 {
4993   int i;
4994   OnigCodePoint x;
4995   UChar *q;
4996   UChar *p = from;
4997 
4998   while (p < to) {
4999     x = ONIGENC_MBC_TO_CODE(enc, p, to);
5000     q = p + enclen(enc, p);
5001     if (x == s[0]) {
5002       for (i = 1; i < n && q < to; i++) {
5003         x = ONIGENC_MBC_TO_CODE(enc, q, to);
5004         if (x != s[i]) break;
5005         q += enclen(enc, q);
5006       }
5007       if (i >= n) {
5008         if (IS_NOT_NULL(next))
5009           *next = q;
5010         return p;
5011       }
5012     }
5013     p = q;
5014   }
5015   return NULL_UCHARP;
5016 }
5017 
5018 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)5019 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
5020                          OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
5021 {
5022   int i, in_esc;
5023   OnigCodePoint x;
5024   UChar *q;
5025   UChar *p = from;
5026 
5027   in_esc = 0;
5028   while (p < to) {
5029     if (in_esc) {
5030       in_esc = 0;
5031       p += enclen(enc, p);
5032     }
5033     else {
5034       x = ONIGENC_MBC_TO_CODE(enc, p, to);
5035       q = p + enclen(enc, p);
5036       if (x == s[0]) {
5037         for (i = 1; i < n && q < to; i++) {
5038           x = ONIGENC_MBC_TO_CODE(enc, q, to);
5039           if (x != s[i]) break;
5040           q += enclen(enc, q);
5041         }
5042         if (i >= n) return 1;
5043         p += enclen(enc, p);
5044       }
5045       else {
5046         x = ONIGENC_MBC_TO_CODE(enc, p, to);
5047         if (x == bad) return 0;
5048         else if (x == MC_ESC(syn)) in_esc = 1;
5049         p = q;
5050       }
5051     }
5052   }
5053   return 0;
5054 }
5055 
/*
  Fetch the next token while scanning the inside of a character class.

  tok   : receives the token.  tok->code_point_continue carries state from
          one call to the next while a brace sequence with several code
          points is being read.
  src   : in/out pattern position; written back only when the function
          leaves through the "end" label.
  end   : end of the pattern.
  state : only consulted to pick the starting state passed to
          check_code_point_sequence_cc() (CPS_EMPTY when state is CS_RANGE,
          CPS_START otherwise).

  Returns tok->type, or an ONIGERR_* code.
*/
static int
fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state)
{
  int r;
  OnigCodePoint code;
  OnigCodePoint c, c2;
  OnigSyntaxType* syn = env->syntax;
  OnigEncoding enc = env->enc;
  UChar* prev;
  UChar* p = *src;
  PFETCH_READY;

  /* continue a code point sequence started by an earlier call */
  if (tok->code_point_continue != 0) {
    r = get_next_code_point(&p, end, tok->base_num, enc, TRUE, &code);
    if (r == 1) {
      /* sequence is finished: go on with ordinary tokenizing */
      tok->code_point_continue = 0;
    }
    else if (r == 2) {
      tok->type = TK_CC_RANGE;
      goto end;
    }
    else if (r == 0) {
      tok->type   = TK_CODE_POINT;
      tok->u.code = code;
      goto end;
    }
    else
      return r; /* error */
  }

  if (PEND) {
    tok->type = TK_EOT;
    return tok->type;
  }

  /* default: a plain, unescaped character */
  PFETCH(c);
  tok->type = TK_CHAR;
  tok->base_num = 0;
  tok->u.code   = c;
  tok->escaped  = 0;

  if (c == ']') {
    tok->type = TK_CC_CLOSE;
  }
  else if (c == '-') {
    tok->type = TK_CC_RANGE;
  }
  else if (c == MC_ESC(syn)) {
    /* without this syntax flag the escape character is an ordinary char */
    if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
      goto end;

    if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

    PFETCH(c);
    tok->escaped = 1;
    tok->u.code = c;
    /* a case that only does "break" leaves the token as TK_CHAR holding the
       escaped character itself */
    switch (c) {
    case 'w':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
      tok->u.prop.not   = 0;
      break;
    case 'W':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
      tok->u.prop.not   = 1;
      break;
    case 'd':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
      tok->u.prop.not   = 0;
      break;
    case 'D':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
      tok->u.prop.not   = 1;
      break;
    case 's':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
      tok->u.prop.not   = 0;
      break;
    case 'S':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
      tok->u.prop.not   = 1;
      break;
    case 'h':
      if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
      tok->u.prop.not   = 0;
      break;
    case 'H':
      if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
      tok->u.prop.not   = 1;
      break;

    case 'p':
    case 'P':
      if (PEND) break;

      c2 = PPEEK;
      if (c2 == '{' &&
          IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
        PINC;
        tok->type = TK_CHAR_PROPERTY;
        tok->u.prop.not = c == 'P';

        /* a '^' right after the brace inverts the property */
        if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
          PFETCH(c2);
          if (c2 == '^') {
            tok->u.prop.not = tok->u.prop.not == 0;
          }
          else
            PUNFETCH;
        }
      }
      break;

    case 'o':
      if (PEND) break;

      prev = p;
      if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
        PINC;
        r = scan_octal_number(&p, end, 0, 11, enc, &code);
        if (r < 0) return r;
        if (!PEND) {
          c2 = PPEEK;
          if (IS_CODE_DIGIT_ASCII(enc, c2))
            return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
        }

        tok->base_num = 8;
        /* the rest is shared with the hexadecimal brace form below */
        goto brace_code_point_entry;
      }
      break;

    case 'x':
      if (PEND) break;

      prev = p;
      if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
        PINC;
        r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code);
        if (r < 0) return r;
        if (!PEND) {
          c2 = PPEEK;
          if (IS_CODE_XDIGIT_ASCII(enc, c2))
            return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
        }

        tok->base_num = 16;
      brace_code_point_entry:
        /* true only when something was consumed after the opening brace */
        if ((p > prev + enclen(enc, prev))) {
          if (PEND) return ONIGERR_INVALID_CODE_POINT_VALUE;
          if (PPEEK_IS('}')) {
            PINC;
          }
          else {
            int curr_state;

            /* not closed yet: validate the rest of the sequence and mark
               the token so that following calls keep reading code points */
            curr_state = (state == CS_RANGE) ? CPS_EMPTY : CPS_START;
            r = check_code_point_sequence_cc(p, end, tok->base_num, enc,
                                             curr_state);
            if (r < 0) return r;
            if (r == 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
            tok->code_point_continue = TRUE;
          }
          tok->type   = TK_CODE_POINT;
          tok->u.code = code;
        }
        else {
          /* can't read nothing or invalid format */
          p = prev;
        }
      }
      else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
        r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code);
        if (r < 0) return r;
        if (p == prev) {  /* can't read nothing. */
          code = 0; /* but, it's not error */
        }
        tok->type = TK_CRUDE_BYTE;
        tok->base_num = 16;
        tok->u.byte   = (UChar )code;
      }
      break;

    case 'u':
      if (PEND) break;

      prev = p;
      if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
        r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code);
        if (r < 0) return r;
        if (p == prev) {  /* can't read nothing. */
          code = 0; /* but, it's not error */
        }
        tok->type = TK_CODE_POINT;
        tok->base_num = 16;
        tok->u.code   = code;
      }
      break;

    case '0':
    case '1': case '2': case '3': case '4': case '5': case '6': case '7':
      if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
        /* put the digit back so that it is scanned as part of the number */
        PUNFETCH;
        prev = p;
        r = scan_octal_number(&p, end, 0, 3, enc, &code);
        if (r < 0) return r;
        if (code >= 256) return ONIGERR_TOO_BIG_NUMBER;
        if (p == prev) {  /* can't read nothing. */
          code = 0; /* but, it's not error */
        }
        tok->type = TK_CRUDE_BYTE;
        tok->base_num = 8;
        tok->u.byte   = (UChar )code;
      }
      break;

    default:
      PUNFETCH;
      r = fetch_escaped_value(&p, end, env, &c2);
      if (r < 0) return r;
      /* the escape produced a different value than the character itself */
      if (tok->u.code != c2) {
        tok->u.code = c2;
        tok->type   = TK_CODE_POINT;
      }
      break;
    }
  }
  else if (c == '[') {
    if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
      OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
      tok->backp = p; /* point at '[' is read */
      PINC;
      /* a POSIX bracket needs a closing ":]" before the next ']' */
      if (str_exist_check_with_esc(send, 2, p, end,
                                   (OnigCodePoint )']', enc, syn)) {
        tok->type = TK_CC_POSIX_BRACKET_OPEN;
      }
      else {
        PUNFETCH;
        goto cc_in_cc;
      }
    }
    else {
    cc_in_cc:
      if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
        tok->type = TK_CC_OPEN_CC;
      }
      else {
        CC_ESC_WARN(env, (UChar* )"[");
      }
    }
  }
  else if (c == '&') {
    if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
        !PEND && (PPEEK_IS('&'))) {
      PINC;
      tok->type = TK_CC_AND;
    }
  }

 end:
  *src = p;
  return tok->type;
}
5328 
5329 static int
fetch_token(PToken * tok,UChar ** src,UChar * end,ScanEnv * env)5330 fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
5331 {
5332   int r;
5333   OnigCodePoint code;
5334   OnigCodePoint c;
5335   OnigEncoding enc = env->enc;
5336   OnigSyntaxType* syn = env->syntax;
5337   UChar* prev;
5338   UChar* p = *src;
5339   PFETCH_READY;
5340 
5341   if (tok->code_point_continue != 0) {
5342     r = get_next_code_point(&p, end, tok->base_num, enc, FALSE, &code);
5343     if (r == 1) {
5344       tok->code_point_continue = 0;
5345     }
5346     else if (r == 0) {
5347       tok->type   = TK_CODE_POINT;
5348       tok->u.code = code;
5349       goto out;
5350     }
5351     else
5352       return r; /* error */
5353   }
5354 
5355  start:
5356   if (PEND) {
5357     tok->type = TK_EOT;
5358     return tok->type;
5359   }
5360 
5361   tok->type = TK_STRING;
5362   tok->base_num = 0;
5363   tok->backp    = p;
5364 
5365   PFETCH(c);
5366   if (IS_MC_ESC_CODE(c, syn)) {
5367     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
5368 
5369     tok->backp = p;
5370     PFETCH(c);
5371 
5372     tok->u.code = c;
5373     tok->escaped = 1;
5374     switch (c) {
5375     case '*':
5376       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
5377       tok->type = TK_REPEAT;
5378       tok->u.repeat.lower = 0;
5379       tok->u.repeat.upper = INFINITE_REPEAT;
5380       goto greedy_check;
5381       break;
5382 
5383     case '+':
5384       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
5385       tok->type = TK_REPEAT;
5386       tok->u.repeat.lower = 1;
5387       tok->u.repeat.upper = INFINITE_REPEAT;
5388       goto greedy_check;
5389       break;
5390 
5391     case '?':
5392       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
5393       tok->type = TK_REPEAT;
5394       tok->u.repeat.lower = 0;
5395       tok->u.repeat.upper = 1;
5396     greedy_check:
5397       tok->u.repeat.possessive = 0;
5398     greedy_check2:
5399       if (!PEND && PPEEK_IS('?') &&
5400           IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY) &&
5401           tok->u.repeat.possessive == 0) {
5402         PFETCH(c);
5403         tok->u.repeat.greedy = 0;
5404         tok->u.repeat.possessive = 0;
5405       }
5406       else {
5407       possessive_check:
5408         tok->u.repeat.greedy = 1;
5409         if (!PEND && PPEEK_IS('+') &&
5410             ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
5411               tok->type != TK_INTERVAL)  ||
5412              (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
5413               tok->type == TK_INTERVAL)) &&
5414           tok->u.repeat.possessive == 0) {
5415           PFETCH(c);
5416           tok->u.repeat.possessive = 1;
5417         }
5418       }
5419       break;
5420 
5421     case '{':
5422       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
5423       r = fetch_interval(&p, end, tok, env);
5424       if (r < 0) return r;  /* error */
5425       if (r == 0) goto greedy_check2;
5426       else if (r == 2) { /* {n} */
5427         if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
5428           goto possessive_check;
5429 
5430         goto greedy_check2;
5431       }
5432       /* r == 1 : normal char */
5433       break;
5434 
5435     case '|':
5436       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
5437       tok->type = TK_ALT;
5438       break;
5439 
5440     case '(':
5441       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
5442       tok->type = TK_SUBEXP_OPEN;
5443       break;
5444 
5445     case ')':
5446       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
5447       tok->type = TK_SUBEXP_CLOSE;
5448       break;
5449 
5450     case 'w':
5451       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
5452       tok->type = TK_CHAR_TYPE;
5453       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
5454       tok->u.prop.not   = 0;
5455       break;
5456 
5457     case 'W':
5458       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
5459       tok->type = TK_CHAR_TYPE;
5460       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
5461       tok->u.prop.not   = 1;
5462       break;
5463 
5464     case 'b':
5465       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
5466       tok->type = TK_ANCHOR;
5467       tok->u.anchor = ANCR_WORD_BOUNDARY;
5468       break;
5469 
5470     case 'B':
5471       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
5472       tok->type = TK_ANCHOR;
5473       tok->u.anchor = ANCR_NO_WORD_BOUNDARY;
5474       break;
5475 
5476     case 'y':
5477       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break;
5478       tok->type = TK_ANCHOR;
5479       tok->u.anchor = ANCR_TEXT_SEGMENT_BOUNDARY;
5480       break;
5481 
5482     case 'Y':
5483       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break;
5484       tok->type = TK_ANCHOR;
5485       tok->u.anchor = ANCR_NO_TEXT_SEGMENT_BOUNDARY;
5486       break;
5487 
5488 #ifdef USE_WORD_BEGIN_END
5489     case '<':
5490       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
5491       tok->type = TK_ANCHOR;
5492       tok->u.anchor = ANCR_WORD_BEGIN;
5493       break;
5494 
5495     case '>':
5496       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
5497       tok->type = TK_ANCHOR;
5498       tok->u.anchor = ANCR_WORD_END;
5499       break;
5500 #endif
5501 
5502     case 's':
5503       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
5504       tok->type = TK_CHAR_TYPE;
5505       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
5506       tok->u.prop.not   = 0;
5507       break;
5508 
5509     case 'S':
5510       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
5511       tok->type = TK_CHAR_TYPE;
5512       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
5513       tok->u.prop.not   = 1;
5514       break;
5515 
5516     case 'd':
5517       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
5518       tok->type = TK_CHAR_TYPE;
5519       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
5520       tok->u.prop.not   = 0;
5521       break;
5522 
5523     case 'D':
5524       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
5525       tok->type = TK_CHAR_TYPE;
5526       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
5527       tok->u.prop.not   = 1;
5528       break;
5529 
5530     case 'h':
5531       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
5532       tok->type = TK_CHAR_TYPE;
5533       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
5534       tok->u.prop.not   = 0;
5535       break;
5536 
5537     case 'H':
5538       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
5539       tok->type = TK_CHAR_TYPE;
5540       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
5541       tok->u.prop.not   = 1;
5542       break;
5543 
5544     case 'K':
5545       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) break;
5546       tok->type = TK_KEEP;
5547       break;
5548 
5549     case 'R':
5550       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE)) break;
5551       tok->type = TK_GENERAL_NEWLINE;
5552       break;
5553 
5554     case 'N':
5555       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT)) break;
5556       tok->type = TK_NO_NEWLINE;
5557       break;
5558 
5559     case 'O':
5560       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT)) break;
5561       tok->type = TK_TRUE_ANYCHAR;
5562       break;
5563 
5564     case 'X':
5565       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break;
5566       tok->type = TK_TEXT_SEGMENT;
5567       break;
5568 
5569     case 'A':
5570       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
5571     begin_buf:
5572       tok->type = TK_ANCHOR;
5573       tok->u.subtype = ANCR_BEGIN_BUF;
5574       break;
5575 
5576     case 'Z':
5577       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
5578       tok->type = TK_ANCHOR;
5579       tok->u.subtype = ANCR_SEMI_END_BUF;
5580       break;
5581 
5582     case 'z':
5583       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
5584     end_buf:
5585       tok->type = TK_ANCHOR;
5586       tok->u.subtype = ANCR_END_BUF;
5587       break;
5588 
5589     case 'G':
5590       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
5591       tok->type = TK_ANCHOR;
5592       tok->u.subtype = ANCR_BEGIN_POSITION;
5593       break;
5594 
5595     case '`':
5596       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
5597       goto begin_buf;
5598       break;
5599 
5600     case '\'':
5601       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
5602       goto end_buf;
5603       break;
5604 
5605     case 'o':
5606       if (PEND) break;
5607 
5608       prev = p;
5609       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
5610         PINC;
5611         r = scan_octal_number(&p, end, 0, 11, enc, &code);
5612         if (r < 0) return r;
5613         if (!PEND) {
5614           if (IS_CODE_DIGIT_ASCII(enc, PPEEK))
5615             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
5616         }
5617 
5618         tok->base_num = 8;
5619         goto brace_code_point_entry;
5620       }
5621       break;
5622 
5623     case 'x':
5624       if (PEND) break;
5625 
5626       prev = p;
5627       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
5628         PINC;
5629         r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code);
5630         if (r < 0) return r;
5631         if (!PEND) {
5632           if (IS_CODE_XDIGIT_ASCII(enc, PPEEK))
5633             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
5634         }
5635 
5636         tok->base_num = 16;
5637       brace_code_point_entry:
5638         if ((p > prev + enclen(enc, prev))) {
5639           if (PEND) return ONIGERR_INVALID_CODE_POINT_VALUE;
5640           if (PPEEK_IS('}')) {
5641             PINC;
5642           }
5643           else {
5644             r = check_code_point_sequence(p, end, tok->base_num, enc);
5645             if (r < 0) return r;
5646             if (r == 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
5647             tok->code_point_continue = TRUE;
5648           }
5649           tok->type   = TK_CODE_POINT;
5650           tok->u.code = code;
5651         }
5652         else {
5653           /* can't read nothing or invalid format */
5654           p = prev;
5655         }
5656       }
5657       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
5658         r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code);
5659         if (r < 0) return r;
5660         if (p == prev) {  /* can't read nothing. */
5661           code = 0; /* but, it's not error */
5662         }
5663         tok->type = TK_CRUDE_BYTE;
5664         tok->base_num = 16;
5665         tok->u.byte   = (UChar )code;
5666       }
5667       break;
5668 
5669     case 'u':
5670       if (PEND) break;
5671 
5672       prev = p;
5673       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
5674         r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code);
5675         if (r < 0) return r;
5676         if (p == prev) {  /* can't read nothing. */
5677           code = 0; /* but, it's not error */
5678         }
5679         tok->type = TK_CODE_POINT;
5680         tok->base_num = 16;
5681         tok->u.code   = code;
5682       }
5683       break;
5684 
5685     case '1': case '2': case '3': case '4':
5686     case '5': case '6': case '7': case '8': case '9':
5687       PUNFETCH;
5688       prev = p;
5689       r = scan_number(&p, end, enc);
5690       if (r < 0 || r > ONIG_MAX_BACKREF_NUM) {
5691         goto skip_backref;
5692       }
5693 
5694       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
5695           (r <= env->num_mem || r <= 9)) { /* This spec. from GNU regex */
5696         if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5697           if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node))
5698             return ONIGERR_INVALID_BACKREF;
5699         }
5700 
5701         tok->type = TK_BACKREF;
5702         tok->u.backref.num     = 1;
5703         tok->u.backref.ref1    = r;
5704         tok->u.backref.by_name = 0;
5705 #ifdef USE_BACKREF_WITH_LEVEL
5706         tok->u.backref.exist_level = 0;
5707 #endif
5708         break;
5709       }
5710 
5711     skip_backref:
5712       if (c == '8' || c == '9') {
5713         /* normal char */
5714         p = prev; PINC;
5715         break;
5716       }
5717 
5718       p = prev;
5719       /* fall through */
5720     case '0':
5721       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
5722         prev = p;
5723         r = scan_octal_number(&p, end, 0, (c == '0' ? 2:3), enc, &code);
5724         if (r < 0 || r >= 256) return ONIGERR_TOO_BIG_NUMBER;
5725         if (p == prev) {  /* can't read nothing. */
5726           code = 0; /* but, it's not error */
5727         }
5728         tok->type = TK_CRUDE_BYTE;
5729         tok->base_num = 8;
5730         tok->u.byte   = (UChar )code;
5731       }
5732       else if (c != '0') {
5733         PINC;
5734       }
5735       break;
5736 
5737     case 'k':
5738       if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
5739         PFETCH(c);
5740         if (c == '<' || c == '\'') {
5741           UChar* name_end;
5742           int* backs;
5743           int back_num;
5744           enum REF_NUM num_type;
5745 
5746           prev = p;
5747 
5748 #ifdef USE_BACKREF_WITH_LEVEL
5749           name_end = NULL_UCHARP; /* no need. escape gcc warning. */
5750           r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
5751                                  env, &back_num, &tok->u.backref.level, &num_type);
5752           if (r == 1) tok->u.backref.exist_level = 1;
5753           else        tok->u.backref.exist_level = 0;
5754 #else
5755           r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, TRUE);
5756 #endif
5757           if (r < 0) return r;
5758 
5759           if (num_type != IS_NOT_NUM) {
5760             if (num_type == IS_REL_NUM) {
5761               back_num = backref_rel_to_abs(back_num, env);
5762             }
5763             if (back_num <= 0)
5764               return ONIGERR_INVALID_BACKREF;
5765 
5766             if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5767               if (back_num > env->num_mem ||
5768                   IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node))
5769                 return ONIGERR_INVALID_BACKREF;
5770             }
5771             tok->type = TK_BACKREF;
5772             tok->u.backref.by_name = 0;
5773             tok->u.backref.num  = 1;
5774             tok->u.backref.ref1 = back_num;
5775           }
5776           else {
5777             int num = name_to_group_numbers(env, prev, name_end, &backs);
5778             if (num <= 0) {
5779               return ONIGERR_UNDEFINED_NAME_REFERENCE;
5780             }
5781             if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5782               int i;
5783               for (i = 0; i < num; i++) {
5784                 if (backs[i] > env->num_mem ||
5785                     IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node))
5786                   return ONIGERR_INVALID_BACKREF;
5787               }
5788             }
5789 
5790             tok->type = TK_BACKREF;
5791             tok->u.backref.by_name = 1;
5792             if (num == 1) {
5793               tok->u.backref.num  = 1;
5794               tok->u.backref.ref1 = backs[0];
5795             }
5796             else {
5797               tok->u.backref.num  = num;
5798               tok->u.backref.refs = backs;
5799             }
5800           }
5801         }
5802         else
5803           PUNFETCH;
5804       }
5805       break;
5806 
5807 #ifdef USE_CALL
5808     case 'g':
5809       if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
5810         PFETCH(c);
5811         if (c == '<' || c == '\'') {
5812           int gnum;
5813           UChar* name_end;
5814           enum REF_NUM num_type;
5815 
5816           prev = p;
5817           r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env,
5818                          &gnum, &num_type, TRUE);
5819           if (r < 0) return r;
5820 
5821           if (num_type != IS_NOT_NUM) {
5822             if (num_type == IS_REL_NUM) {
5823               gnum = backref_rel_to_abs(gnum, env);
5824               if (gnum < 0) {
5825                 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
5826                                                prev, name_end);
5827                 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
5828               }
5829             }
5830             tok->u.call.by_number = 1;
5831             tok->u.call.gnum      = gnum;
5832           }
5833           else {
5834             tok->u.call.by_number = 0;
5835             tok->u.call.gnum      = 0;
5836           }
5837 
5838           tok->type = TK_CALL;
5839           tok->u.call.name     = prev;
5840           tok->u.call.name_end = name_end;
5841         }
5842         else
5843           PUNFETCH;
5844       }
5845       break;
5846 #endif
5847 
5848     case 'Q':
5849       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
5850         tok->type = TK_QUOTE_OPEN;
5851       }
5852       break;
5853 
5854     case 'p':
5855     case 'P':
5856       if (!PEND && PPEEK_IS('{') &&
5857           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
5858         PINC;
5859         tok->type = TK_CHAR_PROPERTY;
5860         tok->u.prop.not = c == 'P';
5861 
5862         if (!PEND &&
5863             IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
5864           PFETCH(c);
5865           if (c == '^') {
5866             tok->u.prop.not = tok->u.prop.not == 0;
5867           }
5868           else
5869             PUNFETCH;
5870         }
5871       }
5872       break;
5873 
5874     default:
5875       {
5876         OnigCodePoint c2;
5877 
5878         PUNFETCH;
5879         r = fetch_escaped_value(&p, end, env, &c2);
5880         if (r < 0) return r;
5881         if (tok->u.code != c2) {
5882           tok->type = TK_CODE_POINT;
5883           tok->u.code = c2;
5884         }
5885         else { /* string */
5886           p = tok->backp + enclen(enc, tok->backp);
5887         }
5888       }
5889       break;
5890     }
5891   }
5892   else {
5893     tok->u.code = c;
5894     tok->escaped = 0;
5895 
5896 #ifdef USE_VARIABLE_META_CHARS
5897     if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
5898         IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
5899       if (c == MC_ANYCHAR(syn))
5900         goto any_char;
5901       else if (c == MC_ANYTIME(syn))
5902         goto any_time;
5903       else if (c == MC_ZERO_OR_ONE_TIME(syn))
5904         goto zero_or_one_time;
5905       else if (c == MC_ONE_OR_MORE_TIME(syn))
5906         goto one_or_more_time;
5907       else if (c == MC_ANYCHAR_ANYTIME(syn)) {
5908         tok->type = TK_ANYCHAR_ANYTIME;
5909         goto out;
5910       }
5911     }
5912 #endif
5913 
5914     switch (c) {
5915     case '.':
5916       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
5917 #ifdef USE_VARIABLE_META_CHARS
5918     any_char:
5919 #endif
5920       tok->type = TK_ANYCHAR;
5921       break;
5922 
5923     case '*':
5924       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
5925 #ifdef USE_VARIABLE_META_CHARS
5926     any_time:
5927 #endif
5928       tok->type = TK_REPEAT;
5929       tok->u.repeat.lower = 0;
5930       tok->u.repeat.upper = INFINITE_REPEAT;
5931       goto greedy_check;
5932       break;
5933 
5934     case '+':
5935       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
5936 #ifdef USE_VARIABLE_META_CHARS
5937     one_or_more_time:
5938 #endif
5939       tok->type = TK_REPEAT;
5940       tok->u.repeat.lower = 1;
5941       tok->u.repeat.upper = INFINITE_REPEAT;
5942       goto greedy_check;
5943       break;
5944 
5945     case '?':
5946       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
5947 #ifdef USE_VARIABLE_META_CHARS
5948     zero_or_one_time:
5949 #endif
5950       tok->type = TK_REPEAT;
5951       tok->u.repeat.lower = 0;
5952       tok->u.repeat.upper = 1;
5953       goto greedy_check;
5954       break;
5955 
5956     case '{':
5957       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
5958       r = fetch_interval(&p, end, tok, env);
5959       if (r < 0) return r;  /* error */
5960       if (r == 0) goto greedy_check2;
5961       else if (r == 2) { /* {n} */
5962         if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
5963           goto possessive_check;
5964 
5965         goto greedy_check2;
5966       }
5967       /* r == 1 : normal char */
5968       break;
5969 
5970     case '|':
5971       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
5972       tok->type = TK_ALT;
5973       break;
5974 
5975     case '(':
5976       if (!PEND && PPEEK_IS('?') &&
5977           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
5978         PINC;
5979         if (! PEND) {
5980           c = PPEEK;
5981           if (c == '#') {
5982             PFETCH(c);
5983             while (1) {
5984               if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
5985               PFETCH(c);
5986               if (c == MC_ESC(syn)) {
5987                 if (! PEND) PFETCH(c);
5988               }
5989               else {
5990                 if (c == ')') break;
5991               }
5992             }
5993             goto start;
5994           }
5995           else if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_PERL_SUBEXP_CALL)) {
5996             int gnum;
5997             UChar* name;
5998             UChar* name_end;
5999             enum REF_NUM num_type;
6000 
6001             switch (c) {
6002             case '&':
6003               {
6004                 PINC;
6005                 name = p;
6006                 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env,
6007                                &gnum, &num_type, FALSE);
6008                 if (r < 0) return r;
6009 
6010                 tok->type = TK_CALL;
6011                 tok->u.call.by_number = 0;
6012                 tok->u.call.gnum      = 0;
6013                 tok->u.call.name      = name;
6014                 tok->u.call.name_end  = name_end;
6015               }
6016               break;
6017 
6018             case 'R':
6019               tok->type = TK_CALL;
6020               tok->u.call.by_number = 1;
6021               tok->u.call.gnum      = 0;
6022               tok->u.call.name      = p;
6023               PINC;
6024               if (! PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION;
6025               tok->u.call.name_end  = p;
6026               break;
6027 
6028             case '-':
6029             case '+':
6030               goto lparen_qmark_num;
6031               break;
6032             default:
6033               if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto lparen_qmark_end;
6034 
6035             lparen_qmark_num:
6036               {
6037                 name = p;
6038                 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env,
6039                                &gnum, &num_type, TRUE);
6040                 if (r < 0) return r;
6041 
6042                 if (num_type == IS_NOT_NUM) {
6043                   return ONIGERR_INVALID_GROUP_NAME;
6044                 }
6045                 else {
6046                   if (num_type == IS_REL_NUM) {
6047                     gnum = backref_rel_to_abs(gnum, env);
6048                     if (gnum < 0) {
6049                       onig_scan_env_set_error_string(env,
6050                              ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end);
6051                       return ONIGERR_UNDEFINED_GROUP_REFERENCE;
6052                     }
6053                   }
6054                   tok->u.call.by_number = 1;
6055                   tok->u.call.gnum      = gnum;
6056                 }
6057 
6058                 tok->type = TK_CALL;
6059                 tok->u.call.name     = name;
6060                 tok->u.call.name_end = name_end;
6061               }
6062               break;
6063             }
6064           }
6065         }
6066       lparen_qmark_end:
6067         PUNFETCH;
6068       }
6069 
6070       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
6071       tok->type = TK_SUBEXP_OPEN;
6072       break;
6073 
6074     case ')':
6075       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
6076       tok->type = TK_SUBEXP_CLOSE;
6077       break;
6078 
6079     case '^':
6080       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
6081       tok->type = TK_ANCHOR;
6082       tok->u.subtype = (OPTON_SINGLELINE(env->options)
6083                         ? ANCR_BEGIN_BUF : ANCR_BEGIN_LINE);
6084       break;
6085 
6086     case '$':
6087       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
6088       tok->type = TK_ANCHOR;
6089       tok->u.subtype = (OPTON_SINGLELINE(env->options)
6090                         ? ANCR_SEMI_END_BUF : ANCR_END_LINE);
6091       break;
6092 
6093     case '[':
6094       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
6095       tok->type = TK_OPEN_CC;
6096       break;
6097 
6098     case ']':
6099       if (*src > env->pattern)   /* /].../ is allowed. */
6100         CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
6101       break;
6102 
6103     case '#':
6104       if (OPTON_EXTEND(env->options)) {
6105         while (!PEND) {
6106           PFETCH(c);
6107           if (ONIGENC_IS_CODE_NEWLINE(enc, c))
6108             break;
6109         }
6110         goto start;
6111         break;
6112       }
6113       break;
6114 
6115     case ' ': case '\t': case '\n': case '\r': case '\f':
6116       if (OPTON_EXTEND(env->options))
6117         goto start;
6118       break;
6119 
6120     default:
6121       /* string */
6122       break;
6123     }
6124   }
6125 
6126  out:
6127   *src = p;
6128   return tok->type;
6129 }
6130 
6131 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])6132 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
6133                          OnigEncoding enc ARG_UNUSED, OnigCodePoint sb_out,
6134                          const OnigCodePoint mbr[])
6135 {
6136   int i, r;
6137   OnigCodePoint j;
6138 
6139   int n = ONIGENC_CODE_RANGE_NUM(mbr);
6140 
6141   if (not == 0) {
6142     for (i = 0; i < n; i++) {
6143       for (j  = ONIGENC_CODE_RANGE_FROM(mbr, i);
6144            j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
6145         if (j >= sb_out) {
6146           if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
6147             r = add_code_range_to_buf(&(cc->mbuf), j,
6148                                       ONIGENC_CODE_RANGE_TO(mbr, i));
6149             if (r != 0) return r;
6150             i++;
6151           }
6152 
6153           goto sb_end;
6154         }
6155         BITSET_SET_BIT(cc->bs, j);
6156       }
6157     }
6158 
6159   sb_end:
6160     for ( ; i < n; i++) {
6161       r = add_code_range_to_buf(&(cc->mbuf),
6162                                 ONIGENC_CODE_RANGE_FROM(mbr, i),
6163                                 ONIGENC_CODE_RANGE_TO(mbr, i));
6164       if (r != 0) return r;
6165     }
6166   }
6167   else {
6168     OnigCodePoint prev = 0;
6169 
6170     for (i = 0; i < n; i++) {
6171       for (j = prev; j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
6172         if (j >= sb_out) {
6173           goto sb_end2;
6174         }
6175         BITSET_SET_BIT(cc->bs, j);
6176       }
6177       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
6178     }
6179     for (j = prev; j < sb_out; j++) {
6180       BITSET_SET_BIT(cc->bs, j);
6181     }
6182 
6183   sb_end2:
6184     prev = sb_out;
6185 
6186     for (i = 0; i < n; i++) {
6187       if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
6188         r = add_code_range_to_buf(&(cc->mbuf), prev,
6189                                   ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
6190         if (r != 0) return r;
6191       }
6192       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
6193       if (prev == 0) goto end;
6194     }
6195 
6196     r = add_code_range_to_buf(&(cc->mbuf), prev, MAX_CODE_POINT);
6197     if (r != 0) return r;
6198   }
6199 
6200  end:
6201   return 0;
6202 }
6203 
6204 static int
add_ctype_to_cc_by_range_limit(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[],OnigCodePoint limit)6205 add_ctype_to_cc_by_range_limit(CClassNode* cc, int ctype ARG_UNUSED, int not,
6206                                OnigEncoding enc ARG_UNUSED,
6207                                OnigCodePoint sb_out,
6208                                const OnigCodePoint mbr[], OnigCodePoint limit)
6209 {
6210   int i, r;
6211   OnigCodePoint j;
6212   OnigCodePoint from;
6213   OnigCodePoint to;
6214 
6215   int n = ONIGENC_CODE_RANGE_NUM(mbr);
6216 
6217   if (not == 0) {
6218     for (i = 0; i < n; i++) {
6219       for (j  = ONIGENC_CODE_RANGE_FROM(mbr, i);
6220            j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
6221         if (j > limit) goto end;
6222         if (j >= sb_out) {
6223           if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
6224             to = ONIGENC_CODE_RANGE_TO(mbr, i);
6225             if (to > limit) to = limit;
6226             r = add_code_range_to_buf(&(cc->mbuf), j, to);
6227             if (r != 0) return r;
6228             i++;
6229           }
6230 
6231           goto sb_end;
6232         }
6233         BITSET_SET_BIT(cc->bs, j);
6234       }
6235     }
6236 
6237   sb_end:
6238     for ( ; i < n; i++) {
6239       from = ONIGENC_CODE_RANGE_FROM(mbr, i);
6240       to   = ONIGENC_CODE_RANGE_TO(mbr, i);
6241       if (from > limit) break;
6242       if (to   > limit) to = limit;
6243       r = add_code_range_to_buf(&(cc->mbuf), from, to);
6244       if (r != 0) return r;
6245     }
6246   }
6247   else {
6248     OnigCodePoint prev = 0;
6249 
6250     for (i = 0; i < n; i++) {
6251       from = ONIGENC_CODE_RANGE_FROM(mbr, i);
6252       if (from > limit) {
6253         for (j = prev; j < sb_out; j++) {
6254           BITSET_SET_BIT(cc->bs, j);
6255         }
6256         goto sb_end2;
6257       }
6258       for (j = prev; j < from; j++) {
6259         if (j >= sb_out) goto sb_end2;
6260         BITSET_SET_BIT(cc->bs, j);
6261       }
6262       prev = ONIGENC_CODE_RANGE_TO(mbr, i);
6263       if (prev > limit) prev = limit;
6264       prev++;
6265       if (prev == 0) goto end;
6266     }
6267     for (j = prev; j < sb_out; j++) {
6268       BITSET_SET_BIT(cc->bs, j);
6269     }
6270 
6271   sb_end2:
6272     prev = sb_out;
6273 
6274     for (i = 0; i < n; i++) {
6275       from = ONIGENC_CODE_RANGE_FROM(mbr, i);
6276       if (from > limit) goto last;
6277 
6278       if (prev < from) {
6279         r = add_code_range_to_buf(&(cc->mbuf), prev, from - 1);
6280         if (r != 0) return r;
6281       }
6282       prev = ONIGENC_CODE_RANGE_TO(mbr, i);
6283       if (prev > limit) prev = limit;
6284       prev++;
6285       if (prev == 0) goto end;
6286     }
6287 
6288   last:
6289     r = add_code_range_to_buf(&(cc->mbuf), prev, MAX_CODE_POINT);
6290     if (r != 0) return r;
6291   }
6292 
6293  end:
6294   return 0;
6295 }
6296 
6297 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)6298 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
6299 {
6300   int c, r;
6301   int ascii_mode;
6302   int is_single;
6303   const OnigCodePoint *ranges;
6304   OnigCodePoint limit;
6305   OnigCodePoint sb_out;
6306   OnigEncoding enc = env->enc;
6307 
6308   ascii_mode = OPTON_IS_ASCII_MODE_CTYPE(ctype, env->options);
6309 
6310   r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
6311   if (r == 0) {
6312     if (ascii_mode == 0)
6313       r = add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
6314     else
6315       r = add_ctype_to_cc_by_range_limit(cc, ctype, not, env->enc, sb_out,
6316                                          ranges, ASCII_LIMIT);
6317     return r;
6318   }
6319   else if (r != ONIG_NO_SUPPORT_CONFIG) {
6320     return r;
6321   }
6322 
6323   r = 0;
6324   is_single = ONIGENC_IS_SINGLEBYTE(enc);
6325   limit = ascii_mode ? ASCII_LIMIT : SINGLE_BYTE_SIZE;
6326 
6327   switch (ctype) {
6328   case ONIGENC_CTYPE_ALPHA:
6329   case ONIGENC_CTYPE_BLANK:
6330   case ONIGENC_CTYPE_CNTRL:
6331   case ONIGENC_CTYPE_DIGIT:
6332   case ONIGENC_CTYPE_LOWER:
6333   case ONIGENC_CTYPE_PUNCT:
6334   case ONIGENC_CTYPE_SPACE:
6335   case ONIGENC_CTYPE_UPPER:
6336   case ONIGENC_CTYPE_XDIGIT:
6337   case ONIGENC_CTYPE_ASCII:
6338   case ONIGENC_CTYPE_ALNUM:
6339     if (not != 0) {
6340       for (c = 0; c < (int )limit; c++) {
6341         if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) {
6342           if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
6343             BITSET_SET_BIT(cc->bs, c);
6344         }
6345       }
6346       for (c = limit; c < SINGLE_BYTE_SIZE; c++) {
6347         if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
6348           BITSET_SET_BIT(cc->bs, c);
6349       }
6350 
6351       if (is_single == 0)
6352         ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
6353     }
6354     else {
6355       for (c = 0; c < (int )limit; c++) {
6356         if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) {
6357           if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
6358             BITSET_SET_BIT(cc->bs, c);
6359         }
6360       }
6361     }
6362     break;
6363 
6364   case ONIGENC_CTYPE_GRAPH:
6365   case ONIGENC_CTYPE_PRINT:
6366   case ONIGENC_CTYPE_WORD:
6367     if (not != 0) {
6368       for (c = 0; c < (int )limit; c++) {
6369         /* check invalid code point */
6370         if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
6371             && ! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
6372           BITSET_SET_BIT(cc->bs, c);
6373       }
6374       for (c = limit; c < SINGLE_BYTE_SIZE; c++) {
6375         if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
6376           BITSET_SET_BIT(cc->bs, c);
6377       }
6378       if (ascii_mode != 0 && is_single == 0)
6379         ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
6380     }
6381     else {
6382       for (c = 0; c < (int )limit; c++) {
6383         if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
6384             && ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
6385           BITSET_SET_BIT(cc->bs, c);
6386       }
6387       if (ascii_mode == 0 && is_single == 0)
6388         ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
6389     }
6390     break;
6391 
6392   default:
6393     return ONIGERR_PARSER_BUG;
6394     break;
6395   }
6396 
6397   return r;
6398 }
6399 
6400 static int
prs_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)6401 prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
6402 {
6403 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH  20
6404 #define POSIX_BRACKET_NAME_MIN_LEN         4
6405 
6406   static PosixBracketEntryType PBS[] = {
6407     { (UChar* )"alnum",  ONIGENC_CTYPE_ALNUM,  5 },
6408     { (UChar* )"alpha",  ONIGENC_CTYPE_ALPHA,  5 },
6409     { (UChar* )"blank",  ONIGENC_CTYPE_BLANK,  5 },
6410     { (UChar* )"cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
6411     { (UChar* )"digit",  ONIGENC_CTYPE_DIGIT,  5 },
6412     { (UChar* )"graph",  ONIGENC_CTYPE_GRAPH,  5 },
6413     { (UChar* )"lower",  ONIGENC_CTYPE_LOWER,  5 },
6414     { (UChar* )"print",  ONIGENC_CTYPE_PRINT,  5 },
6415     { (UChar* )"punct",  ONIGENC_CTYPE_PUNCT,  5 },
6416     { (UChar* )"space",  ONIGENC_CTYPE_SPACE,  5 },
6417     { (UChar* )"upper",  ONIGENC_CTYPE_UPPER,  5 },
6418     { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
6419     { (UChar* )"ascii",  ONIGENC_CTYPE_ASCII,  5 },
6420     { (UChar* )"word",   ONIGENC_CTYPE_WORD,   4 },
6421     { (UChar* )NULL,     -1, 0 }
6422   };
6423 
6424   PosixBracketEntryType *pb;
6425   int not, i, r;
6426   OnigCodePoint c;
6427   OnigEncoding enc = env->enc;
6428   UChar *p = *src;
6429 
6430   if (PPEEK_IS('^')) {
6431     PINC_S;
6432     not = 1;
6433   }
6434   else
6435     not = 0;
6436 
6437   if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
6438     goto not_posix_bracket;
6439 
6440   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
6441     if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
6442       p = (UChar* )onigenc_step(enc, p, end, pb->len);
6443       if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
6444         return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
6445 
6446       r = add_ctype_to_cc(cc, pb->ctype, not, env);
6447       if (r != 0) return r;
6448 
6449       PINC_S; PINC_S;
6450       *src = p;
6451       return 0;
6452     }
6453   }
6454 
6455  not_posix_bracket:
6456   c = 0;
6457   i = 0;
6458   while (!PEND && ((c = PPEEK) != ':') && c != ']') {
6459     PINC_S;
6460     if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
6461   }
6462   if (c == ':' && ! PEND) {
6463     PINC_S;
6464     if (! PEND) {
6465       PFETCH_S(c);
6466       if (c == ']')
6467         return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
6468     }
6469   }
6470 
6471   return 1;  /* 1: is not POSIX bracket, but no error. */
6472 }
6473 
6474 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)6475 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
6476 {
6477   int r;
6478   OnigCodePoint c;
6479   OnigEncoding enc;
6480   UChar *prev, *start, *p;
6481 
6482   p = *src;
6483   enc = env->enc;
6484   r = ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
6485   start = prev = p;
6486 
6487   while (!PEND) {
6488     prev = p;
6489     PFETCH_S(c);
6490     if (c == '}') {
6491       r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
6492       if (r >= 0) {
6493         *src = p;
6494       }
6495       else {
6496         onig_scan_env_set_error_string(env, r, *src, prev);
6497       }
6498 
6499       return r;
6500     }
6501     else if (c == '(' || c == ')' || c == '{' || c == '|') {
6502       break;
6503     }
6504   }
6505 
6506   return r;
6507 }
6508 
6509 static int
prs_char_property(Node ** np,PToken * tok,UChar ** src,UChar * end,ScanEnv * env)6510 prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)
6511 {
6512   int r, ctype;
6513   CClassNode* cc;
6514 
6515   ctype = fetch_char_property_to_ctype(src, end, env);
6516   if (ctype < 0) return ctype;
6517 
6518   *np = node_new_cclass();
6519   CHECK_NULL_RETURN_MEMERR(*np);
6520   cc = CCLASS_(*np);
6521   r = add_ctype_to_cc(cc, ctype, FALSE, env);
6522   if (r != 0) return r;
6523   if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
6524 
6525   return 0;
6526 }
6527 
6528 
6529 static int
cc_cprop_next(CClassNode * cc,OnigCodePoint * pcode,CVAL * val,CSTATE * state,ScanEnv * env)6530 cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state,
6531               ScanEnv* env)
6532 {
6533   int r;
6534 
6535   if (*state == CS_RANGE)
6536     return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
6537 
6538   if (*state == CS_VALUE) {
6539     if (*val == CV_SB)
6540       BITSET_SET_BIT(cc->bs, (int )(*pcode));
6541     else if (*val == CV_MB) {
6542       r = add_code_range(&(cc->mbuf), env, *pcode, *pcode);
6543       if (r < 0) return r;
6544     }
6545   }
6546 
6547   *state = CS_VALUE;
6548   *val   = CV_CPROP;
6549   return 0;
6550 }
6551 
6552 static int
cc_char_next(CClassNode * cc,OnigCodePoint * from,OnigCodePoint to,int * from_raw,int to_raw,CVAL intype,CVAL * type,CSTATE * state,ScanEnv * env)6553 cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to,
6554              int* from_raw, int to_raw, CVAL intype, CVAL* type,
6555              CSTATE* state, ScanEnv* env)
6556 {
6557   int r;
6558 
6559   switch (*state) {
6560   case CS_VALUE:
6561     if (*type == CV_SB) {
6562       if (*from > 0xff)
6563           return ONIGERR_INVALID_CODE_POINT_VALUE;
6564 
6565       BITSET_SET_BIT(cc->bs, (int )(*from));
6566     }
6567     else if (*type == CV_MB) {
6568       r = add_code_range(&(cc->mbuf), env, *from, *from);
6569       if (r < 0) return r;
6570     }
6571     break;
6572 
6573   case CS_RANGE:
6574     if (intype == *type) {
6575       if (intype == CV_SB) {
6576         if (*from > 0xff || to > 0xff)
6577           return ONIGERR_INVALID_CODE_POINT_VALUE;
6578 
6579         if (*from > to) {
6580           if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
6581             goto ccs_range_end;
6582           else
6583             return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
6584         }
6585         bitset_set_range(cc->bs, (int )*from, (int )to);
6586       }
6587       else {
6588         r = add_code_range(&(cc->mbuf), env, *from, to);
6589         if (r < 0) return r;
6590       }
6591     }
6592     else {
6593       if (*from > to) {
6594         if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
6595           goto ccs_range_end;
6596         else
6597           return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
6598       }
6599       bitset_set_range(cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff));
6600       r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*from, to);
6601       if (r < 0) return r;
6602     }
6603   ccs_range_end:
6604     *state = CS_COMPLETE;
6605     break;
6606 
6607   case CS_COMPLETE:
6608   case CS_START:
6609     *state = CS_VALUE;
6610     break;
6611 
6612   default:
6613     break;
6614   }
6615 
6616   *from_raw = to_raw;
6617   *from     = to;
6618   *type     = intype;
6619   return 0;
6620 }
6621 
6622 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)6623 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
6624                  ScanEnv* env)
6625 {
6626   int in_esc;
6627   OnigCodePoint code;
6628   OnigEncoding enc = env->enc;
6629   UChar* p = from;
6630 
6631   in_esc = 0;
6632   while (! PEND) {
6633     if (ignore_escaped && in_esc) {
6634       in_esc = 0;
6635     }
6636     else {
6637       PFETCH_S(code);
6638       if (code == c) return 1;
6639       if (code == MC_ESC(env->syntax)) in_esc = 1;
6640     }
6641   }
6642   return 0;
6643 }
6644 
6645 static int
prs_cc(Node ** np,PToken * tok,UChar ** src,UChar * end,ScanEnv * env)6646 prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)
6647 {
6648   int r, neg, len, fetched, and_start;
6649   OnigCodePoint in_code, curr_code;
6650   UChar *p;
6651   Node* node;
6652   CClassNode *cc, *prev_cc;
6653   CClassNode work_cc;
6654   int curr_raw, in_raw;
6655   CSTATE state;
6656   CVAL in_type;
6657   CVAL curr_type;
6658 
6659   *np = NULL_NODE;
6660   INC_PARSE_DEPTH(env->parse_depth);
6661 
6662   state = CS_START;
6663   prev_cc = (CClassNode* )NULL;
6664   r = fetch_token_cc(tok, src, end, env, state);
6665   if (r == TK_CHAR && tok->u.code == (OnigCodePoint )'^' && tok->escaped == 0) {
6666     neg = 1;
6667     r = fetch_token_cc(tok, src, end, env, state);
6668   }
6669   else {
6670     neg = 0;
6671   }
6672 
6673   if (r < 0) return r;
6674   if (r == TK_CC_CLOSE) {
6675     if (! code_exist_check((OnigCodePoint )']',
6676                            *src, env->pattern_end, 1, env))
6677       return ONIGERR_EMPTY_CHAR_CLASS;
6678 
6679     CC_ESC_WARN(env, (UChar* )"]");
6680     r = tok->type = TK_CHAR;  /* allow []...] */
6681   }
6682 
6683   *np = node = node_new_cclass();
6684   CHECK_NULL_RETURN_MEMERR(node);
6685   cc = CCLASS_(node);
6686 
6687   and_start = 0;
6688   curr_type = CV_UNDEF;
6689 
6690   p = *src;
6691   while (r != TK_CC_CLOSE) {
6692     fetched = 0;
6693     switch (r) {
6694     case TK_CHAR:
6695     any_char_in:
6696       len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code);
6697       if (len < 0) {
6698         r = len;
6699         goto err;
6700       }
6701       in_type = (len == 1) ? CV_SB : CV_MB;
6702       in_code = tok->u.code;
6703       in_raw = 0;
6704       goto val_entry2;
6705       break;
6706 
6707     case TK_CRUDE_BYTE:
6708       /* tok->base_num != 0 : octal or hexadec. */
6709       if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base_num != 0) {
6710         int i, j;
6711         UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
6712         UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
6713         UChar* psave = p;
6714         int base_num = tok->base_num;
6715 
6716         buf[0] = tok->u.byte;
6717         for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
6718           r = fetch_token_cc(tok, &p, end, env, CS_COMPLETE);
6719           if (r < 0) goto err;
6720           if (r != TK_CRUDE_BYTE || tok->base_num != base_num) {
6721             fetched = 1;
6722             break;
6723           }
6724           buf[i] = tok->u.byte;
6725         }
6726 
6727         if (i < ONIGENC_MBC_MINLEN(env->enc)) {
6728           r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
6729           goto err;
6730         }
6731 
6732         /* clear buf tail */
6733         for (j = i; j < ONIGENC_CODE_TO_MBC_MAXLEN; j++) buf[j] = '\0';
6734 
6735         len = enclen(env->enc, buf);
6736         if (i < len) {
6737           r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
6738           goto err;
6739         }
6740         else if (i > len) { /* fetch back */
6741           p = psave;
6742           for (i = 1; i < len; i++) {
6743             r = fetch_token_cc(tok, &p, end, env, CS_COMPLETE);
6744             if (r < 0) goto err;
6745           }
6746           fetched = 0;
6747         }
6748 
6749         if (i == 1) {
6750           in_code = (OnigCodePoint )buf[0];
6751           goto crude_single;
6752         }
6753         else {
6754           in_code = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
6755           in_type = CV_MB;
6756         }
6757       }
6758       else {
6759         in_code = (OnigCodePoint )tok->u.byte;
6760       crude_single:
6761         in_type = CV_SB;
6762       }
6763       in_raw = 1;
6764       goto val_entry2;
6765       break;
6766 
6767     case TK_CODE_POINT:
6768       in_code = tok->u.code;
6769       in_raw  = 1;
6770     val_entry:
6771       len = ONIGENC_CODE_TO_MBCLEN(env->enc, in_code);
6772       if (len < 0) {
6773         if (state != CS_RANGE ||
6774             ! IS_SYNTAX_BV(env->syntax,
6775                            ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) ||
6776             in_code < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) {
6777           r = len;
6778           goto err;
6779         }
6780       }
6781       in_type = (len == 1 ? CV_SB : CV_MB);
6782     val_entry2:
6783       r = cc_char_next(cc, &curr_code, in_code, &curr_raw, in_raw, in_type,
6784                        &curr_type, &state, env);
6785       if (r != 0) goto err;
6786       break;
6787 
6788     case TK_CC_POSIX_BRACKET_OPEN:
6789       r = prs_posix_bracket(cc, &p, end, env);
6790       if (r < 0) goto err;
6791       if (r == 1) {  /* is not POSIX bracket */
6792         CC_ESC_WARN(env, (UChar* )"[");
6793         p = tok->backp;
6794         in_code = tok->u.code;
6795         in_raw = 0;
6796         goto val_entry;
6797       }
6798       goto next_cprop;
6799       break;
6800 
6801     case TK_CHAR_TYPE:
6802       r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
6803       if (r != 0) goto err;
6804 
6805     next_cprop:
6806       r = cc_cprop_next(cc, &curr_code, &curr_type, &state, env);
6807       if (r != 0) goto err;
6808       break;
6809 
6810     case TK_CHAR_PROPERTY:
6811       {
6812         int ctype = fetch_char_property_to_ctype(&p, end, env);
6813         if (ctype < 0) {
6814           r = ctype;
6815           goto err;
6816         }
6817         r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
6818         if (r != 0) goto err;
6819         goto next_cprop;
6820       }
6821       break;
6822 
6823     case TK_CC_RANGE:
6824       if (state == CS_VALUE) {
6825         r = fetch_token_cc(tok, &p, end, env, CS_RANGE);
6826         if (r < 0) goto err;
6827 
6828         fetched = 1;
6829         if (r == TK_CC_CLOSE) { /* allow [x-] */
6830         range_end_val:
6831           in_code = (OnigCodePoint )'-';
6832           in_raw = 0;
6833           goto val_entry;
6834         }
6835         else if (r == TK_CC_AND) {
6836           CC_ESC_WARN(env, (UChar* )"-");
6837           goto range_end_val;
6838         }
6839 
6840         if (curr_type == CV_CPROP) {
6841           r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
6842           goto err;
6843         }
6844 
6845         state = CS_RANGE;
6846       }
6847       else if (state == CS_START) {
6848         /* [-xa] is allowed */
6849         in_code = tok->u.code;
6850         in_raw = 0;
6851 
6852         r = fetch_token_cc(tok, &p, end, env, CS_VALUE);
6853         if (r < 0) goto err;
6854 
6855         fetched = 1;
6856         /* [--x] or [a&&-x] is warned. */
6857         if (r == TK_CC_RANGE || and_start != 0)
6858           CC_ESC_WARN(env, (UChar* )"-");
6859 
6860         goto val_entry;
6861       }
6862       else if (state == CS_RANGE) {
6863         CC_ESC_WARN(env, (UChar* )"-");
6864         goto any_char_in;  /* [!--] is allowed */
6865       }
6866       else { /* CS_COMPLETE */
6867         r = fetch_token_cc(tok, &p, end, env, CS_VALUE);
6868         if (r < 0) goto err;
6869 
6870         fetched = 1;
6871         if (r == TK_CC_CLOSE)
6872           goto range_end_val; /* allow [a-b-] */
6873         else if (r == TK_CC_AND) {
6874           CC_ESC_WARN(env, (UChar* )"-");
6875           goto range_end_val;
6876         }
6877 
6878         if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
6879           CC_ESC_WARN(env, (UChar* )"-");
6880           goto range_end_val;   /* [0-9-a] is allowed as [0-9\-a] */
6881         }
6882         r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
6883         goto err;
6884       }
6885       break;
6886 
6887     case TK_CC_OPEN_CC: /* [ */
6888       {
6889         Node *anode;
6890         CClassNode* acc;
6891 
6892         if (state == CS_VALUE) {
6893           r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type,
6894                            &state, env);
6895           if (r != 0) goto err;
6896         }
6897         state = CS_COMPLETE;
6898 
6899         r = prs_cc(&anode, tok, &p, end, env);
6900         if (r != 0) {
6901           onig_node_free(anode);
6902           goto cc_open_err;
6903         }
6904         acc = CCLASS_(anode);
6905         r = or_cclass(cc, acc, env->enc);
6906         onig_node_free(anode);
6907 
6908       cc_open_err:
6909         if (r != 0) goto err;
6910       }
6911       break;
6912 
6913     case TK_CC_AND: /* && */
6914       {
6915         if (state == CS_VALUE) {
6916           r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type,
6917                            &state, env);
6918           if (r != 0) goto err;
6919         }
6920         /* initialize local variables */
6921         and_start = 1;
6922         state = CS_START;
6923 
6924         if (IS_NOT_NULL(prev_cc)) {
6925           r = and_cclass(prev_cc, cc, env->enc);
6926           if (r != 0) goto err;
6927           bbuf_free(cc->mbuf);
6928         }
6929         else {
6930           prev_cc = cc;
6931           cc = &work_cc;
6932         }
6933         initialize_cclass(cc);
6934       }
6935       break;
6936 
6937     case TK_EOT:
6938       r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
6939       goto err;
6940       break;
6941     default:
6942       r = ONIGERR_PARSER_BUG;
6943       goto err;
6944       break;
6945     }
6946 
6947     if (fetched)
6948       r = tok->type;
6949     else {
6950       r = fetch_token_cc(tok, &p, end, env, state);
6951       if (r < 0) goto err;
6952     }
6953   }
6954 
6955   if (state == CS_VALUE) {
6956     r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type,
6957                      &state, env);
6958     if (r != 0) goto err;
6959   }
6960 
6961   if (IS_NOT_NULL(prev_cc)) {
6962     r = and_cclass(prev_cc, cc, env->enc);
6963     if (r != 0) goto err;
6964     bbuf_free(cc->mbuf);
6965     cc = prev_cc;
6966   }
6967 
6968   if (neg != 0)
6969     NCCLASS_SET_NOT(cc);
6970   else
6971     NCCLASS_CLEAR_NOT(cc);
6972   if (IS_NCCLASS_NOT(cc) &&
6973       IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
6974     int is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
6975     if (is_empty != 0)
6976       BITSET_IS_EMPTY(cc->bs, is_empty);
6977 
6978     if (is_empty == 0) {
6979       if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
6980         if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
6981           BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
6982         else
6983           add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
6984       }
6985     }
6986   }
6987   *src = p;
6988   DEC_PARSE_DEPTH(env->parse_depth);
6989   return 0;
6990 
6991  err:
6992   if (cc != CCLASS_(*np))
6993     bbuf_free(cc->mbuf);
6994   return r;
6995 }
6996 
6997 static int prs_alts(Node** top, PToken* tok, int term,
6998                     UChar** src, UChar* end, ScanEnv* env, int group_head);
6999 
7000 #ifdef USE_CALLOUT
7001 
7002 /* (?{...}[tag][+-]) (?{{...}}[tag][+-]) */
7003 static int
prs_callout_of_contents(Node ** np,int cterm,UChar ** src,UChar * end,ScanEnv * env)7004 prs_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env)
7005 {
7006   int r;
7007   int i;
7008   int in;
7009   int num;
7010   OnigCodePoint c;
7011   UChar* code_start;
7012   UChar* code_end;
7013   UChar* contents;
7014   UChar* tag_start;
7015   UChar* tag_end;
7016   int brace_nest;
7017   CalloutListEntry* e;
7018   RegexExt* ext;
7019   OnigEncoding enc = env->enc;
7020   UChar* p = *src;
7021 
7022   if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7023 
7024   brace_nest = 0;
7025   while (PPEEK_IS('{')) {
7026     brace_nest++;
7027     PINC_S;
7028     if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7029   }
7030 
7031   in = ONIG_CALLOUT_IN_PROGRESS;
7032   code_start = p;
7033   while (1) {
7034     if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7035 
7036     code_end = p;
7037     PFETCH_S(c);
7038     if (c == '}') {
7039       i = brace_nest;
7040       while (i > 0) {
7041         if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7042         PFETCH_S(c);
7043         if (c == '}') i--;
7044         else break;
7045       }
7046       if (i == 0) break;
7047     }
7048   }
7049 
7050   if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7051 
7052   PFETCH_S(c);
7053   if (c == '[') {
7054     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7055     tag_end = tag_start = p;
7056     while (! PEND) {
7057       if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7058       tag_end = p;
7059       PFETCH_S(c);
7060       if (c == ']') break;
7061     }
7062     if (! is_allowed_callout_tag_name(enc, tag_start, tag_end))
7063       return ONIGERR_INVALID_CALLOUT_TAG_NAME;
7064 
7065     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7066     PFETCH_S(c);
7067   }
7068   else {
7069     tag_start = tag_end = 0;
7070   }
7071 
7072   if (c == 'X') {
7073     in |= ONIG_CALLOUT_IN_RETRACTION;
7074     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7075     PFETCH_S(c);
7076   }
7077   else if (c == '<') {
7078     in = ONIG_CALLOUT_IN_RETRACTION;
7079     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7080     PFETCH_S(c);
7081   }
7082   else if (c == '>') { /* no needs (default) */
7083     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7084     PFETCH_S(c);
7085   }
7086 
7087   if (c != cterm)
7088     return ONIGERR_INVALID_CALLOUT_PATTERN;
7089 
7090   r = reg_callout_list_entry(env, &num);
7091   if (r != 0) return r;
7092 
7093   ext = onig_get_regex_ext(env->reg);
7094   CHECK_NULL_RETURN_MEMERR(ext);
7095   if (IS_NULL(ext->pattern)) {
7096     r = onig_ext_set_pattern(env->reg, env->pattern, env->pattern_end);
7097     if (r != ONIG_NORMAL) return r;
7098   }
7099 
7100   if (tag_start != tag_end) {
7101     r = callout_tag_entry(env, env->reg, tag_start, tag_end, num);
7102     if (r != ONIG_NORMAL) return r;
7103   }
7104 
7105   contents = onigenc_strdup(enc, code_start, code_end);
7106   CHECK_NULL_RETURN_MEMERR(contents);
7107 
7108   e = onig_reg_callout_list_at(env->reg, num);
7109   if (IS_NULL(e)) {
7110     xfree(contents);
7111     return ONIGERR_MEMORY;
7112   }
7113 
7114   r = node_new_callout(np, ONIG_CALLOUT_OF_CONTENTS, num, ONIG_NON_NAME_ID, env);
7115   if (r != 0) {
7116     xfree(contents);
7117     return r;
7118   }
7119 
7120   e->of      = ONIG_CALLOUT_OF_CONTENTS;
7121   e->in      = in;
7122   e->name_id = ONIG_NON_NAME_ID;
7123   e->u.content.start = contents;
7124   e->u.content.end   = contents + (code_end - code_start);
7125 
7126   *src = p;
7127   return 0;
7128 }
7129 
7130 static long
prs_long(OnigEncoding enc,UChar * s,UChar * end,int sign_on,long max,long * rl)7131 prs_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* rl)
7132 {
7133   long v;
7134   long d;
7135   int flag;
7136   UChar* p;
7137   OnigCodePoint c;
7138 
7139   if (s >= end) return ONIGERR_INVALID_CALLOUT_ARG;
7140 
7141   flag = 1;
7142   v = 0;
7143   p = s;
7144   while (p < end) {
7145     c = ONIGENC_MBC_TO_CODE(enc, p, end);
7146     p += ONIGENC_MBC_ENC_LEN(enc, p);
7147     if (c >= '0' && c <= '9') {
7148       d = (long )(c - '0');
7149       if (v > (max - d) / 10)
7150         return ONIGERR_INVALID_CALLOUT_ARG;
7151 
7152       v = v * 10 + d;
7153     }
7154     else if (sign_on != 0 && (c == '-' || c == '+')) {
7155       if (c == '-') flag = -1;
7156     }
7157     else
7158       return ONIGERR_INVALID_CALLOUT_ARG;
7159 
7160     sign_on = 0;
7161   }
7162 
7163   *rl = flag * v;
7164   return ONIG_NORMAL;
7165 }
7166 
7167 static void
clear_callout_args(int n,unsigned int types[],OnigValue vals[])7168 clear_callout_args(int n, unsigned int types[], OnigValue vals[])
7169 {
7170   int i;
7171 
7172   for (i = 0; i < n; i++) {
7173     switch (types[i]) {
7174     case ONIG_TYPE_STRING:
7175       if (IS_NOT_NULL(vals[i].s.start))
7176         xfree(vals[i].s.start);
7177       break;
7178     default:
7179       break;
7180     }
7181   }
7182 }
7183 
7184 static int
prs_callout_args(int skip_mode,int cterm,UChar ** src,UChar * end,int max_arg_num,unsigned int types[],OnigValue vals[],ScanEnv * env)7185 prs_callout_args(int skip_mode, int cterm, UChar** src, UChar* end,
7186                  int max_arg_num, unsigned int types[], OnigValue vals[],
7187                  ScanEnv* env)
7188 {
7189 #define MAX_CALLOUT_ARG_BYTE_LENGTH   128
7190 
7191   int r;
7192   int n;
7193   int esc;
7194   int cn;
7195   UChar* s;
7196   UChar* e;
7197   UChar* eesc;
7198   OnigCodePoint c;
7199   UChar* bufend;
7200   UChar buf[MAX_CALLOUT_ARG_BYTE_LENGTH];
7201   OnigEncoding enc = env->enc;
7202   UChar* p = *src;
7203 
7204   if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7205 
7206   c = 0;
7207   n = 0;
7208   while (n < ONIG_CALLOUT_MAX_ARGS_NUM) {
7209     cn  = 0;
7210     esc = 0;
7211     eesc = 0;
7212     bufend = buf;
7213     s = e = p;
7214     while (1) {
7215       if (PEND) {
7216         r = ONIGERR_INVALID_CALLOUT_PATTERN;
7217         goto err_clear;
7218       }
7219 
7220       e = p;
7221       PFETCH_S(c);
7222       if (esc != 0) {
7223         esc = 0;
7224         if (c == '\\' || c == cterm || c == ',') {
7225           /* */
7226         }
7227         else {
7228           e = eesc;
7229           cn++;
7230         }
7231         goto add_char;
7232       }
7233       else {
7234         if (c == '\\') {
7235           esc = 1;
7236           eesc = e;
7237         }
7238         else if (c == cterm || c == ',')
7239           break;
7240         else {
7241           size_t clen;
7242 
7243         add_char:
7244           if (skip_mode == FALSE) {
7245             clen = p - e;
7246             if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH) {
7247               r = ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */
7248               goto err_clear;
7249             }
7250 
7251             xmemcpy(bufend, e, clen);
7252             bufend += clen;
7253           }
7254           cn++;
7255         }
7256       }
7257     }
7258 
7259     if (cn != 0) {
7260       if (max_arg_num >= 0 && n >= max_arg_num) {
7261         r = ONIGERR_INVALID_CALLOUT_ARG;
7262         goto err_clear;
7263       }
7264 
7265       if (skip_mode == FALSE) {
7266         if ((types[n] & ONIG_TYPE_LONG) != 0) {
7267           int fixed = 0;
7268           if (cn > 0) {
7269             long rl;
7270             r = prs_long(enc, buf, bufend, 1, LONG_MAX, &rl);
7271             if (r == ONIG_NORMAL) {
7272               vals[n].l = rl;
7273               fixed = 1;
7274               types[n] = ONIG_TYPE_LONG;
7275             }
7276           }
7277 
7278           if (fixed == 0) {
7279             types[n] = (types[n] & ~ONIG_TYPE_LONG);
7280             if (types[n] == ONIG_TYPE_VOID) {
7281               r = ONIGERR_INVALID_CALLOUT_ARG;
7282               goto err_clear;
7283             }
7284           }
7285         }
7286 
7287         switch (types[n]) {
7288         case ONIG_TYPE_LONG:
7289           break;
7290 
7291         case ONIG_TYPE_CHAR:
7292           if (cn != 1) {
7293             r = ONIGERR_INVALID_CALLOUT_ARG;
7294             goto err_clear;
7295           }
7296           vals[n].c = ONIGENC_MBC_TO_CODE(enc, buf, bufend);
7297           break;
7298 
7299         case ONIG_TYPE_STRING:
7300           {
7301             UChar* rs = onigenc_strdup(enc, buf, bufend);
7302             if (IS_NULL(rs)) {
7303               r = ONIGERR_MEMORY; goto err_clear;
7304             }
7305             vals[n].s.start = rs;
7306             vals[n].s.end   = rs + (e - s);
7307           }
7308           break;
7309 
7310         case ONIG_TYPE_TAG:
7311           if (eesc != 0 || ! is_allowed_callout_tag_name(enc, s, e)) {
7312             r = ONIGERR_INVALID_CALLOUT_TAG_NAME;
7313             goto err_clear;
7314           }
7315 
7316           vals[n].s.start = s;
7317           vals[n].s.end   = e;
7318           break;
7319 
7320         case ONIG_TYPE_VOID:
7321         case ONIG_TYPE_POINTER:
7322           r = ONIGERR_PARSER_BUG;
7323           goto err_clear;
7324           break;
7325         }
7326       }
7327 
7328       n++;
7329     }
7330 
7331     if (c == cterm) break;
7332   }
7333 
7334   if (c != cterm) {
7335     r = ONIGERR_INVALID_CALLOUT_PATTERN;
7336     goto err_clear;
7337   }
7338 
7339   *src = p;
7340   return n;
7341 
7342  err_clear:
7343   if (skip_mode == FALSE)
7344     clear_callout_args(n, types, vals);
7345   return r;
7346 }
7347 
7348 /* (*name[TAG]) (*name[TAG]{a,b,..}) */
7349 static int
prs_callout_of_name(Node ** np,int cterm,UChar ** src,UChar * end,ScanEnv * env)7350 prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env)
7351 {
7352   int r;
7353   int i;
7354   int in;
7355   int num;
7356   int name_id;
7357   int arg_num;
7358   int max_arg_num;
7359   int opt_arg_num;
7360   int is_not_single;
7361   OnigCodePoint c;
7362   UChar* name_start;
7363   UChar* name_end;
7364   UChar* tag_start;
7365   UChar* tag_end;
7366   Node*  node;
7367   CalloutListEntry* e;
7368   RegexExt* ext;
7369   unsigned int types[ONIG_CALLOUT_MAX_ARGS_NUM];
7370   OnigValue    vals[ONIG_CALLOUT_MAX_ARGS_NUM];
7371   OnigEncoding enc = env->enc;
7372   UChar* p = *src;
7373 
7374   /* PFETCH_READY; */
7375   if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7376 
7377   node = 0;
7378   name_start = p;
7379   while (1) {
7380     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7381     name_end = p;
7382     PFETCH_S(c);
7383     if (c == cterm || c == '[' || c == '{') break;
7384   }
7385 
7386   if (! is_allowed_callout_name(enc, name_start, name_end))
7387     return ONIGERR_INVALID_CALLOUT_NAME;
7388 
7389   if (c == '[') {
7390     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7391     tag_end = tag_start = p;
7392     while (! PEND) {
7393       if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7394       tag_end = p;
7395       PFETCH_S(c);
7396       if (c == ']') break;
7397     }
7398     if (! is_allowed_callout_tag_name(enc, tag_start, tag_end))
7399       return ONIGERR_INVALID_CALLOUT_TAG_NAME;
7400 
7401     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7402     PFETCH_S(c);
7403   }
7404   else {
7405     tag_start = tag_end = 0;
7406   }
7407 
7408   if (c == '{') {
7409     UChar* save;
7410 
7411     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7412 
7413     /* read for single check only */
7414     save = p;
7415     arg_num = prs_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env);
7416     if (arg_num < 0) return arg_num;
7417 
7418     is_not_single = PPEEK_IS(cterm) ?  0 : 1;
7419     p = save;
7420     r = get_callout_name_id_by_name(enc, is_not_single, name_start, name_end,
7421                                     &name_id);
7422     if (r != ONIG_NORMAL) return r;
7423 
7424     max_arg_num = get_callout_arg_num_by_name_id(name_id);
7425     for (i = 0; i < max_arg_num; i++) {
7426       types[i] = get_callout_arg_type_by_name_id(name_id, i);
7427     }
7428 
7429     arg_num = prs_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env);
7430     if (arg_num < 0) return arg_num;
7431 
7432     if (PEND) {
7433       r = ONIGERR_END_PATTERN_IN_GROUP;
7434       goto err_clear;
7435     }
7436     PFETCH_S(c);
7437   }
7438   else {
7439     arg_num = 0;
7440 
7441     is_not_single = 0;
7442     r = get_callout_name_id_by_name(enc, is_not_single, name_start, name_end,
7443                                       &name_id);
7444     if (r != ONIG_NORMAL) return r;
7445 
7446     max_arg_num = get_callout_arg_num_by_name_id(name_id);
7447     for (i = 0; i < max_arg_num; i++) {
7448       types[i] = get_callout_arg_type_by_name_id(name_id, i);
7449     }
7450   }
7451 
7452   in = onig_get_callout_in_by_name_id(name_id);
7453   opt_arg_num = get_callout_opt_arg_num_by_name_id(name_id);
7454   if (arg_num > max_arg_num || arg_num < (max_arg_num - opt_arg_num)) {
7455     r = ONIGERR_INVALID_CALLOUT_ARG;
7456     goto err_clear;
7457   }
7458 
7459   if (c != cterm) {
7460     r = ONIGERR_INVALID_CALLOUT_PATTERN;
7461     goto err_clear;
7462   }
7463 
7464   r = reg_callout_list_entry(env, &num);
7465   if (r != 0) goto err_clear;
7466 
7467   ext = onig_get_regex_ext(env->reg);
7468   if (IS_NULL(ext)) {
7469     r = ONIGERR_MEMORY; goto err_clear;
7470   }
7471   if (IS_NULL(ext->pattern)) {
7472     r = onig_ext_set_pattern(env->reg, env->pattern, env->pattern_end);
7473     if (r != ONIG_NORMAL) goto err_clear;
7474   }
7475 
7476   if (tag_start != tag_end) {
7477     r = callout_tag_entry(env, env->reg, tag_start, tag_end, num);
7478     if (r != ONIG_NORMAL) goto err_clear;
7479   }
7480 
7481   e = onig_reg_callout_list_at(env->reg, num);
7482   if (IS_NULL(e)) {
7483     r = ONIGERR_MEMORY; goto err_clear;
7484   }
7485 
7486   r = node_new_callout(&node, ONIG_CALLOUT_OF_NAME, num, name_id, env);
7487   if (r != ONIG_NORMAL) goto err_clear;
7488 
7489   e->of         = ONIG_CALLOUT_OF_NAME;
7490   e->in         = in;
7491   e->name_id    = name_id;
7492   e->type       = onig_get_callout_type_by_name_id(name_id);
7493   e->start_func = onig_get_callout_start_func_by_name_id(name_id);
7494   e->end_func   = onig_get_callout_end_func_by_name_id(name_id);
7495   e->u.arg.num        = max_arg_num;
7496   e->u.arg.passed_num = arg_num;
7497   for (i = 0; i < max_arg_num; i++) {
7498     e->u.arg.types[i] = types[i];
7499     if (i < arg_num)
7500       e->u.arg.vals[i] = vals[i];
7501     else
7502       e->u.arg.vals[i] = get_callout_opt_default_by_name_id(name_id, i);
7503   }
7504 
7505   *np = node;
7506   *src = p;
7507   return 0;
7508 
7509  err_clear:
7510   clear_callout_args(arg_num, types, vals);
7511   return r;
7512 }
7513 #endif
7514 
7515 static int
prs_bag(Node ** np,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)7516 prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
7517         ScanEnv* env)
7518 {
7519   int r, num;
7520   Node *target;
7521   OnigOptionType option;
7522   OnigCodePoint c;
7523   int list_capture;
7524   OnigEncoding enc = env->enc;
7525 
7526   UChar* p = *src;
7527   PFETCH_READY;
7528 
7529   *np = NULL;
7530   if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
7531 
7532   option = env->options;
7533   c = PPEEK;
7534   if (c == '?' && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
7535     PINC;
7536     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7537 
7538     PFETCH(c);
7539     switch (c) {
7540     case ':':   /* (?:...) grouping only */
7541     group:
7542       r = fetch_token(tok, &p, end, env);
7543       if (r < 0) return r;
7544       r = prs_alts(np, tok, term, &p, end, env, FALSE);
7545       if (r < 0) return r;
7546       *src = p;
7547       return 1; /* group */
7548       break;
7549 
7550     case '=':
7551       *np = node_new_anchor(ANCR_PREC_READ);
7552       break;
7553     case '!':  /*         preceding read */
7554       *np = node_new_anchor(ANCR_PREC_READ_NOT);
7555       break;
7556     case '>':            /* (?>...) stop backtrack */
7557       *np = node_new_bag(BAG_STOP_BACKTRACK);
7558       break;
7559 
7560     case '\'':
7561       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
7562         goto named_group1;
7563       }
7564       else
7565         return ONIGERR_UNDEFINED_GROUP_OPTION;
7566       break;
7567 
7568     case '<':   /* look behind (?<=...), (?<!...) */
7569       if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
7570       PFETCH(c);
7571       if (c == '=')
7572         *np = node_new_anchor(ANCR_LOOK_BEHIND);
7573       else if (c == '!')
7574         *np = node_new_anchor(ANCR_LOOK_BEHIND_NOT);
7575       else {
7576         if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
7577           UChar *name;
7578           UChar *name_end;
7579           enum REF_NUM num_type;
7580 
7581           PUNFETCH;
7582           c = '<';
7583 
7584         named_group1:
7585           list_capture = 0;
7586 
7587 #ifdef USE_CAPTURE_HISTORY
7588         named_group2:
7589 #endif
7590           name = p;
7591           r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num,
7592                          &num_type, FALSE);
7593           if (r < 0) return r;
7594 
7595           num = scan_env_add_mem_entry(env);
7596           if (num < 0) return num;
7597           if (list_capture != 0 && num >= (int )MEM_STATUS_BITS_NUM)
7598             return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
7599 
7600           r = name_add(env->reg, name, name_end, num, env);
7601           if (r != 0) return r;
7602           *np = node_new_memory(1);
7603           CHECK_NULL_RETURN_MEMERR(*np);
7604           BAG_(*np)->m.regnum = num;
7605           if (list_capture != 0)
7606             MEM_STATUS_ON_SIMPLE(env->cap_history, num);
7607           env->num_named++;
7608         }
7609         else {
7610           return ONIGERR_UNDEFINED_GROUP_OPTION;
7611         }
7612       }
7613       break;
7614 
7615     case '~':
7616       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP)) {
7617         Node* absent;
7618         Node* expr;
7619         int head_bar;
7620         int is_range_cutter;
7621 
7622         if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7623 
7624         if (PPEEK_IS('|')) { /* (?~|generator|absent) */
7625           PINC;
7626           if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7627 
7628           head_bar = 1;
7629           if (PPEEK_IS(')')) { /* (?~|)  : range clear */
7630             PINC;
7631             r = make_range_clear(np, env);
7632             if (r != 0) return r;
7633             goto end;
7634           }
7635         }
7636         else
7637           head_bar = 0;
7638 
7639         r = fetch_token(tok, &p, end, env);
7640         if (r < 0) return r;
7641         r = prs_alts(&absent, tok, term, &p, end, env, TRUE);
7642         if (r < 0) {
7643           onig_node_free(absent);
7644           return r;
7645         }
7646 
7647         expr = NULL_NODE;
7648         is_range_cutter = 0;
7649         if (head_bar != 0) {
7650           Node* top = absent;
7651           if (NODE_TYPE(top) != NODE_ALT || IS_NULL(NODE_CDR(top))) {
7652             expr = NULL_NODE;
7653             is_range_cutter = 1;
7654             /* return ONIGERR_INVALID_ABSENT_GROUP_GENERATOR_PATTERN; */
7655           }
7656           else {
7657             absent = NODE_CAR(top);
7658             expr   = NODE_CDR(top);
7659             NODE_CAR(top) = NULL_NODE;
7660             NODE_CDR(top) = NULL_NODE;
7661             onig_node_free(top);
7662             if (IS_NULL(NODE_CDR(expr))) {
7663               top = expr;
7664               expr = NODE_CAR(top);
7665               NODE_CAR(top) = NULL_NODE;
7666               onig_node_free(top);
7667             }
7668           }
7669         }
7670 
7671         r = make_absent_tree(np, absent, expr, is_range_cutter, env);
7672         if (r != 0) {
7673           return r;
7674         }
7675         goto end;
7676       }
7677       else {
7678         return ONIGERR_UNDEFINED_GROUP_OPTION;
7679       }
7680       break;
7681 
7682 #ifdef USE_CALLOUT
7683     case '{':
7684       if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS))
7685         return ONIGERR_UNDEFINED_GROUP_OPTION;
7686 
7687       r = prs_callout_of_contents(np, ')', &p, end, env);
7688       if (r != 0) return r;
7689 
7690       goto end;
7691       break;
7692 #endif
7693 
7694     case '(':
7695       /* (?()...) */
7696       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE)) {
7697         UChar *prev;
7698         Node* condition;
7699         int condition_is_checker;
7700 
7701         if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7702         PFETCH(c);
7703         if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7704 
7705         if (IS_CODE_DIGIT_ASCII(enc, c)
7706             || c == '-' || c == '+' || c == '<' || c == '\'') {
7707 #ifdef USE_BACKREF_WITH_LEVEL
7708           int exist_level;
7709           int level;
7710 #endif
7711           UChar* name_end;
7712           int back_num;
7713           enum REF_NUM num_type;
7714           int is_enclosed;
7715 
7716           is_enclosed = (c == '<' || c == '\'') ? 1 : 0;
7717           if (! is_enclosed)
7718             PUNFETCH;
7719           prev = p;
7720 #ifdef USE_BACKREF_WITH_LEVEL
7721           exist_level = 0;
7722           name_end = NULL_UCHARP; /* no need. escape gcc warning. */
7723           r = fetch_name_with_level(
7724                     (OnigCodePoint )(is_enclosed != 0 ? c : '('),
7725                     &p, end, &name_end,
7726                     env, &back_num, &level, &num_type);
7727           if (r == 1) exist_level = 1;
7728 #else
7729           r = fetch_name((OnigCodePoint )(is_enclosed != 0 ? c : '('),
7730                          &p, end, &name_end, env, &back_num, &num_type, TRUE);
7731 #endif
7732           if (r < 0) {
7733             if (is_enclosed == 0) {
7734               goto any_condition;
7735             }
7736             else
7737               return r;
7738           }
7739 
7740           condition_is_checker = 1;
7741           if (num_type != IS_NOT_NUM) {
7742             if (num_type == IS_REL_NUM) {
7743               back_num = backref_rel_to_abs(back_num, env);
7744             }
7745             if (back_num <= 0)
7746               return ONIGERR_INVALID_BACKREF;
7747 
7748             if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
7749               if (back_num > env->num_mem ||
7750                   IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node))
7751                 return ONIGERR_INVALID_BACKREF;
7752             }
7753 
7754             condition = node_new_backref_checker(1, &back_num, FALSE,
7755 #ifdef USE_BACKREF_WITH_LEVEL
7756                                                  exist_level, level,
7757 #endif
7758                                                  env);
7759           }
7760           else {
7761             int num;
7762             int* backs;
7763 
7764             num = name_to_group_numbers(env, prev, name_end, &backs);
7765             if (num <= 0) {
7766               return ONIGERR_UNDEFINED_NAME_REFERENCE;
7767             }
7768             if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
7769               int i;
7770               for (i = 0; i < num; i++) {
7771                 if (backs[i] > env->num_mem ||
7772                     IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node))
7773                   return ONIGERR_INVALID_BACKREF;
7774               }
7775             }
7776 
7777             condition = node_new_backref_checker(num, backs, TRUE,
7778 #ifdef USE_BACKREF_WITH_LEVEL
7779                                                  exist_level, level,
7780 #endif
7781                                                  env);
7782           }
7783 
7784           if (is_enclosed != 0) {
7785             if (PEND) goto err_if_else;
7786             PFETCH(c);
7787             if (c != ')') goto err_if_else;
7788           }
7789         }
7790 #ifdef USE_CALLOUT
7791         else if (c == '?') {
7792           if (IS_SYNTAX_OP2(env->syntax,
7793                             ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS)) {
7794             if (! PEND && PPEEK_IS('{')) {
7795               /* condition part is callouts of contents: (?(?{...})THEN|ELSE) */
7796               condition_is_checker = 0;
7797               PFETCH(c);
7798               r = prs_callout_of_contents(&condition, ')', &p, end, env);
7799               if (r != 0) return r;
7800               goto end_condition;
7801             }
7802           }
7803           goto any_condition;
7804         }
7805         else if (c == '*' &&
7806                  IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) {
7807           condition_is_checker = 0;
7808           r = prs_callout_of_name(&condition, ')', &p, end, env);
7809           if (r != 0) return r;
7810           goto end_condition;
7811         }
7812 #endif
7813         else {
7814         any_condition:
7815           PUNFETCH;
7816           condition_is_checker = 0;
7817           r = fetch_token(tok, &p, end, env);
7818           if (r < 0) return r;
7819           r = prs_alts(&condition, tok, term, &p, end, env, FALSE);
7820           if (r < 0) {
7821             onig_node_free(condition);
7822             return r;
7823           }
7824         }
7825 
7826 #ifdef USE_CALLOUT
7827       end_condition:
7828 #endif
7829         CHECK_NULL_RETURN_MEMERR(condition);
7830 
7831         if (PEND) {
7832         err_if_else:
7833           onig_node_free(condition);
7834           return ONIGERR_END_PATTERN_IN_GROUP;
7835         }
7836 
7837         if (PPEEK_IS(')')) { /* case: empty body: make backref checker */
7838           if (condition_is_checker == 0) {
7839             onig_node_free(condition);
7840             return ONIGERR_INVALID_IF_ELSE_SYNTAX;
7841           }
7842           PFETCH(c);
7843           *np = condition;
7844         }
7845         else { /* if-else */
7846           int then_is_empty;
7847           Node *Then, *Else;
7848 
7849           Then = 0;
7850           if (PPEEK_IS('|')) {
7851             PFETCH(c);
7852             then_is_empty = 1;
7853           }
7854           else
7855             then_is_empty = 0;
7856 
7857           r = fetch_token(tok, &p, end, env);
7858           if (r < 0) {
7859             onig_node_free(condition);
7860             return r;
7861           }
7862           r = prs_alts(&target, tok, term, &p, end, env, TRUE);
7863           if (r < 0) {
7864             onig_node_free(condition);
7865             onig_node_free(target);
7866             return r;
7867           }
7868 
7869           if (then_is_empty != 0) {
7870             Else = target;
7871           }
7872           else {
7873             if (NODE_TYPE(target) == NODE_ALT) {
7874               Then = NODE_CAR(target);
7875               if (NODE_CDR(NODE_CDR(target)) == NULL_NODE) {
7876                 Else = NODE_CAR(NODE_CDR(target));
7877                 cons_node_free_alone(NODE_CDR(target));
7878               }
7879               else {
7880                 Else = NODE_CDR(target);
7881               }
7882               cons_node_free_alone(target);
7883             }
7884             else {
7885               Then = target;
7886               Else = 0;
7887             }
7888           }
7889 
7890           *np = node_new_bag_if_else(condition, Then, Else);
7891           if (IS_NULL(*np)) {
7892             onig_node_free(condition);
7893             onig_node_free(Then);
7894             onig_node_free(Else);
7895             return ONIGERR_MEMORY;
7896           }
7897         }
7898         goto end;
7899       }
7900       else {
7901         return ONIGERR_UNDEFINED_GROUP_OPTION;
7902       }
7903       break;
7904 
7905 #ifdef USE_CAPTURE_HISTORY
7906     case '@':
7907       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
7908         if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
7909           PFETCH(c);
7910           if (c == '<' || c == '\'') {
7911             list_capture = 1;
7912             goto named_group2; /* (?@<name>...) */
7913           }
7914           PUNFETCH;
7915         }
7916 
7917         *np = node_new_memory(0);
7918         CHECK_NULL_RETURN_MEMERR(*np);
7919         num = scan_env_add_mem_entry(env);
7920         if (num < 0) {
7921           return num;
7922         }
7923         else if (num >= (int )MEM_STATUS_BITS_NUM) {
7924           return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
7925         }
7926         BAG_(*np)->m.regnum = num;
7927         MEM_STATUS_ON_SIMPLE(env->cap_history, num);
7928       }
7929       else {
7930         return ONIGERR_UNDEFINED_GROUP_OPTION;
7931       }
7932       break;
7933 #endif
7934 
7935 #ifdef USE_POSIXLINE_OPTION
7936     case 'p':
7937 #endif
7938     case '-': case 'i': case 'm': case 's': case 'x':
7939     case 'W': case 'D': case 'S': case 'P':
7940     case 'y':
7941       {
7942         int neg = 0;
7943 
7944         while (1) {
7945           switch (c) {
7946           case ':':
7947           case ')':
7948             break;
7949 
7950           case '-':  neg = 1; break;
7951           case 'x':  OPTION_NEGATE(option, ONIG_OPTION_EXTEND,     neg); break;
7952           case 'i':  OPTION_NEGATE(option, ONIG_OPTION_IGNORECASE, neg); break;
7953           case 's':
7954             if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
7955               OPTION_NEGATE(option, ONIG_OPTION_MULTILINE,  neg);
7956             }
7957             else
7958               return ONIGERR_UNDEFINED_GROUP_OPTION;
7959             break;
7960 
7961           case 'm':
7962             if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
7963               OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? TRUE : FALSE));
7964             }
7965             else if (IS_SYNTAX_OP2(env->syntax,
7966                         ONIG_SYN_OP2_OPTION_ONIGURUMA|ONIG_SYN_OP2_OPTION_RUBY)) {
7967               OPTION_NEGATE(option, ONIG_OPTION_MULTILINE,  neg);
7968             }
7969             else
7970               return ONIGERR_UNDEFINED_GROUP_OPTION;
7971             break;
7972 #ifdef USE_POSIXLINE_OPTION
7973           case 'p':
7974             OPTION_NEGATE(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
7975             break;
7976 #endif
7977           case 'W': OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); break;
7978           case 'D': OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); break;
7979           case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break;
7980           case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break;
7981 
7982           case 'y': /* y{g}, y{w} */
7983             {
7984               if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
7985                 return ONIGERR_UNDEFINED_GROUP_OPTION;
7986 
7987               if (neg != 0) return ONIGERR_UNDEFINED_GROUP_OPTION;
7988 
7989               if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7990               if (! PPEEK_IS('{')) return ONIGERR_UNDEFINED_GROUP_OPTION;
7991               PFETCH(c);
7992               if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7993               PFETCH(c);
7994               switch (c) {
7995               case 'g':
7996                 if (! ONIGENC_IS_UNICODE_ENCODING(enc))
7997                   return ONIGERR_UNDEFINED_GROUP_OPTION;
7998 
7999                 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, FALSE);
8000                 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, TRUE);
8001                 break;
8002 #ifdef USE_UNICODE_WORD_BREAK
8003               case 'w':
8004                 if (! ONIGENC_IS_UNICODE_ENCODING(enc))
8005                   return ONIGERR_UNDEFINED_GROUP_OPTION;
8006 
8007                 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, FALSE);
8008                 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, TRUE);
8009                 break;
8010 #endif
8011               default:
8012                 return ONIGERR_UNDEFINED_GROUP_OPTION;
8013                 break;
8014               }
8015               if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
8016               PFETCH(c);
8017               if (c != '}')
8018                 return ONIGERR_UNDEFINED_GROUP_OPTION;
8019               break;
8020             } /* case 'y' */
8021 
8022           default:
8023             return ONIGERR_UNDEFINED_GROUP_OPTION;
8024           }
8025 
8026           if (c == ')') {
8027             *np = node_new_option(option);
8028             CHECK_NULL_RETURN_MEMERR(*np);
8029             *src = p;
8030             return 2; /* option only */
8031           }
8032           else if (c == ':') {
8033             OnigOptionType prev = env->options;
8034 
8035             env->options = option;
8036             r = fetch_token(tok, &p, end, env);
8037             if (r < 0) return r;
8038             r = prs_alts(&target, tok, term, &p, end, env, FALSE);
8039             env->options = prev;
8040             if (r < 0) {
8041               onig_node_free(target);
8042               return r;
8043             }
8044             *np = node_new_option(option);
8045             CHECK_NULL_RETURN_MEMERR(*np);
8046             NODE_BODY(*np) = target;
8047             *src = p;
8048             return 0;
8049           }
8050 
8051           if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
8052           PFETCH(c);
8053         } /* while (1) */
8054       }
8055       break;
8056 
8057     default:
8058       return ONIGERR_UNDEFINED_GROUP_OPTION;
8059     }
8060   }
8061 #ifdef USE_CALLOUT
8062   else if (c == '*' &&
8063            IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) {
8064     PINC;
8065     r = prs_callout_of_name(np, ')', &p, end, env);
8066     if (r != 0) return r;
8067 
8068     goto end;
8069   }
8070 #endif
8071   else {
8072     if (OPTON_DONT_CAPTURE_GROUP(env->options))
8073       goto group;
8074 
8075     *np = node_new_memory(0);
8076     CHECK_NULL_RETURN_MEMERR(*np);
8077     num = scan_env_add_mem_entry(env);
8078     if (num < 0) return num;
8079     BAG_(*np)->m.regnum = num;
8080   }
8081 
8082   CHECK_NULL_RETURN_MEMERR(*np);
8083   r = fetch_token(tok, &p, end, env);
8084   if (r < 0) return r;
8085   r = prs_alts(&target, tok, term, &p, end, env, FALSE);
8086   if (r < 0) {
8087     onig_node_free(target);
8088     return r;
8089   }
8090 
8091   NODE_BODY(*np) = target;
8092 
8093   if (NODE_TYPE(*np) == NODE_BAG) {
8094     if (BAG_(*np)->type == BAG_MEMORY) {
8095       /* Don't move this to previous of prs_alts() */
8096       r = scan_env_set_mem_node(env, BAG_(*np)->m.regnum, *np);
8097       if (r != 0) return r;
8098     }
8099   }
8100 
8101  end:
8102   *src = p;
8103   return 0;
8104 }
8105 
/* Display strings for quantifier operators, used in the nested-repeat
   warning message of assign_quantifier_body(); indexed by the value
   returned from quantifier_type_num(). */
static const char* PopularQStr[] = {
  "?",    /* 0 */
  "*",    /* 1 */
  "+",    /* 2 */
  "??",   /* 3 */
  "*?",   /* 4 */
  "+?"    /* 5 */
};

/* Display strings for the replacement reported in the nested-repeat
   warning; indexed by a ReduceTypeTable[][] entry. */
static const char* ReduceQStr[] = {
  "",          /* 0 */
  "",          /* 1 */
  "*",         /* 2 */
  "*?",        /* 3 */
  "??",        /* 4 */
  "+ and ??",  /* 5 */
  "+? and ?"   /* 6 */
};
8113 
8114 static int
assign_quantifier_body(Node * qnode,Node * target,int group,ScanEnv * env)8115 assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env)
8116 {
8117   QuantNode* qn;
8118 
8119   qn = QUANT_(qnode);
8120   if (qn->lower == 1 && qn->upper == 1)
8121     return 1;
8122 
8123   switch (NODE_TYPE(target)) {
8124   case NODE_STRING:
8125     if (group == 0) {
8126       if (str_node_can_be_split(target, env->enc)) {
8127         Node* n = str_node_split_last_char(target, env->enc);
8128         if (IS_NOT_NULL(n)) {
8129           NODE_BODY(qnode) = n;
8130           return 2;
8131         }
8132       }
8133     }
8134     break;
8135 
8136   case NODE_QUANT:
8137     { /* check redundant double repeat. */
8138       /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
8139       QuantNode* qnt   = QUANT_(target);
8140       int nestq_num   = quantifier_type_num(qn);
8141       int targetq_num = quantifier_type_num(qnt);
8142 
8143 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
8144       if (targetq_num >= 0 && nestq_num >= 0 &&
8145           IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
8146         UChar buf[WARN_BUFSIZE];
8147 
8148         switch(ReduceTypeTable[targetq_num][nestq_num]) {
8149         case RQ_ASIS:
8150           break;
8151 
8152         case RQ_DEL:
8153           if (onig_verb_warn != onig_null_warn) {
8154             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
8155                                   env->pattern, env->pattern_end,
8156                                   (UChar* )"redundant nested repeat operator");
8157             (*onig_verb_warn)((char* )buf);
8158           }
8159           goto warn_exit;
8160           break;
8161 
8162         default:
8163           if (onig_verb_warn != onig_null_warn) {
8164             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
8165                                        env->pattern, env->pattern_end,
8166             (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
8167             PopularQStr[targetq_num], PopularQStr[nestq_num],
8168             ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
8169             (*onig_verb_warn)((char* )buf);
8170           }
8171           goto warn_exit;
8172           break;
8173         }
8174       }
8175 
8176     warn_exit:
8177 #endif
8178       if (targetq_num >= 0 && nestq_num < 0) {
8179         if (targetq_num == 1 || targetq_num == 2) { /* * or + */
8180           /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
8181           if (! IS_INFINITE_REPEAT(qn->upper) && qn->upper > 1 && qn->greedy) {
8182             qn->upper = (qn->lower == 0 ? 1 : qn->lower);
8183           }
8184         }
8185       }
8186       else {
8187         int r;
8188 
8189         NODE_BODY(qnode) = target;
8190         r = onig_reduce_nested_quantifier(qnode);
8191         return r;
8192       }
8193     }
8194     break;
8195 
8196   default:
8197     break;
8198   }
8199 
8200   NODE_BODY(qnode) = target;
8201   return 0;
8202 }
8203 
8204 
8205 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
8206 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)8207 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
8208 {
8209   BBuf *tbuf;
8210   int r;
8211 
8212   if (IS_NCCLASS_NOT(cc)) {
8213     bitset_invert(cc->bs);
8214 
8215     if (! ONIGENC_IS_SINGLEBYTE(enc)) {
8216       r = not_code_range_buf(enc, cc->mbuf, &tbuf);
8217       if (r != 0) return r;
8218 
8219       bbuf_free(cc->mbuf);
8220       cc->mbuf = tbuf;
8221     }
8222 
8223     NCCLASS_CLEAR_NOT(cc);
8224   }
8225 
8226   return 0;
8227 }
8228 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
8229 
/* Add a single code point to the character class cc: a code that encodes
   to exactly one byte in a min-length-1 encoding goes into the bitset,
   anything else into the code-range buffer.  The result of
   add_code_range_to_buf() is not examined. */
#define ADD_CODE_INTO_CC(cc, code, enc) do {\
  if (ONIGENC_MBC_MINLEN(enc) <= 1 && ONIGENC_CODE_TO_MBCLEN(enc, code) == 1) {\
    BITSET_SET_BIT((cc)->bs, code);\
  }\
  else {\
    add_code_range_to_buf(&((cc)->mbuf), code, code);\
  }\
} while (0)
8238 
8239 extern int
onig_new_cclass_with_code_list(Node ** rnode,OnigEncoding enc,int n,OnigCodePoint codes[])8240 onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc,
8241                                int n, OnigCodePoint codes[])
8242 {
8243   int i;
8244   Node* node;
8245   CClassNode* cc;
8246 
8247   *rnode = NULL_NODE;
8248 
8249   node = node_new_cclass();
8250   CHECK_NULL_RETURN_MEMERR(node);
8251 
8252   cc = CCLASS_(node);
8253 
8254   for (i = 0; i < n; i++) {
8255     ADD_CODE_INTO_CC(cc, codes[i], enc);
8256   }
8257 
8258   *rnode = node;
8259   return 0;
8260 }
8261 
/* Context passed to i_apply_case_fold() through its void* argument. */
typedef struct {
  /* scan environment; i_apply_case_fold() reads its enc member */
  ScanEnv*    env;
  /* character class that is tested for membership and extended */
  CClassNode* cc;
  /* NOTE(review): presumably the head of the alternative chain that ptail
     appends to; not touched by i_apply_case_fold() -- confirm at the caller */
  Node*       alt_root;
  /* slot that receives the next alternative node; after each append it is
     advanced to the CDR of the node just stored */
  Node**      ptail;
} IApplyCaseFoldArg;
8268 
8269 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)8270 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)
8271 {
8272   IApplyCaseFoldArg* iarg;
8273   ScanEnv* env;
8274   CClassNode* cc;
8275 
8276   iarg = (IApplyCaseFoldArg* )arg;
8277   env = iarg->env;
8278   cc  = iarg->cc;
8279 
8280   if (to_len == 1) {
8281     int is_in = onig_is_code_in_cc(env->enc, from, cc);
8282 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
8283     if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
8284         (is_in == 0 &&  IS_NCCLASS_NOT(cc))) {
8285       ADD_CODE_INTO_CC(cc, *to, env->enc);
8286     }
8287 #else
8288     if (is_in != 0) {
8289       if (ONIGENC_MBC_MINLEN(env->enc) > 1 ||
8290           ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) {
8291         if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
8292         add_code_range(&(cc->mbuf), env, *to, *to);
8293       }
8294       else {
8295         if (IS_NCCLASS_NOT(cc)) {
8296           BITSET_CLEAR_BIT(cc->bs, *to);
8297         }
8298         else
8299           BITSET_SET_BIT(cc->bs, *to);
8300       }
8301     }
8302 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
8303   }
8304   else {
8305     int r, i, len;
8306     UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
8307 
8308     if (onig_is_code_in_cc(env->enc, from, cc)
8309 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
8310         && !IS_NCCLASS_NOT(cc)
8311 #endif
8312         ) {
8313       int n, j, m, index;
8314       Node* list_node;
8315       Node* ns[3];
8316 
8317       n = 0;
8318       for (i = 0; i < to_len; i++) {
8319         OnigCodePoint code;
8320         Node* csnode;
8321         CClassNode* cs_cc;
8322 
8323         index = onigenc_unicode_fold1_key(&to[i]);
8324         if (index >= 0) {
8325           csnode = node_new_cclass();
8326           cs_cc = CCLASS_(csnode);
8327           if (IS_NULL(csnode)) {
8328           err_free_ns:
8329             for (j = 0; j < n; j++) onig_node_free(ns[j]);
8330             return ONIGERR_MEMORY;
8331           }
8332           m = FOLDS1_UNFOLDS_NUM(index);
8333           for (j = 0; j < m; j++) {
8334             code = FOLDS1_UNFOLDS(index)[j];
8335             ADD_CODE_INTO_CC(cs_cc, code, env->enc);
8336           }
8337           ADD_CODE_INTO_CC(cs_cc, to[i], env->enc);
8338           ns[n++] = csnode;
8339         }
8340         else {
8341           len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
8342           if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) {
8343             csnode = node_new_str(buf, buf + len);
8344             if (IS_NULL(csnode)) goto err_free_ns;
8345 
8346             NODE_STRING_SET_CASE_EXPANDED(csnode);
8347             ns[n++] = csnode;
8348           }
8349           else {
8350             r = onig_node_str_cat(ns[n-1], buf, buf + len);
8351             if (r < 0) goto err_free_ns;
8352           }
8353         }
8354       }
8355 
8356       if (n == 1)
8357         list_node = ns[0];
8358       else
8359         list_node = make_list(n, ns);
8360 
8361       *(iarg->ptail) = onig_node_new_alt(list_node, NULL_NODE);
8362       if (IS_NULL(*(iarg->ptail))) {
8363         onig_node_free(list_node);
8364         return ONIGERR_MEMORY;
8365       }
8366       iarg->ptail = &(NODE_CDR((*(iarg->ptail))));
8367     }
8368   }
8369 
8370   return 0;
8371 }
8372 
8373 static int
prs_exp(Node ** np,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env,int group_head)8374 prs_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,
8375         ScanEnv* env, int group_head)
8376 {
8377   int r, len, group;
8378   Node* qn;
8379   Node** tp;
8380   unsigned int parse_depth;
8381 
8382  retry:
8383   group = 0;
8384   *np = NULL;
8385   if (tok->type == (enum TokenSyms )term)
8386     goto end_of_token;
8387 
8388   parse_depth = env->parse_depth;
8389 
8390   switch (tok->type) {
8391   case TK_ALT:
8392   case TK_EOT:
8393   end_of_token:
8394     *np = node_new_empty();
8395     CHECK_NULL_RETURN_MEMERR(*np);
8396     return tok->type;
8397   break;
8398 
8399   case TK_SUBEXP_OPEN:
8400     r = prs_bag(np, tok, TK_SUBEXP_CLOSE, src, end, env);
8401     if (r < 0) return r;
8402     if (r == 1) { /* group */
8403       if (group_head == 0)
8404         group = 1;
8405       else {
8406         Node* target = *np;
8407         *np = node_new_group(target);
8408         if (IS_NULL(*np)) {
8409           onig_node_free(target);
8410           return ONIGERR_MEMORY;
8411         }
8412         group = 2;
8413       }
8414     }
8415     else if (r == 2) { /* option only */
8416       if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH)) {
8417         env->options = BAG_(*np)->o.options;
8418         r = fetch_token(tok, src, end, env);
8419         if (r < 0) return r;
8420         onig_node_free(*np);
8421         goto retry;
8422       }
8423       else {
8424         Node* target;
8425         OnigOptionType prev = env->options;
8426 
8427         env->options = BAG_(*np)->o.options;
8428         r = fetch_token(tok, src, end, env);
8429         if (r < 0) return r;
8430         r = prs_alts(&target, tok, term, src, end, env, FALSE);
8431         env->options = prev;
8432         if (r < 0) {
8433           onig_node_free(target);
8434           return r;
8435         }
8436         NODE_BODY(*np) = target;
8437       }
8438       return tok->type;
8439     }
8440     break;
8441 
8442   case TK_SUBEXP_CLOSE:
8443     if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
8444       return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
8445 
8446     if (tok->escaped) goto tk_crude_byte;
8447     else goto tk_byte;
8448     break;
8449 
8450   case TK_STRING:
8451   tk_byte:
8452     {
8453       *np = node_new_str_with_options(tok->backp, *src, env->options);
8454       CHECK_NULL_RETURN_MEMERR(*np);
8455 
8456       while (1) {
8457         r = fetch_token(tok, src, end, env);
8458         if (r < 0) return r;
8459         if (r != TK_STRING) break;
8460 
8461         r = onig_node_str_cat(*np, tok->backp, *src);
8462         if (r < 0) return r;
8463       }
8464 
8465     string_end:
8466       tp = np;
8467       goto repeat;
8468     }
8469     break;
8470 
8471   case TK_CRUDE_BYTE:
8472   tk_crude_byte:
8473     {
8474       *np = node_new_str_crude_char(tok->u.byte, env->options);
8475       CHECK_NULL_RETURN_MEMERR(*np);
8476       len = 1;
8477       while (1) {
8478         if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
8479           if (len == enclen(env->enc, STR_(*np)->s)) {
8480             r = fetch_token(tok, src, end, env);
8481             goto tk_crude_byte_end;
8482           }
8483         }
8484 
8485         r = fetch_token(tok, src, end, env);
8486         if (r < 0) return r;
8487         if (r != TK_CRUDE_BYTE)
8488           return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
8489 
8490         r = node_str_cat_char(*np, tok->u.byte);
8491         if (r < 0) return r;
8492 
8493         len++;
8494       }
8495 
8496     tk_crude_byte_end:
8497       if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end))
8498         return ONIGERR_INVALID_WIDE_CHAR_VALUE;
8499 
8500       NODE_STRING_CLEAR_CRUDE(*np);
8501       goto string_end;
8502     }
8503     break;
8504 
8505   case TK_CODE_POINT:
8506     {
8507       UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
8508       len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code);
8509       if (len < 0) return len;
8510       len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
8511 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
8512       *np = node_new_str_crude(buf, buf + len, env->options);
8513 #else
8514       *np = node_new_str_with_options(buf, buf + len, env->options);
8515 #endif
8516       CHECK_NULL_RETURN_MEMERR(*np);
8517     }
8518     break;
8519 
8520   case TK_QUOTE_OPEN:
8521     {
8522       OnigCodePoint end_op[2];
8523       UChar *qstart, *qend, *nextp;
8524 
8525       end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
8526       end_op[1] = (OnigCodePoint )'E';
8527       qstart = *src;
8528       qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
8529       if (IS_NULL(qend)) {
8530         nextp = qend = end;
8531       }
8532       *np = node_new_str_with_options(qstart, qend, env->options);
8533       CHECK_NULL_RETURN_MEMERR(*np);
8534       *src = nextp;
8535     }
8536     break;
8537 
8538   case TK_CHAR_TYPE:
8539     {
8540       switch (tok->u.prop.ctype) {
8541       case ONIGENC_CTYPE_WORD:
8542         *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not, env->options);
8543         CHECK_NULL_RETURN_MEMERR(*np);
8544         break;
8545 
8546       case ONIGENC_CTYPE_SPACE:
8547       case ONIGENC_CTYPE_DIGIT:
8548       case ONIGENC_CTYPE_XDIGIT:
8549         {
8550           CClassNode* cc;
8551 
8552           *np = node_new_cclass();
8553           CHECK_NULL_RETURN_MEMERR(*np);
8554           cc = CCLASS_(*np);
8555           r = add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env);
8556           if (r != 0) {
8557             onig_node_free(*np);
8558             *np = NULL_NODE;
8559             return r;
8560           }
8561           if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
8562         }
8563         break;
8564 
8565       default:
8566         return ONIGERR_PARSER_BUG;
8567         break;
8568       }
8569     }
8570     break;
8571 
8572   case TK_CHAR_PROPERTY:
8573     r = prs_char_property(np, tok, src, end, env);
8574     if (r != 0) return r;
8575     break;
8576 
8577   case TK_OPEN_CC:
8578     {
8579       CClassNode* cc;
8580 
8581       r = prs_cc(np, tok, src, end, env);
8582       if (r != 0) return r;
8583 
8584       cc = CCLASS_(*np);
8585       if (OPTON_IGNORECASE(env->options)) {
8586         IApplyCaseFoldArg iarg;
8587 
8588         iarg.env      = env;
8589         iarg.cc       = cc;
8590         iarg.alt_root = NULL_NODE;
8591         iarg.ptail    = &(iarg.alt_root);
8592 
8593         r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
8594                                         i_apply_case_fold, &iarg);
8595         if (r != 0) {
8596           onig_node_free(iarg.alt_root);
8597           return r;
8598         }
8599         if (IS_NOT_NULL(iarg.alt_root)) {
8600           Node* work = onig_node_new_alt(*np, iarg.alt_root);
8601           if (IS_NULL(work)) {
8602             onig_node_free(iarg.alt_root);
8603             return ONIGERR_MEMORY;
8604           }
8605           *np = work;
8606         }
8607       }
8608     }
8609     break;
8610 
8611   case TK_ANYCHAR:
8612     *np = node_new_anychar(env->options);
8613     CHECK_NULL_RETURN_MEMERR(*np);
8614     break;
8615 
8616   case TK_ANYCHAR_ANYTIME:
8617     *np = node_new_anychar(env->options);
8618     CHECK_NULL_RETURN_MEMERR(*np);
8619     qn = node_new_quantifier(0, INFINITE_REPEAT, FALSE);
8620     CHECK_NULL_RETURN_MEMERR(qn);
8621     NODE_BODY(qn) = *np;
8622     *np = qn;
8623     break;
8624 
8625   case TK_BACKREF:
8626     len = tok->u.backref.num;
8627     *np = node_new_backref(len,
8628                   (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
8629                   tok->u.backref.by_name,
8630 #ifdef USE_BACKREF_WITH_LEVEL
8631                            tok->u.backref.exist_level,
8632                            tok->u.backref.level,
8633 #endif
8634                            env);
8635     CHECK_NULL_RETURN_MEMERR(*np);
8636     break;
8637 
8638 #ifdef USE_CALL
8639   case TK_CALL:
8640     {
8641       int gnum = tok->u.call.gnum;
8642 
8643       *np = node_new_call(tok->u.call.name, tok->u.call.name_end,
8644                           gnum, tok->u.call.by_number);
8645       CHECK_NULL_RETURN_MEMERR(*np);
8646       env->num_call++;
8647       if (tok->u.call.by_number != 0 && gnum == 0) {
8648         env->has_call_zero = 1;
8649       }
8650     }
8651     break;
8652 #endif
8653 
8654   case TK_ANCHOR:
8655     *np = node_new_anchor_with_options(tok->u.anchor, env->options);
8656     CHECK_NULL_RETURN_MEMERR(*np);
8657     break;
8658 
8659   case TK_REPEAT:
8660   case TK_INTERVAL:
8661     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
8662       if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
8663         return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
8664       else {
8665         *np = node_new_empty();
8666         CHECK_NULL_RETURN_MEMERR(*np);
8667       }
8668     }
8669     else {
8670       goto tk_byte;
8671     }
8672     break;
8673 
8674   case TK_KEEP:
8675     r = node_new_keep(np, env);
8676     if (r < 0) return r;
8677     break;
8678 
8679   case TK_GENERAL_NEWLINE:
8680     r = node_new_general_newline(np, env);
8681     if (r < 0) return r;
8682     break;
8683 
8684   case TK_NO_NEWLINE:
8685     r = node_new_no_newline(np, env);
8686     if (r < 0) return r;
8687     break;
8688 
8689   case TK_TRUE_ANYCHAR:
8690     r = node_new_true_anychar(np);
8691     if (r < 0) return r;
8692     break;
8693 
8694   case TK_TEXT_SEGMENT:
8695     r = make_text_segment(np, env);
8696     if (r < 0) return r;
8697     break;
8698 
8699   default:
8700     return ONIGERR_PARSER_BUG;
8701     break;
8702   }
8703 
8704   {
8705     tp = np;
8706 
8707   re_entry:
8708     r = fetch_token(tok, src, end, env);
8709     if (r < 0) return r;
8710 
8711   repeat:
8712     if (r == TK_REPEAT || r == TK_INTERVAL) {
8713       Node* target;
8714 
8715       if (is_invalid_quantifier_target(*tp))
8716         return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
8717 
8718       INC_PARSE_DEPTH(parse_depth);
8719 
8720       qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
8721                                r == TK_INTERVAL);
8722       CHECK_NULL_RETURN_MEMERR(qn);
8723       QUANT_(qn)->greedy = tok->u.repeat.greedy;
8724       if (group == 2) {
8725         target = node_drop_group(*tp);
8726         *tp = NULL_NODE;
8727       }
8728       else {
8729         target = *tp;
8730       }
8731       r = assign_quantifier_body(qn, target, group, env);
8732       if (r < 0) {
8733         onig_node_free(qn);
8734         *tp = NULL_NODE;
8735         return r;
8736       }
8737 
8738       if (tok->u.repeat.possessive != 0) {
8739         Node* en;
8740         en = node_new_bag(BAG_STOP_BACKTRACK);
8741         if (IS_NULL(en)) {
8742           onig_node_free(qn);
8743           return ONIGERR_MEMORY;
8744         }
8745         NODE_BODY(en) = qn;
8746         qn = en;
8747       }
8748 
8749       if (r == 0) {
8750         *tp = qn;
8751       }
8752       else if (r == 1) { /* x{1,1} ==> x */
8753         onig_node_free(qn);
8754         *tp = target;
8755       }
8756       else if (r == 2) { /* split case: /abc+/ */
8757         Node *tmp;
8758 
8759         *tp = node_new_list(*tp, NULL);
8760         if (IS_NULL(*tp)) {
8761           onig_node_free(qn);
8762           return ONIGERR_MEMORY;
8763         }
8764         tmp = NODE_CDR(*tp) = node_new_list(qn, NULL);
8765         if (IS_NULL(tmp)) {
8766           onig_node_free(qn);
8767           return ONIGERR_MEMORY;
8768         }
8769         tp = &(NODE_CAR(tmp));
8770       }
8771       group = 0;
8772       goto re_entry;
8773     }
8774   }
8775 
8776   return r;
8777 }
8778 
8779 static int
prs_branch(Node ** top,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env,int group_head)8780 prs_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end,
8781            ScanEnv* env, int group_head)
8782 {
8783   int r;
8784   Node *node, **headp;
8785 
8786   *top = NULL;
8787   INC_PARSE_DEPTH(env->parse_depth);
8788 
8789   r = prs_exp(&node, tok, term, src, end, env, group_head);
8790   if (r < 0) {
8791     onig_node_free(node);
8792     return r;
8793   }
8794 
8795   if (r == TK_EOT || r == term || r == TK_ALT) {
8796     *top = node;
8797   }
8798   else {
8799     *top = node_new_list(node, NULL);
8800     if (IS_NULL(*top)) {
8801       onig_node_free(node);
8802       return ONIGERR_MEMORY;
8803     }
8804 
8805     headp = &(NODE_CDR(*top));
8806     while (r != TK_EOT && r != term && r != TK_ALT) {
8807       r = prs_exp(&node, tok, term, src, end, env, FALSE);
8808       if (r < 0) {
8809         onig_node_free(node);
8810         return r;
8811       }
8812 
8813       if (NODE_TYPE(node) == NODE_LIST) {
8814         *headp = node;
8815         while (IS_NOT_NULL(NODE_CDR(node))) node = NODE_CDR(node);
8816         headp = &(NODE_CDR(node));
8817       }
8818       else {
8819         *headp = node_new_list(node, NULL);
8820         headp = &(NODE_CDR(*headp));
8821       }
8822     }
8823   }
8824 
8825   DEC_PARSE_DEPTH(env->parse_depth);
8826   return r;
8827 }
8828 
8829 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
8830 static int
prs_alts(Node ** top,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env,int group_head)8831 prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end,
8832          ScanEnv* env, int group_head)
8833 {
8834   int r;
8835   Node *node, **headp;
8836   OnigOptionType save_options;
8837 
8838   *top = NULL;
8839   INC_PARSE_DEPTH(env->parse_depth);
8840   save_options = env->options;
8841 
8842   r = prs_branch(&node, tok, term, src, end, env, group_head);
8843   if (r < 0) {
8844     onig_node_free(node);
8845     return r;
8846   }
8847 
8848   if (r == term) {
8849     *top = node;
8850   }
8851   else if (r == TK_ALT) {
8852     *top  = onig_node_new_alt(node, NULL);
8853     if (IS_NULL(*top)) {
8854       onig_node_free(node);
8855       return ONIGERR_MEMORY;
8856     }
8857 
8858     headp = &(NODE_CDR(*top));
8859     while (r == TK_ALT) {
8860       r = fetch_token(tok, src, end, env);
8861       if (r < 0) return r;
8862       r = prs_branch(&node, tok, term, src, end, env, FALSE);
8863       if (r < 0) {
8864         onig_node_free(node);
8865         return r;
8866       }
8867       *headp = onig_node_new_alt(node, NULL);
8868       if (IS_NULL(*headp)) {
8869         onig_node_free(node);
8870         onig_node_free(*top);
8871         return ONIGERR_MEMORY;
8872       }
8873 
8874       headp = &(NODE_CDR(*headp));
8875     }
8876 
8877     if (tok->type != (enum TokenSyms )term)
8878       goto err;
8879   }
8880   else {
8881     onig_node_free(node);
8882   err:
8883     if (term == TK_SUBEXP_CLOSE)
8884       return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
8885     else
8886       return ONIGERR_PARSER_BUG;
8887   }
8888 
8889   env->options = save_options;
8890   DEC_PARSE_DEPTH(env->parse_depth);
8891   return r;
8892 }
8893 
8894 static int
prs_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)8895 prs_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
8896 {
8897   int r;
8898   PToken tok;
8899 
8900   ptoken_init(&tok);
8901   r = fetch_token(&tok, src, end, env);
8902   if (r < 0) return r;
8903   r = prs_alts(top, &tok, TK_EOT, src, end, env, FALSE);
8904   if (r < 0) return r;
8905 
8906   return 0;
8907 }
8908 
8909 #ifdef USE_CALL
8910 static int
make_call_zero_body(Node * node,ScanEnv * env,Node ** rnode)8911 make_call_zero_body(Node* node, ScanEnv* env, Node** rnode)
8912 {
8913   int r;
8914 
8915   Node* x = node_new_memory(0 /* 0: is not named */);
8916   CHECK_NULL_RETURN_MEMERR(x);
8917 
8918   NODE_BODY(x) = node;
8919   BAG_(x)->m.regnum = 0;
8920   r = scan_env_set_mem_node(env, 0, x);
8921   if (r != 0) {
8922     onig_node_free(x);
8923     return r;
8924   }
8925 
8926   *rnode = x;
8927   return 0;
8928 }
8929 #endif
8930 
8931 extern int
onig_parse_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)8932 onig_parse_tree(Node** root, const UChar* pattern, const UChar* end,
8933                 regex_t* reg, ScanEnv* env)
8934 {
8935   int r;
8936   UChar* p;
8937 #ifdef USE_CALLOUT
8938   RegexExt* ext;
8939 #endif
8940 
8941   reg->string_pool        = 0;
8942   reg->string_pool_end    = 0;
8943   reg->num_mem            = 0;
8944   reg->num_repeat         = 0;
8945   reg->num_empty_check    = 0;
8946   reg->repeat_range_alloc = 0;
8947   reg->repeat_range       = (RepeatRange* )NULL;
8948   reg->empty_status_mem   = 0;
8949 
8950   names_clear(reg);
8951 
8952   scan_env_clear(env);
8953   env->options        = reg->options;
8954   env->case_fold_flag = reg->case_fold_flag;
8955   env->enc            = reg->enc;
8956   env->syntax         = reg->syntax;
8957   env->pattern        = (UChar* )pattern;
8958   env->pattern_end    = (UChar* )end;
8959   env->reg            = reg;
8960 
8961   *root = NULL;
8962 
8963   if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, pattern, end))
8964     return ONIGERR_INVALID_WIDE_CHAR_VALUE;
8965 
8966   p = (UChar* )pattern;
8967   r = prs_regexp(root, &p, (UChar* )end, env);
8968   if (r != 0) return r;
8969 
8970 #ifdef USE_CALL
8971   if (env->has_call_zero != 0) {
8972     Node* zero_node;
8973     r = make_call_zero_body(*root, env, &zero_node);
8974     if (r != 0) return r;
8975 
8976     *root = zero_node;
8977   }
8978 #endif
8979 
8980   reg->num_mem = env->num_mem;
8981 
8982 #ifdef USE_CALLOUT
8983   ext = reg->extp;
8984   if (IS_NOT_NULL(ext) && ext->callout_num > 0) {
8985     r = setup_ext_callout_list_values(reg);
8986   }
8987 #endif
8988 
8989   return r;
8990 }
8991 
8992 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)8993 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
8994                                UChar* arg, UChar* arg_end)
8995 {
8996   env->error     = arg;
8997   env->error_end = arg_end;
8998 }
8999