1 /**********************************************************************
2 regparse.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2020 K.Kosako
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #ifdef DEBUG_NODE_FREE
31 #ifndef NEED_TO_INCLUDE_STDIO
32 #define NEED_TO_INCLUDE_STDIO
33 #endif
34 #endif
35
36 #include "regparse.h"
37 #include "st.h"
38
/* Sizing constants; their users are outside this part of the file. */
#define INIT_TAG_NAMES_ALLOC_NUM    5
#define WARN_BUFSIZE                256

/* Compile-time feature switch (defined with no value). */
#define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
44
/*
 * Character predicates for callout names and callout tag names.
 * Both accept ASCII letters, ASCII digits and '_'.
 *
 * The argument is parenthesized at every use so that an expression argument
 * (for example a conditional expression) is evaluated as a unit.
 * NOTE: `c` is still evaluated several times; do not pass an expression
 * with side effects.
 */
#define IS_ALLOWED_CODE_IN_CALLOUT_NAME(c) \
  (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
   ((c) >= '0' && (c) <= '9') || (c) == '_' /* || (c) == '!' */)
#define IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c) \
  (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
   ((c) >= '0' && (c) <= '9') || (c) == '_')
49
/*
 * Option tests.  Each macro masks `option` with one or more ONIG_OPTION_*
 * bits; the result is non-zero when any of the tested bits is set.
 */
#define OPTON_SINGLELINE(option)         ((option) & ONIG_OPTION_SINGLELINE)
#define OPTON_MULTILINE(option)          ((option) & ONIG_OPTION_MULTILINE)
#define OPTON_IGNORECASE(option)         ((option) & ONIG_OPTION_IGNORECASE)
#define OPTON_EXTEND(option)             ((option) & ONIG_OPTION_EXTEND)

/* The *_ASCII tests are also satisfied by ONIG_OPTION_POSIX_IS_ASCII. */
#define OPTON_WORD_ASCII(option) \
  ((option) & (ONIG_OPTION_WORD_IS_ASCII  | ONIG_OPTION_POSIX_IS_ASCII))
#define OPTON_DIGIT_ASCII(option) \
  ((option) & (ONIG_OPTION_DIGIT_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII))
#define OPTON_SPACE_ASCII(option) \
  ((option) & (ONIG_OPTION_SPACE_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII))

#define OPTON_POSIX_ASCII(option)        ((option) & ONIG_OPTION_POSIX_IS_ASCII)
#define OPTON_TEXT_SEGMENT_WORD(option)  ((option) & ONIG_OPTION_TEXT_SEGMENT_WORD)
62
/*
 * Non-zero when `ctype` must be handled in ASCII mode under `options`:
 *   - any non-negative ctype below ONIGENC_CTYPE_ASCII with POSIX-ASCII on, or
 *   - WORD / DIGIT / SPACE with the matching *_ASCII test satisfied.
 * A negative ctype never qualifies.  `ctype` is evaluated more than once.
 */
#define OPTON_IS_ASCII_MODE_CTYPE(ctype, options) \
  ((ctype) >= 0 && \
   (((ctype) <  ONIGENC_CTYPE_ASCII && OPTON_POSIX_ASCII(options)) || \
    ((ctype) == ONIGENC_CTYPE_WORD  && OPTON_WORD_ASCII(options))  || \
    ((ctype) == ONIGENC_CTYPE_DIGIT && OPTON_DIGIT_ASCII(options)) || \
    ((ctype) == ONIGENC_CTYPE_SPACE && OPTON_SPACE_ASCII(options))))
69
70
71 OnigSyntaxType OnigSyntaxOniguruma = {
72 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
73 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
74 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL |
75 ONIG_SYN_OP_ESC_CONTROL_CHARS |
76 ONIG_SYN_OP_ESC_C_CONTROL )
77 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
78 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
79 ONIG_SYN_OP2_OPTION_ONIGURUMA |
80 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
81 ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE |
82 ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP |
83 ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS |
84 ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME |
85 ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT |
86 ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE |
87 ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT |
88 ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
89 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
90 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
91 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
92 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
93 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
94 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
95 ONIG_SYN_OP2_ESC_H_XDIGIT | ONIG_SYN_OP2_ESC_U_HEX4 )
96 , ( SYN_GNU_REGEX_BV |
97 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
98 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
99 ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND |
100 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
101 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
102 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
103 ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC |
104 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
105 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
106 , ONIG_OPTION_NONE
107 ,
108 {
109 (OnigCodePoint )'\\' /* esc */
110 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
111 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
112 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
113 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
114 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
115 }
116 };
117
118 OnigSyntaxType OnigSyntaxRuby = {
119 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
120 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
121 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL |
122 ONIG_SYN_OP_ESC_CONTROL_CHARS |
123 ONIG_SYN_OP_ESC_C_CONTROL )
124 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
125 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
126 ONIG_SYN_OP2_OPTION_RUBY |
127 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
128 ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE |
129 ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP |
130 ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT |
131 ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE |
132 ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
133 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
134 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
135 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
136 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
137 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
138 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
139 ONIG_SYN_OP2_ESC_H_XDIGIT | ONIG_SYN_OP2_ESC_U_HEX4 )
140 , ( SYN_GNU_REGEX_BV |
141 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
142 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
143 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
144 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
145 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
146 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
147 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
148 , ONIG_OPTION_NONE
149 ,
150 {
151 (OnigCodePoint )'\\' /* esc */
152 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
153 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
154 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
155 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
156 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
157 }
158 };
159
160 OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_ONIGURUMA;
161
/* State tags; values are the implicit 0..3, written out explicitly here.
   NOTE(review): presumably the character-class parser state -- the users
   are outside this part of the file. */
typedef enum {
  CS_VALUE    = 0,
  CS_RANGE    = 1,
  CS_COMPLETE = 2,
  CS_START    = 3
} CSTATE;
168
/* Value-kind tags; values are the implicit 0..3, written out explicitly here.
   NOTE(review): presumably the kind of the current character-class value --
   the users are outside this part of the file. */
typedef enum {
  CV_UNDEF = 0,
  CV_SB    = 1,
  CV_MB    = 2,
  CV_CPROP = 3
} CVAL;
175
/* Warning handler that ignores its message.  It is also the sentinel that
   onig_warning() compares against to decide that warnings are disabled. */
extern void
onig_null_warn(const char* s ARG_UNUSED)
{
  /* nothing to do */
}
177
178 #ifdef DEFAULT_WARN_FUNCTION
179 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
180 #else
181 static OnigWarnFunc onig_warn = onig_null_warn;
182 #endif
183
184 #ifdef DEFAULT_VERB_WARN_FUNCTION
185 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
186 #else
187 static OnigWarnFunc onig_verb_warn = onig_null_warn;
188 #endif
189
/* Install `f` as the warning handler.  The pointer is stored as given,
   without validation. */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  onig_warn = f;
}
194
/* Install `f` as the verbose-warning handler.  The pointer is stored as
   given, without validation. */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  onig_verb_warn = f;
}
199
200 extern void
onig_warning(const char * s)201 onig_warning(const char* s)
202 {
203 if (onig_warn == onig_null_warn) return ;
204
205 (*onig_warn)(s);
206 }
207
#define DEFAULT_MAX_CAPTURE_NUM   32767

/* Upper limit on the number of captures; adjustable at run time. */
static int MaxCaptureNum = DEFAULT_MAX_CAPTURE_NUM;

/*
 * Set the capture number limit.
 * Returns 0 on success, or -1 (limit unchanged) when `num` is negative.
 */
extern int
onig_set_capture_num_limit(int num)
{
  if (num >= 0) {
    MaxCaptureNum = num;
    return 0;
  }

  return -1;
}
220
221 static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
222
223 extern unsigned int
onig_get_parse_depth_limit(void)224 onig_get_parse_depth_limit(void)
225 {
226 return ParseDepthLimit;
227 }
228
229 extern int
onig_set_parse_depth_limit(unsigned int depth)230 onig_set_parse_depth_limit(unsigned int depth)
231 {
232 if (depth == 0)
233 ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
234 else
235 ParseDepthLimit = depth;
236 return 0;
237 }
238
/*
 * INC_PARSE_DEPTH(d): increment the depth counter `d` and make the
 * *enclosing function* return ONIGERR_PARSE_DEPTH_LIMIT_OVER when the new
 * value exceeds ParseDepthLimit.  Note the hidden `return`.
 * With ONIG_DEBUG_PARSE the highest depth seen is also recorded in
 * env->max_parse_depth, so a variable named `env` must be in scope.
 */
#ifdef ONIG_DEBUG_PARSE
#define INC_PARSE_DEPTH(d) do {\
  (d)++;\
  if ((d) > env->max_parse_depth) env->max_parse_depth = (d);\
  if (ParseDepthLimit < (d)) \
    return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\
} while (0)
#else
#define INC_PARSE_DEPTH(d) do {\
  (d)++;\
  if (ParseDepthLimit < (d)) \
    return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\
} while (0)
#endif

/* DEC_PARSE_DEPTH(d): undo one INC_PARSE_DEPTH(d). */
#define DEC_PARSE_DEPTH(d)  (d)--
255
256
257 static int
bbuf_init(BBuf * buf,int size)258 bbuf_init(BBuf* buf, int size)
259 {
260 if (size <= 0) {
261 size = 0;
262 buf->p = NULL;
263 }
264 else {
265 buf->p = (UChar* )xmalloc(size);
266 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
267 }
268
269 buf->alloc = size;
270 buf->used = 0;
271 return 0;
272 }
273
274 static void
bbuf_free(BBuf * bbuf)275 bbuf_free(BBuf* bbuf)
276 {
277 if (IS_NOT_NULL(bbuf)) {
278 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
279 xfree(bbuf);
280 }
281 }
282
283 static int
bbuf_clone(BBuf ** rto,BBuf * from)284 bbuf_clone(BBuf** rto, BBuf* from)
285 {
286 int r;
287 BBuf *to;
288
289 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
290 CHECK_NULL_RETURN_MEMERR(to);
291 r = BB_INIT(to, from->alloc);
292 if (r != 0) {
293 bbuf_free(to);
294 *rto = 0;
295 return r;
296 }
297 to->used = from->used;
298 xmemcpy(to->p, from->p, from->used);
299 return 0;
300 }
301
302 static int
backref_rel_to_abs(int rel_no,ScanEnv * env)303 backref_rel_to_abs(int rel_no, ScanEnv* env)
304 {
305 if (rel_no > 0) {
306 if (rel_no > ONIG_INT_MAX - env->num_mem)
307 return ONIGERR_INVALID_BACKREF;
308 return env->num_mem + rel_no;
309 }
310 else {
311 return env->num_mem + 1 + rel_no;
312 }
313 }
314
/*
 * Option word helpers.  `v` must be a modifiable lvalue; each macro yields
 * the updated value of `v`.
 *   OPTION_ON(v,f)               set the bits in f
 *   OPTION_OFF(v,f)              clear the bits in f
 *   OPTION_NEGATE(v,f,negative)  clear f when `negative` is non-zero,
 *                                set f otherwise
 * Every expansion is fully parenthesized so it can be used inside a larger
 * expression without being re-associated by operator precedence.
 */
#define OPTION_ON(v,f)     ((v) |= (f))
#define OPTION_OFF(v,f)    ((v) &= ~(f))

#define OPTION_NEGATE(v,f,negative) \
  ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
319
/* First code point of the multi-byte range for `enc`: 0 when the minimum
   character length is more than one byte, 0x80 otherwise. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) <= 1 ? 0x80 : 0)

/* Add the range [MBCODE_START_POS(enc), maximum code point] to `pbuf`;
   evaluates to the result of add_code_range_to_buf(). */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/* Statement form for `mbuf`; does nothing for single-byte encodings.
   Assigns to a variable named `r` in the caller's scope and makes the
   *enclosing function* return `r` when it is non-zero. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r != 0) return r;\
  }\
} while (0)
332
333
/* Set `empty` to 1 when all BITSET_REAL_SIZE words of `bs` are zero, else
   to 0.  The expansion declares its own local `i`, so neither argument
   should be an expression that refers to a caller variable named `i`. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i = 0;\
  empty = 1;\
  while (i < (int )BITSET_REAL_SIZE) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
    i++;\
  }\
} while (0)
343
344 static void
bitset_set_range(BitSetRef bs,int from,int to)345 bitset_set_range(BitSetRef bs, int from, int to)
346 {
347 int i;
348 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
349 BITSET_SET_BIT(bs, i);
350 }
351 }
352
353 static void
bitset_invert(BitSetRef bs)354 bitset_invert(BitSetRef bs)
355 {
356 int i;
357 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { bs[i] = ~(bs[i]); }
358 }
359
360 static void
bitset_invert_to(BitSetRef from,BitSetRef to)361 bitset_invert_to(BitSetRef from, BitSetRef to)
362 {
363 int i;
364 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { to[i] = ~(from[i]); }
365 }
366
367 static void
bitset_and(BitSetRef dest,BitSetRef bs)368 bitset_and(BitSetRef dest, BitSetRef bs)
369 {
370 int i;
371 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] &= bs[i]; }
372 }
373
374 static void
bitset_or(BitSetRef dest,BitSetRef bs)375 bitset_or(BitSetRef dest, BitSetRef bs)
376 {
377 int i;
378 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] |= bs[i]; }
379 }
380
381 static void
bitset_copy(BitSetRef dest,BitSetRef bs)382 bitset_copy(BitSetRef dest, BitSetRef bs)
383 {
384 int i;
385 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] = bs[i]; }
386 }
387
388 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)389 onig_strncmp(const UChar* s1, const UChar* s2, int n)
390 {
391 int x;
392
393 while (n-- > 0) {
394 x = *s2++ - *s1++;
395 if (x) return x;
396 }
397 return 0;
398 }
399
400 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)401 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
402 {
403 int len = (int )(end - src);
404 if (len > 0) {
405 xmemcpy(dest, src, len);
406 dest[len] = (UChar )0;
407 }
408 }
409
410 /* scan pattern methods */
/*
 * Pattern scanning helpers.  They expect these names in the caller's scope:
 *   p    current position,  end   end of the pattern,  enc   the encoding,
 *   pfetch_prev   declared with PFETCH_READY (needed by PINC / PFETCH /
 *                 PUNFETCH; the *_S variants do not record it).
 */
#define PEND_VALUE   0   /* value PPEEK yields at the end of the pattern */

#define PFETCH_READY  UChar* pfetch_prev
#define PEND         (p < end ?  0 : 1)   /* 1 at the end of the pattern */
#define PUNFETCH     p = pfetch_prev      /* step back to the recorded position */

/* advance by one character, recording the previous position */
#define PINC       do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* read one character into c and advance, recording the previous position */
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* same as PINC / PFETCH but without recording the previous position */
#define PINC_S     do { \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
#define PFETCH_S(c) do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* look at the next character without advancing */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* the argument is parenthesized so the cast applies to the whole expression */
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )(c))
436
437 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)438 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
439 int capa)
440 {
441 UChar* r;
442 ptrdiff_t dest_delta = dest_end - dest;
443
444 if (dest)
445 r = (UChar* )xrealloc(dest, capa + 1);
446 else
447 r = (UChar* )xmalloc(capa + 1);
448
449 CHECK_NULL_RETURN(r);
450 onig_strcpy(r + dest_delta, src, src_end);
451 return r;
452 }
453
454 /* dest on static area */
455 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)456 strcat_capa_from_static(UChar* dest, UChar* dest_end,
457 const UChar* src, const UChar* src_end, int capa)
458 {
459 UChar* r;
460
461 r = (UChar* )xmalloc(capa + 1);
462 CHECK_NULL_RETURN(r);
463 onig_strcpy(r, dest, dest_end);
464 onig_strcpy(r + (dest_end - dest), src, src_end);
465 return r;
466 }
467
468
469 #ifdef USE_ST_LIBRARY
470
471 typedef struct {
472 UChar* s;
473 UChar* end;
474 } st_str_end_key;
475
476 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)477 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
478 {
479 UChar *p, *q;
480 int c;
481
482 if ((x->end - x->s) != (y->end - y->s))
483 return 1;
484
485 p = x->s;
486 q = y->s;
487 while (p < x->end) {
488 c = (int )*p - (int )*q;
489 if (c != 0) return c;
490
491 p++; q++;
492 }
493
494 return 0;
495 }
496
497 static int
str_end_hash(st_str_end_key * x)498 str_end_hash(st_str_end_key* x)
499 {
500 UChar *p;
501 unsigned val = 0;
502
503 p = x->s;
504 while (p < x->end) {
505 val = val * 997 + (unsigned )*p++;
506 }
507
508 return (int) (val + (val >> 5));
509 }
510
511 extern hash_table_type
onig_st_init_strend_table_with_size(int size)512 onig_st_init_strend_table_with_size(int size)
513 {
514 static struct st_hash_type hashType = {
515 str_end_cmp,
516 str_end_hash,
517 };
518
519 return (hash_table_type )onig_st_init_table_with_size(&hashType, size);
520 }
521
522 extern int
onig_st_lookup_strend(hash_table_type table,const UChar * str_key,const UChar * end_key,hash_data_type * value)523 onig_st_lookup_strend(hash_table_type table, const UChar* str_key,
524 const UChar* end_key, hash_data_type *value)
525 {
526 st_str_end_key key;
527
528 key.s = (UChar* )str_key;
529 key.end = (UChar* )end_key;
530
531 return onig_st_lookup(table, (st_data_t )(&key), value);
532 }
533
534 extern int
onig_st_insert_strend(hash_table_type table,const UChar * str_key,const UChar * end_key,hash_data_type value)535 onig_st_insert_strend(hash_table_type table, const UChar* str_key,
536 const UChar* end_key, hash_data_type value)
537 {
538 st_str_end_key* key;
539 int result;
540
541 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
542 CHECK_NULL_RETURN_MEMERR(key);
543
544 key->s = (UChar* )str_key;
545 key->end = (UChar* )end_key;
546 result = onig_st_insert(table, (st_data_t )key, value);
547 if (result) {
548 xfree(key);
549 }
550 return result;
551 }
552
553
554 #ifdef USE_CALLOUT
555
556 typedef struct {
557 OnigEncoding enc;
558 int type; /* callout type: single or not */
559 UChar* s;
560 UChar* end;
561 } st_callout_name_key;
562
563 static int
callout_name_table_cmp(st_callout_name_key * x,st_callout_name_key * y)564 callout_name_table_cmp(st_callout_name_key* x, st_callout_name_key* y)
565 {
566 UChar *p, *q;
567 int c;
568
569 if (x->enc != y->enc) return 1;
570 if (x->type != y->type) return 1;
571 if ((x->end - x->s) != (y->end - y->s))
572 return 1;
573
574 p = x->s;
575 q = y->s;
576 while (p < x->end) {
577 c = (int )*p - (int )*q;
578 if (c != 0) return c;
579
580 p++; q++;
581 }
582
583 return 0;
584 }
585
586 static int
callout_name_table_hash(st_callout_name_key * x)587 callout_name_table_hash(st_callout_name_key* x)
588 {
589 UChar *p;
590 unsigned int val = 0;
591
592 p = x->s;
593 while (p < x->end) {
594 val = val * 997 + (unsigned int )*p++;
595 }
596
597 /* use intptr_t for escape warning in Windows */
598 return (int )(val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type);
599 }
600
601 extern hash_table_type
onig_st_init_callout_name_table_with_size(int size)602 onig_st_init_callout_name_table_with_size(int size)
603 {
604 static struct st_hash_type hashType = {
605 callout_name_table_cmp,
606 callout_name_table_hash,
607 };
608
609 return (hash_table_type )onig_st_init_table_with_size(&hashType, size);
610 }
611
612 extern int
onig_st_lookup_callout_name_table(hash_table_type table,OnigEncoding enc,int type,const UChar * str_key,const UChar * end_key,hash_data_type * value)613 onig_st_lookup_callout_name_table(hash_table_type table,
614 OnigEncoding enc,
615 int type,
616 const UChar* str_key,
617 const UChar* end_key,
618 hash_data_type *value)
619 {
620 st_callout_name_key key;
621
622 key.enc = enc;
623 key.type = type;
624 key.s = (UChar* )str_key;
625 key.end = (UChar* )end_key;
626
627 return onig_st_lookup(table, (st_data_t )(&key), value);
628 }
629
630 static int
st_insert_callout_name_table(hash_table_type table,OnigEncoding enc,int type,UChar * str_key,UChar * end_key,hash_data_type value)631 st_insert_callout_name_table(hash_table_type table,
632 OnigEncoding enc, int type,
633 UChar* str_key, UChar* end_key,
634 hash_data_type value)
635 {
636 st_callout_name_key* key;
637 int result;
638
639 key = (st_callout_name_key* )xmalloc(sizeof(st_callout_name_key));
640 CHECK_NULL_RETURN_MEMERR(key);
641
642 /* key->s: don't duplicate, because str_key is duped in callout_name_entry() */
643 key->enc = enc;
644 key->type = type;
645 key->s = str_key;
646 key->end = end_key;
647 result = onig_st_insert(table, (st_data_t )key, value);
648 if (result) {
649 xfree(key);
650 }
651 return result;
652 }
653 #endif
654
655 #endif /* USE_ST_LIBRARY */
656
657
658 #define INIT_NAME_BACKREFS_ALLOC_NUM 8
659
660 typedef struct {
661 UChar* name;
662 int name_len; /* byte length */
663 int back_num; /* number of backrefs */
664 int back_alloc;
665 int back_ref1;
666 int* back_refs;
667 } NameEntry;
668
669 #ifdef USE_ST_LIBRARY
670
671 #define INIT_NAMES_ALLOC_NUM 5
672
673 typedef st_table NameTable;
674 typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */
675
676 #define NAMEBUF_SIZE 24
677 #define NAMEBUF_SIZE_1 25
678
679 #ifdef ONIG_DEBUG
680 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)681 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
682 {
683 int i;
684 FILE* fp = (FILE* )arg;
685
686 fprintf(fp, "%s: ", e->name);
687 if (e->back_num == 0)
688 fputs("-", fp);
689 else if (e->back_num == 1)
690 fprintf(fp, "%d", e->back_ref1);
691 else {
692 for (i = 0; i < e->back_num; i++) {
693 if (i > 0) fprintf(fp, ", ");
694 fprintf(fp, "%d", e->back_refs[i]);
695 }
696 }
697 fputs("\n", fp);
698 return ST_CONTINUE;
699 }
700
701 extern int
onig_print_names(FILE * fp,regex_t * reg)702 onig_print_names(FILE* fp, regex_t* reg)
703 {
704 NameTable* t = (NameTable* )reg->name_table;
705
706 if (IS_NOT_NULL(t)) {
707 fprintf(fp, "name table\n");
708 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
709 fputs("\n", fp);
710 }
711 return 0;
712 }
713 #endif /* ONIG_DEBUG */
714
715 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)716 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
717 {
718 xfree(e->name);
719 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
720 xfree(key);
721 xfree(e);
722 return ST_DELETE;
723 }
724
725 static int
names_clear(regex_t * reg)726 names_clear(regex_t* reg)
727 {
728 NameTable* t = (NameTable* )reg->name_table;
729
730 if (IS_NOT_NULL(t)) {
731 onig_st_foreach(t, i_free_name_entry, 0);
732 }
733 return 0;
734 }
735
736 extern int
onig_names_free(regex_t * reg)737 onig_names_free(regex_t* reg)
738 {
739 int r;
740 NameTable* t;
741
742 r = names_clear(reg);
743 if (r != 0) return r;
744
745 t = (NameTable* )reg->name_table;
746 if (IS_NOT_NULL(t)) onig_st_free_table(t);
747 reg->name_table = (void* )NULL;
748 return 0;
749 }
750
751 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)752 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
753 {
754 NameEntry* e;
755 NameTable* t = (NameTable* )reg->name_table;
756
757 e = (NameEntry* )NULL;
758 if (IS_NOT_NULL(t)) {
759 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
760 }
761 return e;
762 }
763
764 typedef struct {
765 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
766 regex_t* reg;
767 void* arg;
768 int ret;
769 OnigEncoding enc;
770 } INamesArg;
771
772 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)773 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
774 {
775 int r = (*(arg->func))(e->name,
776 e->name + e->name_len,
777 e->back_num,
778 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
779 arg->reg, arg->arg);
780 if (r != 0) {
781 arg->ret = r;
782 return ST_STOP;
783 }
784 return ST_CONTINUE;
785 }
786
787 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)788 onig_foreach_name(regex_t* reg,
789 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
790 {
791 INamesArg narg;
792 NameTable* t = (NameTable* )reg->name_table;
793
794 narg.ret = 0;
795 if (IS_NOT_NULL(t)) {
796 narg.func = func;
797 narg.reg = reg;
798 narg.arg = arg;
799 narg.enc = reg->enc; /* should be pattern encoding. */
800 onig_st_foreach(t, i_names, (HashDataType )&narg);
801 }
802 return narg.ret;
803 }
804
805 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumMap * map)806 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumMap* map)
807 {
808 int i;
809
810 if (e->back_num > 1) {
811 for (i = 0; i < e->back_num; i++) {
812 e->back_refs[i] = map[e->back_refs[i]].new_val;
813 }
814 }
815 else if (e->back_num == 1) {
816 e->back_ref1 = map[e->back_ref1].new_val;
817 }
818
819 return ST_CONTINUE;
820 }
821
822 extern int
onig_renumber_name_table(regex_t * reg,GroupNumMap * map)823 onig_renumber_name_table(regex_t* reg, GroupNumMap* map)
824 {
825 NameTable* t = (NameTable* )reg->name_table;
826
827 if (IS_NOT_NULL(t)) {
828 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
829 }
830 return 0;
831 }
832
833
834 extern int
onig_number_of_names(regex_t * reg)835 onig_number_of_names(regex_t* reg)
836 {
837 NameTable* t = (NameTable* )reg->name_table;
838
839 if (IS_NOT_NULL(t))
840 return t->num_entries;
841 else
842 return 0;
843 }
844
845 #else /* USE_ST_LIBRARY */
846
847 #define INIT_NAMES_ALLOC_NUM 8
848
849 typedef struct {
850 NameEntry* e;
851 int num;
852 int alloc;
853 } NameTable;
854
855 #ifdef ONIG_DEBUG
856 extern int
onig_print_names(FILE * fp,regex_t * reg)857 onig_print_names(FILE* fp, regex_t* reg)
858 {
859 int i, j;
860 NameEntry* e;
861 NameTable* t = (NameTable* )reg->name_table;
862
863 if (IS_NOT_NULL(t) && t->num > 0) {
864 fprintf(fp, "name table\n");
865 for (i = 0; i < t->num; i++) {
866 e = &(t->e[i]);
867 fprintf(fp, "%s: ", e->name);
868 if (e->back_num == 0) {
869 fputs("-", fp);
870 }
871 else if (e->back_num == 1) {
872 fprintf(fp, "%d", e->back_ref1);
873 }
874 else {
875 for (j = 0; j < e->back_num; j++) {
876 if (j > 0) fprintf(fp, ", ");
877 fprintf(fp, "%d", e->back_refs[j]);
878 }
879 }
880 fputs("\n", fp);
881 }
882 fputs("\n", fp);
883 }
884 return 0;
885 }
886 #endif
887
888 static int
names_clear(regex_t * reg)889 names_clear(regex_t* reg)
890 {
891 int i;
892 NameEntry* e;
893 NameTable* t = (NameTable* )reg->name_table;
894
895 if (IS_NOT_NULL(t)) {
896 for (i = 0; i < t->num; i++) {
897 e = &(t->e[i]);
898 if (IS_NOT_NULL(e->name)) {
899 xfree(e->name);
900 e->name = NULL;
901 e->name_len = 0;
902 e->back_num = 0;
903 e->back_alloc = 0;
904 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
905 e->back_refs = (int* )NULL;
906 }
907 }
908 if (IS_NOT_NULL(t->e)) {
909 xfree(t->e);
910 t->e = NULL;
911 }
912 t->num = 0;
913 }
914 return 0;
915 }
916
917 extern int
onig_names_free(regex_t * reg)918 onig_names_free(regex_t* reg)
919 {
920 int r;
921 NameTable* t;
922
923 r = names_clear(reg);
924 if (r != 0) return r;
925
926 t = (NameTable* )reg->name_table;
927 if (IS_NOT_NULL(t)) xfree(t);
928 reg->name_table = NULL;
929 return 0;
930 }
931
932 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)933 name_find(regex_t* reg, UChar* name, UChar* name_end)
934 {
935 int i, len;
936 NameEntry* e;
937 NameTable* t = (NameTable* )reg->name_table;
938
939 if (IS_NOT_NULL(t)) {
940 len = name_end - name;
941 for (i = 0; i < t->num; i++) {
942 e = &(t->e[i]);
943 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
944 return e;
945 }
946 }
947 return (NameEntry* )NULL;
948 }
949
950 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)951 onig_foreach_name(regex_t* reg,
952 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
953 {
954 int i, r;
955 NameEntry* e;
956 NameTable* t = (NameTable* )reg->name_table;
957
958 if (IS_NOT_NULL(t)) {
959 for (i = 0; i < t->num; i++) {
960 e = &(t->e[i]);
961 r = (*func)(e->name, e->name + e->name_len, e->back_num,
962 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
963 reg, arg);
964 if (r != 0) return r;
965 }
966 }
967 return 0;
968 }
969
970 extern int
onig_number_of_names(regex_t * reg)971 onig_number_of_names(regex_t* reg)
972 {
973 NameTable* t = (NameTable* )reg->name_table;
974
975 if (IS_NOT_NULL(t))
976 return t->num;
977 else
978 return 0;
979 }
980
981 #endif /* else USE_ST_LIBRARY */
982
983 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)984 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
985 {
986 int r;
987 int alloc;
988 NameEntry* e;
989 NameTable* t = (NameTable* )reg->name_table;
990
991 if (name_end - name <= 0)
992 return ONIGERR_EMPTY_GROUP_NAME;
993
994 e = name_find(reg, name, name_end);
995 if (IS_NULL(e)) {
996 #ifdef USE_ST_LIBRARY
997 if (IS_NULL(t)) {
998 t = onig_st_init_strend_table_with_size(INIT_NAMES_ALLOC_NUM);
999 CHECK_NULL_RETURN_MEMERR(t);
1000 reg->name_table = (void* )t;
1001 }
1002 e = (NameEntry* )xmalloc(sizeof(NameEntry));
1003 CHECK_NULL_RETURN_MEMERR(e);
1004
1005 e->name = onigenc_strdup(reg->enc, name, name_end);
1006 if (IS_NULL(e->name)) {
1007 xfree(e); return ONIGERR_MEMORY;
1008 }
1009 r = onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
1010 (HashDataType )e);
1011 if (r < 0) return r;
1012
1013 e->name_len = (int )(name_end - name);
1014 e->back_num = 0;
1015 e->back_alloc = 0;
1016 e->back_refs = (int* )NULL;
1017
1018 #else
1019
1020 if (IS_NULL(t)) {
1021 alloc = INIT_NAMES_ALLOC_NUM;
1022 t = (NameTable* )xmalloc(sizeof(NameTable));
1023 CHECK_NULL_RETURN_MEMERR(t);
1024 t->e = NULL;
1025 t->alloc = 0;
1026 t->num = 0;
1027
1028 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
1029 if (IS_NULL(t->e)) {
1030 xfree(t);
1031 return ONIGERR_MEMORY;
1032 }
1033 t->alloc = alloc;
1034 reg->name_table = t;
1035 goto clear;
1036 }
1037 else if (t->num == t->alloc) {
1038 int i;
1039
1040 alloc = t->alloc * 2;
1041 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
1042 CHECK_NULL_RETURN_MEMERR(t->e);
1043 t->alloc = alloc;
1044
1045 clear:
1046 for (i = t->num; i < t->alloc; i++) {
1047 t->e[i].name = NULL;
1048 t->e[i].name_len = 0;
1049 t->e[i].back_num = 0;
1050 t->e[i].back_alloc = 0;
1051 t->e[i].back_refs = (int* )NULL;
1052 }
1053 }
1054 e = &(t->e[t->num]);
1055 t->num++;
1056 e->name = onigenc_strdup(reg->enc, name, name_end);
1057 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
1058 e->name_len = name_end - name;
1059 #endif
1060 }
1061
1062 if (e->back_num >= 1 &&
1063 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
1064 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
1065 name, name_end);
1066 return ONIGERR_MULTIPLEX_DEFINED_NAME;
1067 }
1068
1069 e->back_num++;
1070 if (e->back_num == 1) {
1071 e->back_ref1 = backref;
1072 }
1073 else {
1074 if (e->back_num == 2) {
1075 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
1076 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
1077 CHECK_NULL_RETURN_MEMERR(e->back_refs);
1078 e->back_alloc = alloc;
1079 e->back_refs[0] = e->back_ref1;
1080 e->back_refs[1] = backref;
1081 }
1082 else {
1083 if (e->back_num > e->back_alloc) {
1084 alloc = e->back_alloc * 2;
1085 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
1086 CHECK_NULL_RETURN_MEMERR(e->back_refs);
1087 e->back_alloc = alloc;
1088 }
1089 e->back_refs[e->back_num - 1] = backref;
1090 }
1091 }
1092
1093 return 0;
1094 }
1095
1096 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)1097 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
1098 const UChar* name_end, int** nums)
1099 {
1100 NameEntry* e = name_find(reg, name, name_end);
1101
1102 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
1103
1104 switch (e->back_num) {
1105 case 0:
1106 break;
1107 case 1:
1108 *nums = &(e->back_ref1);
1109 break;
1110 default:
1111 *nums = e->back_refs;
1112 break;
1113 }
1114 return e->back_num;
1115 }
1116
1117 static int
name_to_group_numbers(ScanEnv * env,const UChar * name,const UChar * name_end,int ** nums)1118 name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end,
1119 int** nums)
1120 {
1121 regex_t* reg;
1122 NameEntry* e;
1123
1124 reg = env->reg;
1125 e = name_find(reg, name, name_end);
1126
1127 if (IS_NULL(e)) {
1128 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
1129 (UChar* )name, (UChar* )name_end);
1130 return ONIGERR_UNDEFINED_NAME_REFERENCE;
1131 }
1132
1133 switch (e->back_num) {
1134 case 0:
1135 break;
1136 case 1:
1137 *nums = &(e->back_ref1);
1138 break;
1139 default:
1140 *nums = e->back_refs;
1141 break;
1142 }
1143 return e->back_num;
1144 }
1145
1146 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)1147 onig_name_to_backref_number(regex_t* reg, const UChar* name,
1148 const UChar* name_end, OnigRegion *region)
1149 {
1150 int i, n, *nums;
1151
1152 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
1153 if (n < 0)
1154 return n;
1155 else if (n == 0)
1156 return ONIGERR_PARSER_BUG;
1157 else if (n == 1)
1158 return nums[0];
1159 else {
1160 if (IS_NOT_NULL(region)) {
1161 for (i = n - 1; i >= 0; i--) {
1162 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
1163 return nums[i];
1164 }
1165 }
1166 return nums[n - 1];
1167 }
1168 }
1169
1170 extern int
onig_noname_group_capture_is_active(regex_t * reg)1171 onig_noname_group_capture_is_active(regex_t* reg)
1172 {
1173 if (OPTON_DONT_CAPTURE_GROUP(reg->options))
1174 return 0;
1175
1176 if (onig_number_of_names(reg) > 0 &&
1177 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
1178 ! OPTON_CAPTURE_GROUP(reg->options)) {
1179 return 0;
1180 }
1181
1182 return 1;
1183 }
1184
1185 #ifdef USE_CALLOUT
1186
/* One registered callout of name; filled in by onig_set_callout_of_name()
   and stored in GlobalCalloutNameList->v at the index given by the name id. */
typedef struct {
  OnigCalloutType type;
  int in; /* checked against ONIG_CALLOUT_IN_PROGRESS / ONIG_CALLOUT_IN_RETRACTION at registration */
  OnigCalloutFunc start_func;
  OnigCalloutFunc end_func;
  int arg_num; /* total argument count (required + optional) */
  int opt_arg_num; /* how many of the trailing arguments are optional */
  unsigned int arg_types[ONIG_CALLOUT_MAX_ARGS_NUM];
  /* Defaults are stored at the argument's own index, i.e. in the slots
     [arg_num - opt_arg_num, arg_num).  ONIG_TYPE_STRING defaults are
     private copies released by free_callout_func_list(). */
  OnigValue opt_defaults[ONIG_CALLOUT_MAX_ARGS_NUM];
  UChar* name; /* reference to GlobalCalloutNameTable entry: e->name */
} CalloutNameListEntry;
1198
/* Growable array of CalloutNameListEntry. */
typedef struct {
  int n; /* number of entries in use */
  int alloc; /* number of entries allocated in v */
  CalloutNameListEntry* v;
} CalloutNameListType;

/* Registry of callouts of name, indexed by name id; created on first use by
   onig_set_callout_of_name() and released by onig_global_callout_names_free(). */
static CalloutNameListType* GlobalCalloutNameList;
1206
1207 static int
make_callout_func_list(CalloutNameListType ** rs,int init_size)1208 make_callout_func_list(CalloutNameListType** rs, int init_size)
1209 {
1210 CalloutNameListType* s;
1211 CalloutNameListEntry* v;
1212
1213 *rs = 0;
1214
1215 s = xmalloc(sizeof(*s));
1216 if (IS_NULL(s)) return ONIGERR_MEMORY;
1217
1218 v = (CalloutNameListEntry* )xmalloc(sizeof(CalloutNameListEntry) * init_size);
1219 if (IS_NULL(v)) {
1220 xfree(s);
1221 return ONIGERR_MEMORY;
1222 }
1223
1224 s->n = 0;
1225 s->alloc = init_size;
1226 s->v = v;
1227
1228 *rs = s;
1229 return ONIG_NORMAL;
1230 }
1231
1232 static void
free_callout_func_list(CalloutNameListType * s)1233 free_callout_func_list(CalloutNameListType* s)
1234 {
1235 if (IS_NOT_NULL(s)) {
1236 if (IS_NOT_NULL(s->v)) {
1237 int i, j;
1238
1239 for (i = 0; i < s->n; i++) {
1240 CalloutNameListEntry* e = s->v + i;
1241 for (j = e->arg_num - e->opt_arg_num; j < e->arg_num; j++) {
1242 if (e->arg_types[j] == ONIG_TYPE_STRING) {
1243 UChar* p = e->opt_defaults[j].s.start;
1244 if (IS_NOT_NULL(p)) xfree(p);
1245 }
1246 }
1247 }
1248 xfree(s->v);
1249 }
1250 xfree(s);
1251 }
1252 }
1253
1254 static int
callout_func_list_add(CalloutNameListType * s,int * rid)1255 callout_func_list_add(CalloutNameListType* s, int* rid)
1256 {
1257 if (s->n >= s->alloc) {
1258 int new_size = s->alloc * 2;
1259 CalloutNameListEntry* nv = (CalloutNameListEntry* )
1260 xrealloc(s->v, sizeof(CalloutNameListEntry) * new_size);
1261 if (IS_NULL(nv)) return ONIGERR_MEMORY;
1262
1263 s->alloc = new_size;
1264 s->v = nv;
1265 }
1266
1267 *rid = s->n;
1268
1269 xmemset(&(s->v[s->n]), 0, sizeof(*(s->v)));
1270 s->n++;
1271 return ONIG_NORMAL;
1272 }
1273
1274
/* Maps a callout name to its numeric id; created by callout_name_entry(). */
typedef struct {
  UChar* name; /* private copy made with onigenc_strdup() */
  int name_len; /* byte length */
  int id; /* taken from CalloutNameIDCounter after it is incremented */
} CalloutNameEntry;
1280
#ifdef USE_ST_LIBRARY
/* With the st library the name table is an st hash table. */
typedef st_table CalloutNameTable;
#else
/* Fallback: growable array of entries, searched linearly. */
typedef struct {
  CalloutNameEntry* e;
  int num; /* entries in use */
  int alloc; /* entries allocated in e */
} CalloutNameTable;
#endif

/* Global name -> id table; created on first use by callout_name_entry(). */
static CalloutNameTable* GlobalCalloutNameTable;
/* Last id handed out; reset to 0 by global_callout_name_table_free(). */
static int CalloutNameIDCounter;
1293
1294 #ifdef USE_ST_LIBRARY
1295
1296 static int
i_free_callout_name_entry(st_callout_name_key * key,CalloutNameEntry * e,void * arg ARG_UNUSED)1297 i_free_callout_name_entry(st_callout_name_key* key, CalloutNameEntry* e,
1298 void* arg ARG_UNUSED)
1299 {
1300 if (IS_NOT_NULL(e)) {
1301 xfree(e->name);
1302 }
1303 /*xfree(key->s); */ /* is same as e->name */
1304 xfree(key);
1305 xfree(e);
1306 return ST_DELETE;
1307 }
1308
1309 static int
callout_name_table_clear(CalloutNameTable * t)1310 callout_name_table_clear(CalloutNameTable* t)
1311 {
1312 if (IS_NOT_NULL(t)) {
1313 onig_st_foreach(t, i_free_callout_name_entry, 0);
1314 }
1315 return 0;
1316 }
1317
1318 static int
global_callout_name_table_free(void)1319 global_callout_name_table_free(void)
1320 {
1321 if (IS_NOT_NULL(GlobalCalloutNameTable)) {
1322 int r = callout_name_table_clear(GlobalCalloutNameTable);
1323 if (r != 0) return r;
1324
1325 onig_st_free_table(GlobalCalloutNameTable);
1326 GlobalCalloutNameTable = 0;
1327 CalloutNameIDCounter = 0;
1328 }
1329
1330 return 0;
1331 }
1332
1333 static CalloutNameEntry*
callout_name_find(OnigEncoding enc,int is_not_single,const UChar * name,const UChar * name_end)1334 callout_name_find(OnigEncoding enc, int is_not_single,
1335 const UChar* name, const UChar* name_end)
1336 {
1337 int r;
1338 CalloutNameEntry* e;
1339 CalloutNameTable* t = GlobalCalloutNameTable;
1340
1341 e = (CalloutNameEntry* )NULL;
1342 if (IS_NOT_NULL(t)) {
1343 r = onig_st_lookup_callout_name_table(t, enc, is_not_single, name, name_end,
1344 (HashDataType* )((void* )(&e)));
1345 if (r == 0) { /* not found */
1346 if (enc != ONIG_ENCODING_ASCII &&
1347 ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) {
1348 enc = ONIG_ENCODING_ASCII;
1349 onig_st_lookup_callout_name_table(t, enc, is_not_single, name, name_end,
1350 (HashDataType* )((void* )(&e)));
1351 }
1352 }
1353 }
1354 return e;
1355 }
1356
1357 #else
1358
1359 static int
callout_name_table_clear(CalloutNameTable * t)1360 callout_name_table_clear(CalloutNameTable* t)
1361 {
1362 int i;
1363 CalloutNameEntry* e;
1364
1365 if (IS_NOT_NULL(t)) {
1366 for (i = 0; i < t->num; i++) {
1367 e = &(t->e[i]);
1368 if (IS_NOT_NULL(e->name)) {
1369 xfree(e->name);
1370 e->name = NULL;
1371 e->name_len = 0;
1372 e->id = 0;
1373 e->func = 0;
1374 }
1375 }
1376 if (IS_NOT_NULL(t->e)) {
1377 xfree(t->e);
1378 t->e = NULL;
1379 }
1380 t->num = 0;
1381 }
1382 return 0;
1383 }
1384
1385 static int
global_callout_name_table_free(void)1386 global_callout_name_table_free(void)
1387 {
1388 if (IS_NOT_NULL(GlobalCalloutNameTable)) {
1389 int r = callout_name_table_clear(GlobalCalloutNameTable);
1390 if (r != 0) return r;
1391
1392 xfree(GlobalCalloutNameTable);
1393 GlobalCalloutNameTable = 0;
1394 CalloutNameIDCounter = 0;
1395 }
1396 return 0;
1397 }
1398
1399 static CalloutNameEntry*
callout_name_find(UChar * name,UChar * name_end)1400 callout_name_find(UChar* name, UChar* name_end)
1401 {
1402 int i, len;
1403 CalloutNameEntry* e;
1404 CalloutNameTable* t = Calloutnames;
1405
1406 if (IS_NOT_NULL(t)) {
1407 len = name_end - name;
1408 for (i = 0; i < t->num; i++) {
1409 e = &(t->e[i]);
1410 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
1411 return e;
1412 }
1413 }
1414 return (CalloutNameEntry* )NULL;
1415 }
1416
1417 #endif
1418
1419 /* name string must be single byte char string. */
1420 static int
callout_name_entry(CalloutNameEntry ** rentry,OnigEncoding enc,int is_not_single,UChar * name,UChar * name_end)1421 callout_name_entry(CalloutNameEntry** rentry, OnigEncoding enc,
1422 int is_not_single, UChar* name, UChar* name_end)
1423 {
1424 int r;
1425 CalloutNameEntry* e;
1426 CalloutNameTable* t = GlobalCalloutNameTable;
1427
1428 *rentry = 0;
1429 if (name_end - name <= 0)
1430 return ONIGERR_INVALID_CALLOUT_NAME;
1431
1432 e = callout_name_find(enc, is_not_single, name, name_end);
1433 if (IS_NULL(e)) {
1434 #ifdef USE_ST_LIBRARY
1435 if (IS_NULL(t)) {
1436 t = onig_st_init_callout_name_table_with_size(INIT_NAMES_ALLOC_NUM);
1437 CHECK_NULL_RETURN_MEMERR(t);
1438 GlobalCalloutNameTable = t;
1439 }
1440 e = (CalloutNameEntry* )xmalloc(sizeof(CalloutNameEntry));
1441 CHECK_NULL_RETURN_MEMERR(e);
1442
1443 e->name = onigenc_strdup(enc, name, name_end);
1444 if (IS_NULL(e->name)) {
1445 xfree(e); return ONIGERR_MEMORY;
1446 }
1447
1448 r = st_insert_callout_name_table(t, enc, is_not_single,
1449 e->name, (e->name + (name_end - name)),
1450 (HashDataType )e);
1451 if (r < 0) return r;
1452
1453 #else
1454
1455 int alloc;
1456
1457 if (IS_NULL(t)) {
1458 alloc = INIT_NAMES_ALLOC_NUM;
1459 t = (CalloutNameTable* )xmalloc(sizeof(CalloutNameTable));
1460 CHECK_NULL_RETURN_MEMERR(t);
1461 t->e = NULL;
1462 t->alloc = 0;
1463 t->num = 0;
1464
1465 t->e = (CalloutNameEntry* )xmalloc(sizeof(CalloutNameEntry) * alloc);
1466 if (IS_NULL(t->e)) {
1467 xfree(t);
1468 return ONIGERR_MEMORY;
1469 }
1470 t->alloc = alloc;
1471 GlobalCalloutNameTable = t;
1472 goto clear;
1473 }
1474 else if (t->num == t->alloc) {
1475 int i;
1476
1477 alloc = t->alloc * 2;
1478 t->e = (CalloutNameEntry* )xrealloc(t->e, sizeof(CalloutNameEntry) * alloc);
1479 CHECK_NULL_RETURN_MEMERR(t->e);
1480 t->alloc = alloc;
1481
1482 clear:
1483 for (i = t->num; i < t->alloc; i++) {
1484 t->e[i].name = NULL;
1485 t->e[i].name_len = 0;
1486 t->e[i].id = 0;
1487 }
1488 }
1489 e = &(t->e[t->num]);
1490 t->num++;
1491 e->name = onigenc_strdup(enc, name, name_end);
1492 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
1493 #endif
1494
1495 CalloutNameIDCounter++;
1496 e->id = CalloutNameIDCounter;
1497 e->name_len = (int )(name_end - name);
1498 }
1499
1500 *rentry = e;
1501 return e->id;
1502 }
1503
1504 static int
is_allowed_callout_name(OnigEncoding enc,UChar * name,UChar * name_end)1505 is_allowed_callout_name(OnigEncoding enc, UChar* name, UChar* name_end)
1506 {
1507 UChar* p;
1508 OnigCodePoint c;
1509
1510 if (name >= name_end) return 0;
1511
1512 p = name;
1513 while (p < name_end) {
1514 c = ONIGENC_MBC_TO_CODE(enc, p, name_end);
1515 if (! IS_ALLOWED_CODE_IN_CALLOUT_NAME(c))
1516 return 0;
1517
1518 if (p == name) {
1519 if (c >= '0' && c <= '9') return 0;
1520 }
1521
1522 p += ONIGENC_MBC_ENC_LEN(enc, p);
1523 }
1524
1525 return 1;
1526 }
1527
1528 static int
is_allowed_callout_tag_name(OnigEncoding enc,UChar * name,UChar * name_end)1529 is_allowed_callout_tag_name(OnigEncoding enc, UChar* name, UChar* name_end)
1530 {
1531 UChar* p;
1532 OnigCodePoint c;
1533
1534 if (name >= name_end) return 0;
1535
1536 p = name;
1537 while (p < name_end) {
1538 c = ONIGENC_MBC_TO_CODE(enc, p, name_end);
1539 if (! IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c))
1540 return 0;
1541
1542 if (p == name) {
1543 if (c >= '0' && c <= '9') return 0;
1544 }
1545
1546 p += ONIGENC_MBC_ENC_LEN(enc, p);
1547 }
1548
1549 return 1;
1550 }
1551
1552 extern int
onig_set_callout_of_name(OnigEncoding enc,OnigCalloutType callout_type,UChar * name,UChar * name_end,int in,OnigCalloutFunc start_func,OnigCalloutFunc end_func,int arg_num,unsigned int arg_types[],int opt_arg_num,OnigValue opt_defaults[])1553 onig_set_callout_of_name(OnigEncoding enc, OnigCalloutType callout_type,
1554 UChar* name, UChar* name_end, int in,
1555 OnigCalloutFunc start_func,
1556 OnigCalloutFunc end_func,
1557 int arg_num, unsigned int arg_types[],
1558 int opt_arg_num, OnigValue opt_defaults[])
1559 {
1560 int r;
1561 int i;
1562 int j;
1563 int id;
1564 int is_not_single;
1565 CalloutNameEntry* e;
1566 CalloutNameListEntry* fe;
1567
1568 if (callout_type != ONIG_CALLOUT_TYPE_SINGLE)
1569 return ONIGERR_INVALID_ARGUMENT;
1570
1571 if (arg_num < 0 || arg_num > ONIG_CALLOUT_MAX_ARGS_NUM)
1572 return ONIGERR_INVALID_CALLOUT_ARG;
1573
1574 if (opt_arg_num < 0 || opt_arg_num > arg_num)
1575 return ONIGERR_INVALID_CALLOUT_ARG;
1576
1577 if (start_func == 0 && end_func == 0)
1578 return ONIGERR_INVALID_CALLOUT_ARG;
1579
1580 if ((in & ONIG_CALLOUT_IN_PROGRESS) == 0 && (in & ONIG_CALLOUT_IN_RETRACTION) == 0)
1581 return ONIGERR_INVALID_CALLOUT_ARG;
1582
1583 for (i = 0; i < arg_num; i++) {
1584 unsigned int t = arg_types[i];
1585 if (t == ONIG_TYPE_VOID)
1586 return ONIGERR_INVALID_CALLOUT_ARG;
1587 else {
1588 if (i >= arg_num - opt_arg_num) {
1589 if (t != ONIG_TYPE_LONG && t != ONIG_TYPE_CHAR && t != ONIG_TYPE_STRING &&
1590 t != ONIG_TYPE_TAG)
1591 return ONIGERR_INVALID_CALLOUT_ARG;
1592 }
1593 else {
1594 if (t != ONIG_TYPE_LONG) {
1595 t = t & ~ONIG_TYPE_LONG;
1596 if (t != ONIG_TYPE_CHAR && t != ONIG_TYPE_STRING && t != ONIG_TYPE_TAG)
1597 return ONIGERR_INVALID_CALLOUT_ARG;
1598 }
1599 }
1600 }
1601 }
1602
1603 if (! is_allowed_callout_name(enc, name, name_end)) {
1604 return ONIGERR_INVALID_CALLOUT_NAME;
1605 }
1606
1607 is_not_single = (callout_type != ONIG_CALLOUT_TYPE_SINGLE);
1608 id = callout_name_entry(&e, enc, is_not_single, name, name_end);
1609 if (id < 0) return id;
1610
1611 r = ONIG_NORMAL;
1612 if (IS_NULL(GlobalCalloutNameList)) {
1613 r = make_callout_func_list(&GlobalCalloutNameList, 10);
1614 if (r != ONIG_NORMAL) return r;
1615 }
1616
1617 while (id >= GlobalCalloutNameList->n) {
1618 int rid;
1619 r = callout_func_list_add(GlobalCalloutNameList, &rid);
1620 if (r != ONIG_NORMAL) return r;
1621 }
1622
1623 fe = GlobalCalloutNameList->v + id;
1624 fe->type = callout_type;
1625 fe->in = in;
1626 fe->start_func = start_func;
1627 fe->end_func = end_func;
1628 fe->arg_num = arg_num;
1629 fe->opt_arg_num = opt_arg_num;
1630 fe->name = e->name;
1631
1632 for (i = 0; i < arg_num; i++) {
1633 fe->arg_types[i] = arg_types[i];
1634 }
1635 for (i = arg_num - opt_arg_num, j = 0; i < arg_num; i++, j++) {
1636 if (IS_NULL(opt_defaults)) return ONIGERR_INVALID_ARGUMENT;
1637 if (fe->arg_types[i] == ONIG_TYPE_STRING) {
1638 OnigValue* val;
1639 UChar* ds;
1640
1641 val = opt_defaults + j;
1642 ds = onigenc_strdup(enc, val->s.start, val->s.end);
1643 CHECK_NULL_RETURN_MEMERR(ds);
1644
1645 fe->opt_defaults[i].s.start = ds;
1646 fe->opt_defaults[i].s.end = ds + (val->s.end - val->s.start);
1647 }
1648 else {
1649 fe->opt_defaults[i] = opt_defaults[j];
1650 }
1651 }
1652
1653 r = id;
1654 return r;
1655 }
1656
1657 static int
get_callout_name_id_by_name(OnigEncoding enc,int is_not_single,UChar * name,UChar * name_end,int * rid)1658 get_callout_name_id_by_name(OnigEncoding enc, int is_not_single,
1659 UChar* name, UChar* name_end, int* rid)
1660 {
1661 int r;
1662 CalloutNameEntry* e;
1663
1664 if (! is_allowed_callout_name(enc, name, name_end)) {
1665 return ONIGERR_INVALID_CALLOUT_NAME;
1666 }
1667
1668 e = callout_name_find(enc, is_not_single, name, name_end);
1669 if (IS_NULL(e)) {
1670 return ONIGERR_UNDEFINED_CALLOUT_NAME;
1671 }
1672
1673 r = ONIG_NORMAL;
1674 *rid = e->id;
1675
1676 return r;
1677 }
1678
1679 extern OnigCalloutFunc
onig_get_callout_start_func(regex_t * reg,int callout_num)1680 onig_get_callout_start_func(regex_t* reg, int callout_num)
1681 {
1682 /* If used for callouts of contents, return 0. */
1683 CalloutListEntry* e;
1684
1685 e = onig_reg_callout_list_at(reg, callout_num);
1686 CHECK_NULL_RETURN(e);
1687 return e->start_func;
1688 }
1689
1690 extern const UChar*
onig_get_callout_tag_start(regex_t * reg,int callout_num)1691 onig_get_callout_tag_start(regex_t* reg, int callout_num)
1692 {
1693 CalloutListEntry* e = onig_reg_callout_list_at(reg, callout_num);
1694 CHECK_NULL_RETURN(e);
1695 return e->tag_start;
1696 }
1697
1698 extern const UChar*
onig_get_callout_tag_end(regex_t * reg,int callout_num)1699 onig_get_callout_tag_end(regex_t* reg, int callout_num)
1700 {
1701 CalloutListEntry* e = onig_reg_callout_list_at(reg, callout_num);
1702 CHECK_NULL_RETURN(e);
1703 return e->tag_end;
1704 }
1705
1706
1707 extern OnigCalloutType
onig_get_callout_type_by_name_id(int name_id)1708 onig_get_callout_type_by_name_id(int name_id)
1709 {
1710 if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1711 return 0;
1712
1713 return GlobalCalloutNameList->v[name_id].type;
1714 }
1715
1716 extern OnigCalloutFunc
onig_get_callout_start_func_by_name_id(int name_id)1717 onig_get_callout_start_func_by_name_id(int name_id)
1718 {
1719 if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1720 return 0;
1721
1722 return GlobalCalloutNameList->v[name_id].start_func;
1723 }
1724
1725 extern OnigCalloutFunc
onig_get_callout_end_func_by_name_id(int name_id)1726 onig_get_callout_end_func_by_name_id(int name_id)
1727 {
1728 if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1729 return 0;
1730
1731 return GlobalCalloutNameList->v[name_id].end_func;
1732 }
1733
1734 extern int
onig_get_callout_in_by_name_id(int name_id)1735 onig_get_callout_in_by_name_id(int name_id)
1736 {
1737 if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1738 return 0;
1739
1740 return GlobalCalloutNameList->v[name_id].in;
1741 }
1742
1743 static int
get_callout_arg_num_by_name_id(int name_id)1744 get_callout_arg_num_by_name_id(int name_id)
1745 {
1746 return GlobalCalloutNameList->v[name_id].arg_num;
1747 }
1748
1749 static int
get_callout_opt_arg_num_by_name_id(int name_id)1750 get_callout_opt_arg_num_by_name_id(int name_id)
1751 {
1752 return GlobalCalloutNameList->v[name_id].opt_arg_num;
1753 }
1754
1755 static unsigned int
get_callout_arg_type_by_name_id(int name_id,int index)1756 get_callout_arg_type_by_name_id(int name_id, int index)
1757 {
1758 return GlobalCalloutNameList->v[name_id].arg_types[index];
1759 }
1760
1761 static OnigValue
get_callout_opt_default_by_name_id(int name_id,int index)1762 get_callout_opt_default_by_name_id(int name_id, int index)
1763 {
1764 return GlobalCalloutNameList->v[name_id].opt_defaults[index];
1765 }
1766
1767 extern UChar*
onig_get_callout_name_by_name_id(int name_id)1768 onig_get_callout_name_by_name_id(int name_id)
1769 {
1770 if (name_id < 0 || name_id >= GlobalCalloutNameList->n)
1771 return 0;
1772
1773 return GlobalCalloutNameList->v[name_id].name;
1774 }
1775
1776 extern int
onig_global_callout_names_free(void)1777 onig_global_callout_names_free(void)
1778 {
1779 free_callout_func_list(GlobalCalloutNameList);
1780 GlobalCalloutNameList = 0;
1781
1782 global_callout_name_table_free();
1783 return ONIG_NORMAL;
1784 }
1785
1786
/* Per-regex table mapping a callout tag name to a CalloutTagVal. */
typedef st_table CalloutTagTable;
/* Table value: passed to onig_reg_callout_list_at() as a callout number and
   used as (value - 1) to index callout_list in i_callout_callout_list_set(). */
typedef intptr_t CalloutTagVal;

/* Flag bit set on a callout list entry that has a tag. */
#define CALLOUT_TAG_LIST_FLAG_TAG_EXIST (1<<0)
1791
1792 static int
i_callout_callout_list_set(UChar * key,CalloutTagVal e,void * arg)1793 i_callout_callout_list_set(UChar* key, CalloutTagVal e, void* arg)
1794 {
1795 int num;
1796 RegexExt* ext = (RegexExt* )arg;
1797
1798 num = (int )e - 1;
1799 ext->callout_list[num].flag |= CALLOUT_TAG_LIST_FLAG_TAG_EXIST;
1800 return ST_CONTINUE;
1801 }
1802
1803 static int
setup_ext_callout_list_values(regex_t * reg)1804 setup_ext_callout_list_values(regex_t* reg)
1805 {
1806 int i, j;
1807 RegexExt* ext;
1808
1809 ext = reg->extp;
1810 if (IS_NOT_NULL(ext->tag_table)) {
1811 onig_st_foreach((CalloutTagTable *)ext->tag_table, i_callout_callout_list_set,
1812 (st_data_t )ext);
1813 }
1814
1815 for (i = 0; i < ext->callout_num; i++) {
1816 CalloutListEntry* e = ext->callout_list + i;
1817 if (e->of == ONIG_CALLOUT_OF_NAME) {
1818 for (j = 0; j < e->u.arg.num; j++) {
1819 if (e->u.arg.types[j] == ONIG_TYPE_TAG) {
1820 UChar* start;
1821 UChar* end;
1822 int num;
1823 start = e->u.arg.vals[j].s.start;
1824 end = e->u.arg.vals[j].s.end;
1825 num = onig_get_callout_num_by_tag(reg, start, end);
1826 if (num < 0) return num;
1827 e->u.arg.vals[j].tag = num;
1828 }
1829 }
1830 }
1831 }
1832
1833 return ONIG_NORMAL;
1834 }
1835
1836 extern int
onig_callout_tag_is_exist_at_callout_num(regex_t * reg,int callout_num)1837 onig_callout_tag_is_exist_at_callout_num(regex_t* reg, int callout_num)
1838 {
1839 RegexExt* ext = reg->extp;
1840
1841 if (IS_NULL(ext) || IS_NULL(ext->callout_list)) return 0;
1842 if (callout_num > ext->callout_num) return 0;
1843
1844 return (ext->callout_list[callout_num].flag &
1845 CALLOUT_TAG_LIST_FLAG_TAG_EXIST) != 0;
1846 }
1847
1848 static int
i_free_callout_tag_entry(UChar * key,CalloutTagVal e,void * arg ARG_UNUSED)1849 i_free_callout_tag_entry(UChar* key, CalloutTagVal e, void* arg ARG_UNUSED)
1850 {
1851 xfree(key);
1852 return ST_DELETE;
1853 }
1854
1855 static int
callout_tag_table_clear(CalloutTagTable * t)1856 callout_tag_table_clear(CalloutTagTable* t)
1857 {
1858 if (IS_NOT_NULL(t)) {
1859 onig_st_foreach(t, i_free_callout_tag_entry, 0);
1860 }
1861 return 0;
1862 }
1863
1864 extern int
onig_callout_tag_table_free(void * table)1865 onig_callout_tag_table_free(void* table)
1866 {
1867 CalloutTagTable* t = (CalloutTagTable* )table;
1868
1869 if (IS_NOT_NULL(t)) {
1870 int r = callout_tag_table_clear(t);
1871 if (r != 0) return r;
1872
1873 onig_st_free_table(t);
1874 }
1875
1876 return 0;
1877 }
1878
1879 extern int
onig_get_callout_num_by_tag(regex_t * reg,const UChar * tag,const UChar * tag_end)1880 onig_get_callout_num_by_tag(regex_t* reg,
1881 const UChar* tag, const UChar* tag_end)
1882 {
1883 int r;
1884 RegexExt* ext;
1885 CalloutTagVal e;
1886
1887 ext = reg->extp;
1888 if (IS_NULL(ext) || IS_NULL(ext->tag_table))
1889 return ONIGERR_INVALID_CALLOUT_TAG_NAME;
1890
1891 r = onig_st_lookup_strend(ext->tag_table, tag, tag_end,
1892 (HashDataType* )((void* )(&e)));
1893 if (r == 0) return ONIGERR_INVALID_CALLOUT_TAG_NAME;
1894 return (int )e;
1895 }
1896
1897 static CalloutTagVal
callout_tag_find(CalloutTagTable * t,const UChar * name,const UChar * name_end)1898 callout_tag_find(CalloutTagTable* t, const UChar* name, const UChar* name_end)
1899 {
1900 CalloutTagVal e;
1901
1902 e = -1;
1903 if (IS_NOT_NULL(t)) {
1904 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
1905 }
1906 return e;
1907 }
1908
1909 static int
callout_tag_table_new(CalloutTagTable ** rt)1910 callout_tag_table_new(CalloutTagTable** rt)
1911 {
1912 CalloutTagTable* t;
1913
1914 *rt = 0;
1915 t = onig_st_init_strend_table_with_size(INIT_TAG_NAMES_ALLOC_NUM);
1916 CHECK_NULL_RETURN_MEMERR(t);
1917
1918 *rt = t;
1919 return ONIG_NORMAL;
1920 }
1921
1922 static int
callout_tag_entry_raw(ScanEnv * env,CalloutTagTable * t,UChar * name,UChar * name_end,CalloutTagVal entry_val)1923 callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name,
1924 UChar* name_end, CalloutTagVal entry_val)
1925 {
1926 int r;
1927 CalloutTagVal val;
1928
1929 if (name_end - name <= 0)
1930 return ONIGERR_INVALID_CALLOUT_TAG_NAME;
1931
1932 val = callout_tag_find(t, name, name_end);
1933 if (val >= 0) {
1934 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
1935 name, name_end);
1936 return ONIGERR_MULTIPLEX_DEFINED_NAME;
1937 }
1938
1939 r = onig_st_insert_strend(t, name, name_end, (HashDataType )entry_val);
1940 if (r < 0) return r;
1941
1942 return ONIG_NORMAL;
1943 }
1944
1945 static int
ext_ensure_tag_table(regex_t * reg)1946 ext_ensure_tag_table(regex_t* reg)
1947 {
1948 int r;
1949 RegexExt* ext;
1950 CalloutTagTable* t;
1951
1952 ext = onig_get_regex_ext(reg);
1953 CHECK_NULL_RETURN_MEMERR(ext);
1954
1955 if (IS_NULL(ext->tag_table)) {
1956 r = callout_tag_table_new(&t);
1957 if (r != ONIG_NORMAL) return r;
1958
1959 ext->tag_table = t;
1960 }
1961
1962 return ONIG_NORMAL;
1963 }
1964
1965 static int
callout_tag_entry(ScanEnv * env,regex_t * reg,UChar * name,UChar * name_end,CalloutTagVal entry_val)1966 callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end,
1967 CalloutTagVal entry_val)
1968 {
1969 int r;
1970 RegexExt* ext;
1971 CalloutListEntry* e;
1972
1973 r = ext_ensure_tag_table(reg);
1974 if (r != ONIG_NORMAL) return r;
1975
1976 ext = onig_get_regex_ext(reg);
1977 CHECK_NULL_RETURN_MEMERR(ext);
1978 r = callout_tag_entry_raw(env, ext->tag_table, name, name_end, entry_val);
1979
1980 e = onig_reg_callout_list_at(reg, (int )entry_val);
1981 CHECK_NULL_RETURN_MEMERR(e);
1982 e->tag_start = name;
1983 e->tag_end = name_end;
1984
1985 return r;
1986 }
1987
1988 #endif /* USE_CALLOUT */
1989
1990
/* Number of MemEnv slots allocated the first time scan_env_add_mem_entry()
   switches from the static array to a heap array. */
#define INIT_SCANENV_MEMENV_ALLOC_SIZE 16
1992
1993 static void
scan_env_clear(ScanEnv * env)1994 scan_env_clear(ScanEnv* env)
1995 {
1996 MEM_STATUS_CLEAR(env->cap_history);
1997 MEM_STATUS_CLEAR(env->backtrack_mem);
1998 MEM_STATUS_CLEAR(env->backrefed_mem);
1999 env->error = (UChar* )NULL;
2000 env->error_end = (UChar* )NULL;
2001 env->num_call = 0;
2002
2003 #ifdef USE_CALL
2004 env->unset_addr_list = NULL;
2005 env->has_call_zero = 0;
2006 #endif
2007
2008 env->num_mem = 0;
2009 env->num_named = 0;
2010 env->mem_alloc = 0;
2011 env->mem_env_dynamic = (MemEnv* )NULL;
2012
2013 xmemset(env->mem_env_static, 0, sizeof(env->mem_env_static));
2014
2015 env->parse_depth = 0;
2016 #ifdef ONIG_DEBUG_PARSE
2017 env->max_parse_depth = 0;
2018 #endif
2019 env->backref_num = 0;
2020 env->keep_num = 0;
2021 env->id_num = 0;
2022 env->save_alloc_num = 0;
2023 env->saves = 0;
2024 }
2025
2026 static int
scan_env_add_mem_entry(ScanEnv * env)2027 scan_env_add_mem_entry(ScanEnv* env)
2028 {
2029 int i, need, alloc;
2030 MemEnv* p;
2031
2032 need = env->num_mem + 1;
2033 if (need > MaxCaptureNum && MaxCaptureNum != 0)
2034 return ONIGERR_TOO_MANY_CAPTURES;
2035
2036 if (need >= SCANENV_MEMENV_SIZE) {
2037 if (env->mem_alloc <= need) {
2038 if (IS_NULL(env->mem_env_dynamic)) {
2039 alloc = INIT_SCANENV_MEMENV_ALLOC_SIZE;
2040 p = (MemEnv* )xmalloc(sizeof(MemEnv) * alloc);
2041 CHECK_NULL_RETURN_MEMERR(p);
2042 xmemcpy(p, env->mem_env_static, sizeof(env->mem_env_static));
2043 }
2044 else {
2045 alloc = env->mem_alloc * 2;
2046 p = (MemEnv* )xrealloc(env->mem_env_dynamic, sizeof(MemEnv) * alloc);
2047 CHECK_NULL_RETURN_MEMERR(p);
2048 }
2049
2050 for (i = env->num_mem + 1; i < alloc; i++) {
2051 p[i].mem_node = NULL_NODE;
2052 p[i].empty_repeat_node = NULL_NODE;
2053 }
2054
2055 env->mem_env_dynamic = p;
2056 env->mem_alloc = alloc;
2057 }
2058 }
2059
2060 env->num_mem++;
2061 return env->num_mem;
2062 }
2063
2064 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)2065 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
2066 {
2067 if (env->num_mem >= num)
2068 SCANENV_MEMENV(env)[num].mem_node = node;
2069 else
2070 return ONIGERR_PARSER_BUG;
2071 return 0;
2072 }
2073
2074 static void
node_free_body(Node * node)2075 node_free_body(Node* node)
2076 {
2077 if (IS_NULL(node)) return ;
2078
2079 switch (NODE_TYPE(node)) {
2080 case NODE_STRING:
2081 if (STR_(node)->capacity != 0 &&
2082 IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != STR_(node)->buf) {
2083 xfree(STR_(node)->s);
2084 }
2085 break;
2086
2087 case NODE_LIST:
2088 case NODE_ALT:
2089 onig_node_free(NODE_CAR(node));
2090 node = NODE_CDR(node);
2091 while (IS_NOT_NULL(node)) {
2092 Node* next = NODE_CDR(node);
2093 onig_node_free(NODE_CAR(node));
2094 xfree(node);
2095 node = next;
2096 }
2097 break;
2098
2099 case NODE_CCLASS:
2100 {
2101 CClassNode* cc = CCLASS_(node);
2102
2103 if (cc->mbuf)
2104 bbuf_free(cc->mbuf);
2105 }
2106 break;
2107
2108 case NODE_BACKREF:
2109 if (IS_NOT_NULL(BACKREF_(node)->back_dynamic))
2110 xfree(BACKREF_(node)->back_dynamic);
2111 break;
2112
2113 case NODE_BAG:
2114 if (NODE_BODY(node))
2115 onig_node_free(NODE_BODY(node));
2116
2117 {
2118 BagNode* en = BAG_(node);
2119 if (en->type == BAG_IF_ELSE) {
2120 onig_node_free(en->te.Then);
2121 onig_node_free(en->te.Else);
2122 }
2123 }
2124 break;
2125
2126 case NODE_QUANT:
2127 if (NODE_BODY(node))
2128 onig_node_free(NODE_BODY(node));
2129 break;
2130
2131 case NODE_ANCHOR:
2132 if (NODE_BODY(node))
2133 onig_node_free(NODE_BODY(node));
2134 if (IS_NOT_NULL(ANCHOR_(node)->lead_node))
2135 onig_node_free(ANCHOR_(node)->lead_node);
2136 break;
2137
2138 case NODE_CTYPE:
2139 case NODE_CALL:
2140 case NODE_GIMMICK:
2141 break;
2142 }
2143 }
2144
2145 extern void
onig_node_free(Node * node)2146 onig_node_free(Node* node)
2147 {
2148 if (IS_NULL(node)) return ;
2149
2150 #ifdef DEBUG_NODE_FREE
2151 fprintf(stderr, "onig_node_free: %p\n", node);
2152 #endif
2153
2154 node_free_body(node);
2155 xfree(node);
2156 }
2157
2158 static void
cons_node_free_alone(Node * node)2159 cons_node_free_alone(Node* node)
2160 {
2161 NODE_CAR(node) = 0;
2162 NODE_CDR(node) = 0;
2163 onig_node_free(node);
2164 }
2165
2166 static Node*
node_new(void)2167 node_new(void)
2168 {
2169 Node* node;
2170
2171 node = (Node* )xmalloc(sizeof(Node));
2172 CHECK_NULL_RETURN(node);
2173 xmemset(node, 0, sizeof(*node));
2174
2175 #ifdef DEBUG_NODE_FREE
2176 fprintf(stderr, "node_new: %p\n", node);
2177 #endif
2178 return node;
2179 }
2180
2181 extern int
onig_node_copy(Node ** rcopy,Node * from)2182 onig_node_copy(Node** rcopy, Node* from)
2183 {
2184 int r;
2185 Node* copy;
2186
2187 *rcopy = NULL_NODE;
2188
2189 switch (NODE_TYPE(from)) {
2190 case NODE_LIST:
2191 case NODE_ALT:
2192 case NODE_ANCHOR:
2193 /* These node's link to other nodes are processed by caller. */
2194 break;
2195 case NODE_STRING:
2196 case NODE_CCLASS:
2197 case NODE_CTYPE:
2198 /* Fixed contents after copy. */
2199 break;
2200 default:
2201 /* Not supported yet. */
2202 return ONIGERR_TYPE_BUG;
2203 break;
2204 }
2205
2206 copy = node_new();
2207 CHECK_NULL_RETURN_MEMERR(copy);
2208 xmemcpy(copy, from, sizeof(*copy));
2209
2210 switch (NODE_TYPE(copy)) {
2211 case NODE_STRING:
2212 r = onig_node_str_set(copy, STR_(from)->s, STR_(from)->end, FALSE);
2213 if (r != 0) {
2214 err:
2215 onig_node_free(copy);
2216 return r;
2217 }
2218 break;
2219
2220 case NODE_CCLASS:
2221 {
2222 CClassNode *fcc, *tcc;
2223
2224 fcc = CCLASS_(from);
2225 tcc = CCLASS_(copy);
2226 if (IS_NOT_NULL(fcc->mbuf)) {
2227 r = bbuf_clone(&(tcc->mbuf), fcc->mbuf);
2228 if (r != 0) goto err;
2229 }
2230 }
2231 break;
2232
2233 default:
2234 break;
2235 }
2236
2237 *rcopy = copy;
2238 return ONIG_NORMAL;
2239 }
2240
2241
2242 static void
initialize_cclass(CClassNode * cc)2243 initialize_cclass(CClassNode* cc)
2244 {
2245 BITSET_CLEAR(cc->bs);
2246 cc->flags = 0;
2247 cc->mbuf = NULL;
2248 }
2249
2250 static Node*
node_new_cclass(void)2251 node_new_cclass(void)
2252 {
2253 Node* node = node_new();
2254 CHECK_NULL_RETURN(node);
2255
2256 NODE_SET_TYPE(node, NODE_CCLASS);
2257 initialize_cclass(CCLASS_(node));
2258 return node;
2259 }
2260
2261 static Node*
node_new_ctype(int type,int not,OnigOptionType options)2262 node_new_ctype(int type, int not, OnigOptionType options)
2263 {
2264 Node* node = node_new();
2265 CHECK_NULL_RETURN(node);
2266
2267 NODE_SET_TYPE(node, NODE_CTYPE);
2268 CTYPE_(node)->ctype = type;
2269 CTYPE_(node)->not = not;
2270 CTYPE_(node)->ascii_mode = OPTON_IS_ASCII_MODE_CTYPE(type, options);
2271 return node;
2272 }
2273
2274 static Node*
node_new_anychar(OnigOptionType options)2275 node_new_anychar(OnigOptionType options)
2276 {
2277 Node* node;
2278
2279 node = node_new_ctype(CTYPE_ANYCHAR, FALSE, options);
2280 CHECK_NULL_RETURN(node);
2281
2282 if (OPTON_MULTILINE(options))
2283 NODE_STATUS_ADD(node, MULTILINE);
2284 return node;
2285 }
2286
2287 static int
node_new_no_newline(Node ** node,ScanEnv * env)2288 node_new_no_newline(Node** node, ScanEnv* env)
2289 {
2290 Node* n;
2291
2292 n = node_new_anychar(ONIG_OPTION_NONE);
2293 CHECK_NULL_RETURN_MEMERR(n);
2294 *node = n;
2295 return 0;
2296 }
2297
2298 static int
node_new_true_anychar(Node ** node)2299 node_new_true_anychar(Node** node)
2300 {
2301 Node* n;
2302
2303 n = node_new_anychar(ONIG_OPTION_MULTILINE);
2304 CHECK_NULL_RETURN_MEMERR(n);
2305 *node = n;
2306 return 0;
2307 }
2308
2309 static Node*
node_new_list(Node * left,Node * right)2310 node_new_list(Node* left, Node* right)
2311 {
2312 Node* node = node_new();
2313 CHECK_NULL_RETURN(node);
2314
2315 NODE_SET_TYPE(node, NODE_LIST);
2316 NODE_CAR(node) = left;
2317 NODE_CDR(node) = right;
2318 return node;
2319 }
2320
2321 extern Node*
onig_node_new_list(Node * left,Node * right)2322 onig_node_new_list(Node* left, Node* right)
2323 {
2324 return node_new_list(left, right);
2325 }
2326
2327 extern Node*
onig_node_new_alt(Node * left,Node * right)2328 onig_node_new_alt(Node* left, Node* right)
2329 {
2330 Node* node = node_new();
2331 CHECK_NULL_RETURN(node);
2332
2333 NODE_SET_TYPE(node, NODE_ALT);
2334 NODE_CAR(node) = left;
2335 NODE_CDR(node) = right;
2336 return node;
2337 }
2338
2339 static Node*
make_list_or_alt(NodeType type,int n,Node * ns[])2340 make_list_or_alt(NodeType type, int n, Node* ns[])
2341 {
2342 Node* r;
2343
2344 if (n <= 0) return NULL_NODE;
2345
2346 if (n == 1) {
2347 r = node_new();
2348 CHECK_NULL_RETURN(r);
2349 NODE_SET_TYPE(r, type);
2350 NODE_CAR(r) = ns[0];
2351 NODE_CDR(r) = NULL_NODE;
2352 }
2353 else {
2354 Node* right;
2355
2356 r = node_new();
2357 CHECK_NULL_RETURN(r);
2358
2359 right = make_list_or_alt(type, n - 1, ns + 1);
2360 if (IS_NULL(right)) {
2361 onig_node_free(r);
2362 return NULL_NODE;
2363 }
2364
2365 NODE_SET_TYPE(r, type);
2366 NODE_CAR(r) = ns[0];
2367 NODE_CDR(r) = right;
2368 }
2369
2370 return r;
2371 }
2372
2373 static Node*
make_list(int n,Node * ns[])2374 make_list(int n, Node* ns[])
2375 {
2376 return make_list_or_alt(NODE_LIST, n, ns);
2377 }
2378
2379 static Node*
make_alt(int n,Node * ns[])2380 make_alt(int n, Node* ns[])
2381 {
2382 return make_list_or_alt(NODE_ALT, n, ns);
2383 }
2384
2385 static Node*
node_new_anchor(int type)2386 node_new_anchor(int type)
2387 {
2388 Node* node;
2389
2390 node = node_new();
2391 CHECK_NULL_RETURN(node);
2392
2393 NODE_SET_TYPE(node, NODE_ANCHOR);
2394 ANCHOR_(node)->type = type;
2395 ANCHOR_(node)->char_min_len = 0;
2396 ANCHOR_(node)->char_max_len = INFINITE_LEN;
2397 ANCHOR_(node)->ascii_mode = 0;
2398 ANCHOR_(node)->lead_node = NULL_NODE;
2399 return node;
2400 }
2401
2402 static Node*
node_new_anchor_with_options(int type,OnigOptionType options)2403 node_new_anchor_with_options(int type, OnigOptionType options)
2404 {
2405 int ascii_mode;
2406 Node* node;
2407
2408 node = node_new_anchor(type);
2409 CHECK_NULL_RETURN(node);
2410
2411 ascii_mode = OPTON_WORD_ASCII(options) && IS_WORD_ANCHOR_TYPE(type) ? 1 : 0;
2412 ANCHOR_(node)->ascii_mode = ascii_mode;
2413
2414 if (type == ANCR_TEXT_SEGMENT_BOUNDARY ||
2415 type == ANCR_NO_TEXT_SEGMENT_BOUNDARY) {
2416 if (OPTON_TEXT_SEGMENT_WORD(options))
2417 NODE_STATUS_ADD(node, TEXT_SEGMENT_WORD);
2418 }
2419
2420 return node;
2421 }
2422
2423 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)2424 node_new_backref(int back_num, int* backrefs, int by_name,
2425 #ifdef USE_BACKREF_WITH_LEVEL
2426 int exist_level, int nest_level,
2427 #endif
2428 ScanEnv* env)
2429 {
2430 int i;
2431 Node* node;
2432
2433 node = node_new();
2434 CHECK_NULL_RETURN(node);
2435
2436 NODE_SET_TYPE(node, NODE_BACKREF);
2437 BACKREF_(node)->back_num = back_num;
2438 BACKREF_(node)->back_dynamic = (int* )NULL;
2439 if (by_name != 0)
2440 NODE_STATUS_ADD(node, BY_NAME);
2441
2442 if (OPTON_IGNORECASE(env->options))
2443 NODE_STATUS_ADD(node, IGNORECASE);
2444
2445 #ifdef USE_BACKREF_WITH_LEVEL
2446 if (exist_level != 0) {
2447 NODE_STATUS_ADD(node, NEST_LEVEL);
2448 BACKREF_(node)->nest_level = nest_level;
2449 }
2450 #endif
2451
2452 for (i = 0; i < back_num; i++) {
2453 if (backrefs[i] <= env->num_mem &&
2454 IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) {
2455 NODE_STATUS_ADD(node, RECURSION); /* /...(\1).../ */
2456 break;
2457 }
2458 }
2459
2460 if (back_num <= NODE_BACKREFS_SIZE) {
2461 for (i = 0; i < back_num; i++)
2462 BACKREF_(node)->back_static[i] = backrefs[i];
2463 }
2464 else {
2465 int* p = (int* )xmalloc(sizeof(int) * back_num);
2466 if (IS_NULL(p)) {
2467 onig_node_free(node);
2468 return NULL;
2469 }
2470 BACKREF_(node)->back_dynamic = p;
2471 for (i = 0; i < back_num; i++)
2472 p[i] = backrefs[i];
2473 }
2474
2475 env->backref_num++;
2476 return node;
2477 }
2478
2479 static Node*
node_new_backref_checker(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)2480 node_new_backref_checker(int back_num, int* backrefs, int by_name,
2481 #ifdef USE_BACKREF_WITH_LEVEL
2482 int exist_level, int nest_level,
2483 #endif
2484 ScanEnv* env)
2485 {
2486 Node* node;
2487
2488 node = node_new_backref(back_num, backrefs, by_name,
2489 #ifdef USE_BACKREF_WITH_LEVEL
2490 exist_level, nest_level,
2491 #endif
2492 env);
2493 CHECK_NULL_RETURN(node);
2494
2495 NODE_STATUS_ADD(node, CHECKER);
2496 return node;
2497 }
2498
2499 #ifdef USE_CALL
2500 static Node*
node_new_call(UChar * name,UChar * name_end,int gnum,int by_number)2501 node_new_call(UChar* name, UChar* name_end, int gnum, int by_number)
2502 {
2503 Node* node = node_new();
2504 CHECK_NULL_RETURN(node);
2505
2506 NODE_SET_TYPE(node, NODE_CALL);
2507 CALL_(node)->by_number = by_number;
2508 CALL_(node)->name = name;
2509 CALL_(node)->name_end = name_end;
2510 CALL_(node)->called_gnum = gnum;
2511 CALL_(node)->entry_count = 1;
2512 return node;
2513 }
2514 #endif
2515
2516 static Node*
node_new_quantifier(int lower,int upper,int by_number)2517 node_new_quantifier(int lower, int upper, int by_number)
2518 {
2519 Node* node = node_new();
2520 CHECK_NULL_RETURN(node);
2521
2522 NODE_SET_TYPE(node, NODE_QUANT);
2523 QUANT_(node)->lower = lower;
2524 QUANT_(node)->upper = upper;
2525 QUANT_(node)->greedy = 1;
2526 QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY;
2527 QUANT_(node)->head_exact = NULL_NODE;
2528 QUANT_(node)->next_head_exact = NULL_NODE;
2529 QUANT_(node)->include_referred = 0;
2530 if (by_number != 0)
2531 NODE_STATUS_ADD(node, BY_NUMBER);
2532
2533 return node;
2534 }
2535
2536 static Node*
node_new_bag(enum BagType type)2537 node_new_bag(enum BagType type)
2538 {
2539 Node* node = node_new();
2540 CHECK_NULL_RETURN(node);
2541
2542 NODE_SET_TYPE(node, NODE_BAG);
2543 BAG_(node)->type = type;
2544
2545 switch (type) {
2546 case BAG_MEMORY:
2547 BAG_(node)->m.regnum = 0;
2548 BAG_(node)->m.called_addr = -1;
2549 BAG_(node)->m.entry_count = 1;
2550 BAG_(node)->m.called_state = 0;
2551 break;
2552
2553 case BAG_OPTION:
2554 BAG_(node)->o.options = 0;
2555 break;
2556
2557 case BAG_STOP_BACKTRACK:
2558 break;
2559
2560 case BAG_IF_ELSE:
2561 BAG_(node)->te.Then = 0;
2562 BAG_(node)->te.Else = 0;
2563 break;
2564 }
2565
2566 BAG_(node)->opt_count = 0;
2567 return node;
2568 }
2569
2570 extern Node*
onig_node_new_bag(enum BagType type)2571 onig_node_new_bag(enum BagType type)
2572 {
2573 return node_new_bag(type);
2574 }
2575
2576 static Node*
node_new_bag_if_else(Node * cond,Node * Then,Node * Else)2577 node_new_bag_if_else(Node* cond, Node* Then, Node* Else)
2578 {
2579 Node* n;
2580 n = node_new_bag(BAG_IF_ELSE);
2581 CHECK_NULL_RETURN(n);
2582
2583 NODE_BODY(n) = cond;
2584 BAG_(n)->te.Then = Then;
2585 BAG_(n)->te.Else = Else;
2586 return n;
2587 }
2588
2589 static Node*
node_new_memory(int is_named)2590 node_new_memory(int is_named)
2591 {
2592 Node* node = node_new_bag(BAG_MEMORY);
2593 CHECK_NULL_RETURN(node);
2594 if (is_named != 0)
2595 NODE_STATUS_ADD(node, NAMED_GROUP);
2596
2597 return node;
2598 }
2599
2600 static Node*
node_new_option(OnigOptionType option)2601 node_new_option(OnigOptionType option)
2602 {
2603 Node* node = node_new_bag(BAG_OPTION);
2604 CHECK_NULL_RETURN(node);
2605 BAG_(node)->o.options = option;
2606 return node;
2607 }
2608
2609 static Node*
node_new_group(Node * content)2610 node_new_group(Node* content)
2611 {
2612 Node* node;
2613
2614 node = node_new();
2615 CHECK_NULL_RETURN(node);
2616 NODE_SET_TYPE(node, NODE_LIST);
2617 NODE_CAR(node) = content;
2618 NODE_CDR(node) = NULL_NODE;
2619
2620 return node;
2621 }
2622
2623 static Node*
node_drop_group(Node * group)2624 node_drop_group(Node* group)
2625 {
2626 Node* content;
2627
2628 content = NODE_CAR(group);
2629 NODE_CAR(group) = NULL_NODE;
2630 onig_node_free(group);
2631 return content;
2632 }
2633
2634 static int
node_set_fail(Node * node)2635 node_set_fail(Node* node)
2636 {
2637 NODE_SET_TYPE(node, NODE_GIMMICK);
2638 GIMMICK_(node)->type = GIMMICK_FAIL;
2639 return ONIG_NORMAL;
2640 }
2641
2642 static int
node_new_fail(Node ** node,ScanEnv * env)2643 node_new_fail(Node** node, ScanEnv* env)
2644 {
2645 *node = node_new();
2646 CHECK_NULL_RETURN_MEMERR(*node);
2647
2648 return node_set_fail(*node);
2649 }
2650
2651 extern int
onig_node_reset_fail(Node * node)2652 onig_node_reset_fail(Node* node)
2653 {
2654 node_free_body(node);
2655 return node_set_fail(node);
2656 }
2657
2658 static int
node_new_save_gimmick(Node ** node,enum SaveType save_type,ScanEnv * env)2659 node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env)
2660 {
2661 int id;
2662
2663 ID_ENTRY(env, id);
2664
2665 *node = node_new();
2666 CHECK_NULL_RETURN_MEMERR(*node);
2667
2668 NODE_SET_TYPE(*node, NODE_GIMMICK);
2669 GIMMICK_(*node)->id = id;
2670 GIMMICK_(*node)->type = GIMMICK_SAVE;
2671 GIMMICK_(*node)->detail_type = (int )save_type;
2672
2673 return ONIG_NORMAL;
2674 }
2675
2676 static int
node_new_update_var_gimmick(Node ** node,enum UpdateVarType update_var_type,int id,ScanEnv * env)2677 node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type,
2678 int id, ScanEnv* env)
2679 {
2680 *node = node_new();
2681 CHECK_NULL_RETURN_MEMERR(*node);
2682
2683 NODE_SET_TYPE(*node, NODE_GIMMICK);
2684 GIMMICK_(*node)->id = id;
2685 GIMMICK_(*node)->type = GIMMICK_UPDATE_VAR;
2686 GIMMICK_(*node)->detail_type = (int )update_var_type;
2687
2688 return ONIG_NORMAL;
2689 }
2690
2691 static int
node_new_keep(Node ** node,ScanEnv * env)2692 node_new_keep(Node** node, ScanEnv* env)
2693 {
2694 int r;
2695
2696 r = node_new_save_gimmick(node, SAVE_KEEP, env);
2697 if (r != 0) return r;
2698
2699 env->keep_num++;
2700 return ONIG_NORMAL;
2701 }
2702
2703 #ifdef USE_CALLOUT
2704
2705 extern void
onig_free_reg_callout_list(int n,CalloutListEntry * list)2706 onig_free_reg_callout_list(int n, CalloutListEntry* list)
2707 {
2708 int i;
2709 int j;
2710
2711 if (IS_NULL(list)) return ;
2712
2713 for (i = 0; i < n; i++) {
2714 if (list[i].of == ONIG_CALLOUT_OF_NAME) {
2715 for (j = 0; j < list[i].u.arg.passed_num; j++) {
2716 if (list[i].u.arg.types[j] == ONIG_TYPE_STRING) {
2717 if (IS_NOT_NULL(list[i].u.arg.vals[j].s.start))
2718 xfree(list[i].u.arg.vals[j].s.start);
2719 }
2720 }
2721 }
2722 else { /* ONIG_CALLOUT_OF_CONTENTS */
2723 if (IS_NOT_NULL(list[i].u.content.start)) {
2724 xfree((void* )list[i].u.content.start);
2725 }
2726 }
2727 }
2728
2729 xfree(list);
2730 }
2731
2732 extern CalloutListEntry*
onig_reg_callout_list_at(regex_t * reg,int num)2733 onig_reg_callout_list_at(regex_t* reg, int num)
2734 {
2735 RegexExt* ext = reg->extp;
2736 CHECK_NULL_RETURN(ext);
2737
2738 if (num <= 0 || num > ext->callout_num)
2739 return 0;
2740
2741 num--;
2742 return ext->callout_list + num;
2743 }
2744
2745 static int
reg_callout_list_entry(ScanEnv * env,int * rnum)2746 reg_callout_list_entry(ScanEnv* env, int* rnum)
2747 {
2748 #define INIT_CALLOUT_LIST_NUM 3
2749
2750 int num;
2751 CalloutListEntry* list;
2752 CalloutListEntry* e;
2753 RegexExt* ext;
2754
2755 ext = onig_get_regex_ext(env->reg);
2756 CHECK_NULL_RETURN_MEMERR(ext);
2757
2758 if (IS_NULL(ext->callout_list)) {
2759 list = (CalloutListEntry* )xmalloc(sizeof(*list) * INIT_CALLOUT_LIST_NUM);
2760 CHECK_NULL_RETURN_MEMERR(list);
2761
2762 ext->callout_list = list;
2763 ext->callout_list_alloc = INIT_CALLOUT_LIST_NUM;
2764 ext->callout_num = 0;
2765 }
2766
2767 num = ext->callout_num + 1;
2768 if (num > ext->callout_list_alloc) {
2769 int alloc = ext->callout_list_alloc * 2;
2770 list = (CalloutListEntry* )xrealloc(ext->callout_list,
2771 sizeof(CalloutListEntry) * alloc);
2772 CHECK_NULL_RETURN_MEMERR(list);
2773
2774 ext->callout_list = list;
2775 ext->callout_list_alloc = alloc;
2776 }
2777
2778 e = ext->callout_list + (num - 1);
2779
2780 e->flag = 0;
2781 e->of = 0;
2782 e->in = ONIG_CALLOUT_OF_CONTENTS;
2783 e->type = 0;
2784 e->tag_start = 0;
2785 e->tag_end = 0;
2786 e->start_func = 0;
2787 e->end_func = 0;
2788 e->u.arg.num = 0;
2789 e->u.arg.passed_num = 0;
2790
2791 ext->callout_num = num;
2792 *rnum = num;
2793 return ONIG_NORMAL;
2794 }
2795
2796 static int
node_new_callout(Node ** node,OnigCalloutOf callout_of,int num,int id,ScanEnv * env)2797 node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id,
2798 ScanEnv* env)
2799 {
2800 *node = node_new();
2801 CHECK_NULL_RETURN_MEMERR(*node);
2802
2803 NODE_SET_TYPE(*node, NODE_GIMMICK);
2804 GIMMICK_(*node)->id = id;
2805 GIMMICK_(*node)->num = num;
2806 GIMMICK_(*node)->type = GIMMICK_CALLOUT;
2807 GIMMICK_(*node)->detail_type = (int )callout_of;
2808
2809 return ONIG_NORMAL;
2810 }
2811 #endif
2812
2813 static int
make_text_segment(Node ** node,ScanEnv * env)2814 make_text_segment(Node** node, ScanEnv* env)
2815 {
2816 int r;
2817 int i;
2818 Node* x;
2819 Node* ns[2];
2820
2821 /* \X == (?>\O(?:\Y\O)*) */
2822
2823 ns[1] = NULL_NODE;
2824
2825 r = ONIGERR_MEMORY;
2826 ns[0] = node_new_anchor_with_options(ANCR_NO_TEXT_SEGMENT_BOUNDARY, env->options);
2827 if (IS_NULL(ns[0])) goto err;
2828
2829 r = node_new_true_anychar(&ns[1]);
2830 if (r != 0) goto err1;
2831
2832 x = make_list(2, ns);
2833 if (IS_NULL(x)) goto err;
2834 ns[0] = x;
2835 ns[1] = NULL_NODE;
2836
2837 x = node_new_quantifier(0, INFINITE_REPEAT, TRUE);
2838 if (IS_NULL(x)) goto err;
2839
2840 NODE_BODY(x) = ns[0];
2841 ns[0] = NULL_NODE;
2842 ns[1] = x;
2843
2844 r = node_new_true_anychar(&ns[0]);
2845 if (r != 0) goto err1;
2846
2847 x = make_list(2, ns);
2848 if (IS_NULL(x)) goto err;
2849
2850 ns[0] = x;
2851 ns[1] = NULL_NODE;
2852
2853 x = node_new_bag(BAG_STOP_BACKTRACK);
2854 if (IS_NULL(x)) goto err;
2855
2856 NODE_BODY(x) = ns[0];
2857
2858 *node = x;
2859 return ONIG_NORMAL;
2860
2861 err:
2862 r = ONIGERR_MEMORY;
2863 err1:
2864 for (i = 0; i < 2; i++) onig_node_free(ns[i]);
2865 return r;
2866 }
2867
2868 static int
make_absent_engine(Node ** node,int pre_save_right_id,Node * absent,Node * step_one,int lower,int upper,int possessive,int is_range_cutter,ScanEnv * env)2869 make_absent_engine(Node** node, int pre_save_right_id, Node* absent,
2870 Node* step_one, int lower, int upper, int possessive,
2871 int is_range_cutter, ScanEnv* env)
2872 {
2873 int r;
2874 int i;
2875 int id;
2876 Node* x;
2877 Node* ns[4];
2878
2879 for (i = 0; i < 4; i++) ns[i] = NULL_NODE;
2880
2881 ns[1] = absent;
2882 ns[3] = step_one; /* for err */
2883 r = node_new_save_gimmick(&ns[0], SAVE_S, env);
2884 if (r != 0) goto err;
2885
2886 id = GIMMICK_(ns[0])->id;
2887 r = node_new_update_var_gimmick(&ns[2], UPDATE_VAR_RIGHT_RANGE_FROM_S_STACK,
2888 id, env);
2889 if (r != 0) goto err;
2890
2891 if (is_range_cutter != 0)
2892 NODE_STATUS_ADD(ns[2], ABSENT_WITH_SIDE_EFFECTS);
2893
2894 r = node_new_fail(&ns[3], env);
2895 if (r != 0) goto err;
2896
2897 x = make_list(4, ns);
2898 if (IS_NULL(x)) goto err0;
2899
2900 ns[0] = x;
2901 ns[1] = step_one;
2902 ns[2] = ns[3] = NULL_NODE;
2903
2904 x = make_alt(2, ns);
2905 if (IS_NULL(x)) goto err0;
2906
2907 ns[0] = x;
2908
2909 x = node_new_quantifier(lower, upper, FALSE);
2910 if (IS_NULL(x)) goto err0;
2911
2912 NODE_BODY(x) = ns[0];
2913 ns[0] = x;
2914
2915 if (possessive != 0) {
2916 x = node_new_bag(BAG_STOP_BACKTRACK);
2917 if (IS_NULL(x)) goto err0;
2918
2919 NODE_BODY(x) = ns[0];
2920 ns[0] = x;
2921 }
2922
2923 r = node_new_update_var_gimmick(&ns[1], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2924 pre_save_right_id, env);
2925 if (r != 0) goto err;
2926
2927 r = node_new_fail(&ns[2], env);
2928 if (r != 0) goto err;
2929
2930 x = make_list(2, ns + 1);
2931 if (IS_NULL(x)) goto err0;
2932
2933 ns[1] = x; ns[2] = NULL_NODE;
2934
2935 x = make_alt(2, ns);
2936 if (IS_NULL(x)) goto err0;
2937
2938 if (is_range_cutter != FALSE)
2939 NODE_STATUS_ADD(x, SUPER);
2940
2941 *node = x;
2942 return ONIG_NORMAL;
2943
2944 err0:
2945 r = ONIGERR_MEMORY;
2946 err:
2947 for (i = 0; i < 4; i++) onig_node_free(ns[i]);
2948 return r;
2949 }
2950
2951 static int
make_absent_tail(Node ** node1,Node ** node2,int pre_save_right_id,ScanEnv * env)2952 make_absent_tail(Node** node1, Node** node2, int pre_save_right_id,
2953 ScanEnv* env)
2954 {
2955 int r;
2956 int id;
2957 Node* save;
2958 Node* x;
2959 Node* ns[2];
2960
2961 *node1 = *node2 = NULL_NODE;
2962 save = ns[0] = ns[1] = NULL_NODE;
2963
2964 r = node_new_save_gimmick(&save, SAVE_RIGHT_RANGE, env);
2965 if (r != 0) goto err;
2966
2967 id = GIMMICK_(save)->id;
2968 r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2969 id, env);
2970 if (r != 0) goto err;
2971
2972 r = node_new_fail(&ns[1], env);
2973 if (r != 0) goto err;
2974
2975 x = make_list(2, ns);
2976 if (IS_NULL(x)) goto err0;
2977
2978 ns[0] = NULL_NODE; ns[1] = x;
2979
2980 r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
2981 pre_save_right_id, env);
2982 if (r != 0) goto err;
2983
2984 x = make_alt(2, ns);
2985 if (IS_NULL(x)) goto err0;
2986
2987 *node1 = save;
2988 *node2 = x;
2989 return ONIG_NORMAL;
2990
2991 err0:
2992 r = ONIGERR_MEMORY;
2993 err:
2994 onig_node_free(save);
2995 onig_node_free(ns[0]);
2996 onig_node_free(ns[1]);
2997 return r;
2998 }
2999
3000 static int
make_range_clear(Node ** node,ScanEnv * env)3001 make_range_clear(Node** node, ScanEnv* env)
3002 {
3003 int r;
3004 int id;
3005 Node* save;
3006 Node* x;
3007 Node* ns[2];
3008
3009 *node = NULL_NODE;
3010 save = ns[0] = ns[1] = NULL_NODE;
3011
3012 r = node_new_save_gimmick(&save, SAVE_RIGHT_RANGE, env);
3013 if (r != 0) goto err;
3014
3015 id = GIMMICK_(save)->id;
3016 r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
3017 id, env);
3018 if (r != 0) goto err;
3019
3020 r = node_new_fail(&ns[1], env);
3021 if (r != 0) goto err;
3022
3023 x = make_list(2, ns);
3024 if (IS_NULL(x)) goto err0;
3025
3026 ns[0] = NULL_NODE; ns[1] = x;
3027
3028 #define ID_NOT_USED_DONT_CARE_ME 0
3029
3030 r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT,
3031 ID_NOT_USED_DONT_CARE_ME, env);
3032 if (r != 0) goto err;
3033 NODE_STATUS_ADD(ns[0], ABSENT_WITH_SIDE_EFFECTS);
3034
3035 x = make_alt(2, ns);
3036 if (IS_NULL(x)) goto err0;
3037
3038 NODE_STATUS_ADD(x, SUPER);
3039
3040 ns[0] = save;
3041 ns[1] = x;
3042 save = NULL_NODE;
3043 x = make_list(2, ns);
3044 if (IS_NULL(x)) goto err0;
3045
3046 *node = x;
3047 return ONIG_NORMAL;
3048
3049 err0:
3050 r = ONIGERR_MEMORY;
3051 err:
3052 onig_node_free(save);
3053 onig_node_free(ns[0]);
3054 onig_node_free(ns[1]);
3055 return r;
3056 }
3057
3058 static int
is_simple_one_char_repeat(Node * node,Node ** rquant,Node ** rbody,int * is_possessive,ScanEnv * env)3059 is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody,
3060 int* is_possessive, ScanEnv* env)
3061 {
3062 Node* quant;
3063 Node* body;
3064
3065 *rquant = *rbody = 0;
3066 *is_possessive = 0;
3067
3068 if (NODE_TYPE(node) == NODE_QUANT) {
3069 quant = node;
3070 }
3071 else {
3072 if (NODE_TYPE(node) == NODE_BAG) {
3073 BagNode* en = BAG_(node);
3074 if (en->type == BAG_STOP_BACKTRACK) {
3075 *is_possessive = 1;
3076 quant = NODE_BAG_BODY(en);
3077 if (NODE_TYPE(quant) != NODE_QUANT)
3078 return 0;
3079 }
3080 else
3081 return 0;
3082 }
3083 else
3084 return 0;
3085 }
3086
3087 if (QUANT_(quant)->greedy == 0)
3088 return 0;
3089
3090 body = NODE_BODY(quant);
3091 switch (NODE_TYPE(body)) {
3092 case NODE_STRING:
3093 {
3094 int len;
3095 StrNode* sn = STR_(body);
3096 UChar *s = sn->s;
3097
3098 len = 0;
3099 while (s < sn->end) {
3100 s += enclen(env->enc, s);
3101 len++;
3102 }
3103 if (len != 1)
3104 return 0;
3105 }
3106
3107 case NODE_CCLASS:
3108 break;
3109
3110 default:
3111 return 0;
3112 break;
3113 }
3114
3115 if (node != quant) {
3116 NODE_BODY(node) = 0;
3117 onig_node_free(node);
3118 }
3119 NODE_BODY(quant) = NULL_NODE;
3120 *rquant = quant;
3121 *rbody = body;
3122 return 1;
3123 }
3124
3125 static int
make_absent_tree_for_simple_one_char_repeat(Node ** node,Node * absent,Node * quant,Node * body,int possessive,ScanEnv * env)3126 make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* quant,
3127 Node* body, int possessive, ScanEnv* env)
3128 {
3129 int r;
3130 int i;
3131 int id1;
3132 int lower, upper;
3133 Node* x;
3134 Node* ns[4];
3135
3136 *node = NULL_NODE;
3137 r = ONIGERR_MEMORY;
3138 ns[0] = ns[1] = NULL_NODE;
3139 ns[2] = body, ns[3] = absent;
3140
3141 lower = QUANT_(quant)->lower;
3142 upper = QUANT_(quant)->upper;
3143
3144 r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env);
3145 if (r != 0) goto err;
3146
3147 id1 = GIMMICK_(ns[0])->id;
3148
3149 r = make_absent_engine(&ns[1], id1, absent, body, lower, upper, possessive,
3150 FALSE, env);
3151 if (r != 0) goto err;
3152
3153 ns[2] = ns[3] = NULL_NODE;
3154
3155 r = node_new_update_var_gimmick(&ns[2], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
3156 id1, env);
3157 if (r != 0) goto err;
3158
3159 x = make_list(3, ns);
3160 if (IS_NULL(x)) goto err0;
3161
3162 *node = x;
3163 return ONIG_NORMAL;
3164
3165 err0:
3166 r = ONIGERR_MEMORY;
3167 err:
3168 for (i = 0; i < 4; i++) onig_node_free(ns[i]);
3169 return r;
3170 }
3171
3172 static int
make_absent_tree(Node ** node,Node * absent,Node * expr,int is_range_cutter,ScanEnv * env)3173 make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter,
3174 ScanEnv* env)
3175 {
3176 int r;
3177 int i;
3178 int id1, id2;
3179 int possessive;
3180 Node* x;
3181 Node* ns[7];
3182
3183 r = ONIGERR_MEMORY;
3184 for (i = 0; i < 7; i++) ns[i] = NULL_NODE;
3185 ns[4] = expr; ns[5] = absent;
3186
3187 if (is_range_cutter == 0) {
3188 Node* quant;
3189 Node* body;
3190
3191 if (expr == NULL_NODE) {
3192 /* default expr \O* */
3193 quant = node_new_quantifier(0, INFINITE_REPEAT, FALSE);
3194 if (IS_NULL(quant)) goto err0;
3195
3196 r = node_new_true_anychar(&body);
3197 if (r != 0) {
3198 onig_node_free(quant);
3199 goto err;
3200 }
3201 possessive = 0;
3202 goto simple;
3203 }
3204 else {
3205 if (is_simple_one_char_repeat(expr, &quant, &body, &possessive, env)) {
3206 simple:
3207 r = make_absent_tree_for_simple_one_char_repeat(node, absent, quant,
3208 body, possessive, env);
3209 onig_node_free(quant);
3210 if (r != 0) {
3211 ns[4] = NULL_NODE;
3212 onig_node_free(body);
3213 goto err;
3214 }
3215
3216 return ONIG_NORMAL;
3217 }
3218 }
3219 }
3220
3221 r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env);
3222 if (r != 0) goto err;
3223
3224 id1 = GIMMICK_(ns[0])->id;
3225
3226 r = node_new_save_gimmick(&ns[1], SAVE_S, env);
3227 if (r != 0) goto err;
3228
3229 id2 = GIMMICK_(ns[1])->id;
3230
3231 r = node_new_true_anychar(&ns[3]);
3232 if (r != 0) goto err;
3233
3234 possessive = 1;
3235 r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, INFINITE_REPEAT,
3236 possessive, is_range_cutter, env);
3237 if (r != 0) goto err;
3238
3239 ns[3] = NULL_NODE;
3240 ns[5] = NULL_NODE;
3241
3242 r = node_new_update_var_gimmick(&ns[3], UPDATE_VAR_S_FROM_STACK, id2, env);
3243 if (r != 0) goto err;
3244
3245 if (is_range_cutter != 0) {
3246 x = make_list(4, ns);
3247 if (IS_NULL(x)) goto err0;
3248 }
3249 else {
3250 r = make_absent_tail(&ns[5], &ns[6], id1, env);
3251 if (r != 0) goto err;
3252
3253 x = make_list(7, ns);
3254 if (IS_NULL(x)) goto err0;
3255 }
3256
3257 *node = x;
3258 return ONIG_NORMAL;
3259
3260 err0:
3261 r = ONIGERR_MEMORY;
3262 err:
3263 for (i = 0; i < 7; i++) onig_node_free(ns[i]);
3264 return r;
3265 }
3266
3267 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)3268 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
3269 {
3270 int addlen = (int )(end - s);
3271
3272 if (addlen > 0) {
3273 int len = (int )(STR_(node)->end - STR_(node)->s);
3274
3275 if (STR_(node)->capacity > 0 || (len + addlen > NODE_STRING_BUF_SIZE - 1)) {
3276 UChar* p;
3277 int capa = len + addlen + NODE_STRING_MARGIN;
3278
3279 if (capa <= STR_(node)->capacity) {
3280 onig_strcpy(STR_(node)->s + len, s, end);
3281 }
3282 else {
3283 if (STR_(node)->s == STR_(node)->buf)
3284 p = strcat_capa_from_static(STR_(node)->s, STR_(node)->end,
3285 s, end, capa);
3286 else
3287 p = strcat_capa(STR_(node)->s, STR_(node)->end, s, end, capa);
3288
3289 CHECK_NULL_RETURN_MEMERR(p);
3290 STR_(node)->s = p;
3291 STR_(node)->capacity = capa;
3292 }
3293 }
3294 else {
3295 onig_strcpy(STR_(node)->s + len, s, end);
3296 }
3297 STR_(node)->end = STR_(node)->s + len + addlen;
3298 }
3299
3300 return 0;
3301 }
3302
3303 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end,int need_free)3304 onig_node_str_set(Node* node, const UChar* s, const UChar* end, int need_free)
3305 {
3306 onig_node_str_clear(node, need_free);
3307 return onig_node_str_cat(node, s, end);
3308 }
3309
3310 static int
node_str_cat_char(Node * node,UChar c)3311 node_str_cat_char(Node* node, UChar c)
3312 {
3313 UChar s[1];
3314
3315 s[0] = c;
3316 return onig_node_str_cat(node, s, s + 1);
3317 }
3318
3319 extern void
onig_node_str_clear(Node * node,int need_free)3320 onig_node_str_clear(Node* node, int need_free)
3321 {
3322 if (need_free != 0 &&
3323 STR_(node)->capacity != 0 &&
3324 IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != STR_(node)->buf) {
3325 xfree(STR_(node)->s);
3326 }
3327
3328 STR_(node)->flag = 0;
3329 STR_(node)->s = STR_(node)->buf;
3330 STR_(node)->end = STR_(node)->buf;
3331 STR_(node)->capacity = 0;
3332 }
3333
3334 static int
node_set_str(Node * node,const UChar * s,const UChar * end)3335 node_set_str(Node* node, const UChar* s, const UChar* end)
3336 {
3337 int r;
3338
3339 NODE_SET_TYPE(node, NODE_STRING);
3340 STR_(node)->flag = 0;
3341 STR_(node)->s = STR_(node)->buf;
3342 STR_(node)->end = STR_(node)->buf;
3343 STR_(node)->capacity = 0;
3344
3345 r = onig_node_str_cat(node, s, end);
3346 return r;
3347 }
3348
3349 static Node*
node_new_str(const UChar * s,const UChar * end)3350 node_new_str(const UChar* s, const UChar* end)
3351 {
3352 int r;
3353 Node* node = node_new();
3354 CHECK_NULL_RETURN(node);
3355
3356 r = node_set_str(node, s, end);
3357 if (r != 0) {
3358 onig_node_free(node);
3359 return NULL;
3360 }
3361
3362 return node;
3363 }
3364
3365 static int
node_reset_str(Node * node,const UChar * s,const UChar * end)3366 node_reset_str(Node* node, const UChar* s, const UChar* end)
3367 {
3368 node_free_body(node);
3369 return node_set_str(node, s, end);
3370 }
3371
3372 extern int
onig_node_reset_empty(Node * node)3373 onig_node_reset_empty(Node* node)
3374 {
3375 return node_reset_str(node, NULL, NULL);
3376 }
3377
3378 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)3379 onig_node_new_str(const UChar* s, const UChar* end)
3380 {
3381 return node_new_str(s, end);
3382 }
3383
3384 static Node*
node_new_str_with_options(const UChar * s,const UChar * end,OnigOptionType options)3385 node_new_str_with_options(const UChar* s, const UChar* end,
3386 OnigOptionType options)
3387 {
3388 Node* node;
3389 node = node_new_str(s, end);
3390
3391 if (OPTON_IGNORECASE(options))
3392 NODE_STATUS_ADD(node, IGNORECASE);
3393
3394 return node;
3395 }
3396
3397 static Node*
node_new_str_crude(UChar * s,UChar * end,OnigOptionType options)3398 node_new_str_crude(UChar* s, UChar* end, OnigOptionType options)
3399 {
3400 Node* node = node_new_str_with_options(s, end, options);
3401 CHECK_NULL_RETURN(node);
3402 NODE_STRING_SET_CRUDE(node);
3403 return node;
3404 }
3405
3406 static Node*
node_new_empty(void)3407 node_new_empty(void)
3408 {
3409 return node_new_str(NULL, NULL);
3410 }
3411
3412 static Node*
node_new_str_crude_char(UChar c,OnigOptionType options)3413 node_new_str_crude_char(UChar c, OnigOptionType options)
3414 {
3415 int i;
3416 UChar p[1];
3417 Node* node;
3418
3419 p[0] = c;
3420 node = node_new_str_crude(p, p + 1, options);
3421
3422 /* clear buf tail */
3423 for (i = 1; i < NODE_STRING_BUF_SIZE; i++)
3424 STR_(node)->buf[i] = '\0';
3425
3426 return node;
3427 }
3428
3429 static Node*
str_node_split_last_char(Node * node,OnigEncoding enc)3430 str_node_split_last_char(Node* node, OnigEncoding enc)
3431 {
3432 const UChar *p;
3433 Node* rn;
3434 StrNode* sn;
3435
3436 sn = STR_(node);
3437 rn = NULL_NODE;
3438 if (sn->end > sn->s) {
3439 p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
3440 if (p && p > sn->s) { /* can be split. */
3441 rn = node_new_str(p, sn->end);
3442 CHECK_NULL_RETURN(rn);
3443
3444 sn->end = (UChar* )p;
3445 STR_(rn)->flag = sn->flag;
3446 NODE_STATUS(rn) = NODE_STATUS(node);
3447 }
3448 }
3449
3450 return rn;
3451 }
3452
3453 static int
str_node_can_be_split(Node * node,OnigEncoding enc)3454 str_node_can_be_split(Node* node, OnigEncoding enc)
3455 {
3456 StrNode* sn = STR_(node);
3457 if (sn->end > sn->s) {
3458 return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
3459 }
3460 return 0;
3461 }
3462
3463 static int
scan_number(UChar ** src,const UChar * end,OnigEncoding enc)3464 scan_number(UChar** src, const UChar* end, OnigEncoding enc)
3465 {
3466 int num, val;
3467 OnigCodePoint c;
3468 UChar* p = *src;
3469 PFETCH_READY;
3470
3471 num = 0;
3472 while (! PEND) {
3473 PFETCH(c);
3474 if (IS_CODE_DIGIT_ASCII(enc, c)) {
3475 val = (int )DIGITVAL(c);
3476 if ((ONIG_INT_MAX - val) / 10 < num)
3477 return -1; /* overflow */
3478
3479 num = num * 10 + val;
3480 }
3481 else {
3482 PUNFETCH;
3483 break;
3484 }
3485 }
3486 *src = p;
3487 return num;
3488 }
3489
3490 static int
scan_hexadecimal_number(UChar ** src,UChar * end,int minlen,int maxlen,OnigEncoding enc,OnigCodePoint * rcode)3491 scan_hexadecimal_number(UChar** src, UChar* end, int minlen, int maxlen,
3492 OnigEncoding enc, OnigCodePoint* rcode)
3493 {
3494 OnigCodePoint code;
3495 OnigCodePoint c;
3496 unsigned int val;
3497 int n;
3498 UChar* p = *src;
3499 PFETCH_READY;
3500
3501 code = 0;
3502 n = 0;
3503 while (! PEND && n < maxlen) {
3504 PFETCH(c);
3505 if (IS_CODE_XDIGIT_ASCII(enc, c)) {
3506 n++;
3507 val = (unsigned int )XDIGITVAL(enc, c);
3508 if ((UINT_MAX - val) / 16UL < code)
3509 return ONIGERR_TOO_BIG_NUMBER; /* overflow */
3510
3511 code = (code << 4) + val;
3512 }
3513 else {
3514 PUNFETCH;
3515 break;
3516 }
3517 }
3518
3519 if (n < minlen)
3520 return ONIGERR_INVALID_CODE_POINT_VALUE;
3521
3522 *rcode = code;
3523 *src = p;
3524 return ONIG_NORMAL;
3525 }
3526
3527 static int
scan_octal_number(UChar ** src,UChar * end,int minlen,int maxlen,OnigEncoding enc,OnigCodePoint * rcode)3528 scan_octal_number(UChar** src, UChar* end, int minlen, int maxlen,
3529 OnigEncoding enc, OnigCodePoint* rcode)
3530 {
3531 OnigCodePoint code;
3532 OnigCodePoint c;
3533 unsigned int val;
3534 int n;
3535 UChar* p = *src;
3536 PFETCH_READY;
3537
3538 code = 0;
3539 n = 0;
3540 while (! PEND && n < maxlen) {
3541 PFETCH(c);
3542 if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8') {
3543 n++;
3544 val = (unsigned int )ODIGITVAL(c);
3545 if ((UINT_MAX - val) / 8UL < code)
3546 return ONIGERR_TOO_BIG_NUMBER; /* overflow */
3547
3548 code = (code << 3) + val;
3549 }
3550 else {
3551 PUNFETCH;
3552 break;
3553 }
3554 }
3555
3556 if (n < minlen)
3557 return ONIGERR_INVALID_CODE_POINT_VALUE;
3558
3559 *rcode = code;
3560 *src = p;
3561 return ONIG_NORMAL;
3562 }
3563
3564 static int
scan_number_of_base(UChar ** src,UChar * end,int minlen,OnigEncoding enc,OnigCodePoint * rcode,int base)3565 scan_number_of_base(UChar** src, UChar* end, int minlen,
3566 OnigEncoding enc, OnigCodePoint* rcode, int base)
3567 {
3568 int r;
3569
3570 if (base == 16)
3571 r = scan_hexadecimal_number(src, end, minlen, 8, enc, rcode);
3572 else if (base == 8)
3573 r = scan_octal_number(src, end, minlen, 11, enc, rcode);
3574 else
3575 r = ONIGERR_INVALID_CODE_POINT_VALUE;
3576
3577 return r;
3578 }
3579
/* True when c separates code points in a code point sequence
   (a space or a newline). */
#define IS_CODE_POINT_DIVIDE(c) ((c) == ' ' || (c) == '\n')

/* Scanner state used by check_code_point_sequence_cc(). */
enum CPS_STATE {
  /* a '-' is not accepted here; entered after the end number of a
     range has been read */
  CPS_EMPTY = 0,
  /* a number has just been read; a following '-' may start a range */
  CPS_START = 1,
  /* a '-' has been read; a number must follow ('}' is rejected) */
  CPS_RANGE = 2
};
3587
/*
  Validate a code point sequence that may contain ranges ("a-b") without
  consuming it: p is a local copy, so the caller's position is untouched.

  The text from p is read up to a closing '}'.  Numbers are written in
  `base` (16 or 8), may be separated by spaces/newlines, and two numbers
  may be joined with '-'.  `state` is the enum CPS_STATE value to start
  from.

  Returns the count of numbers scanned when '}' is reached, otherwise an
  ONIGERR_* code (also when the input ends before '}').
*/
static int
check_code_point_sequence_cc(UChar* p, UChar* end, int base,
                             OnigEncoding enc, int state)
{
  int r;
  int n;
  int end_digit;
  OnigCodePoint code;
  OnigCodePoint c;
  PFETCH_READY;

  end_digit = FALSE;
  n = 0;
  while (! PEND) {
  start:
    PFETCH(c);
    if (c == '}') {
    end_char:
      /* a range that was opened with '-' must have been closed */
      if (state == CPS_RANGE) return ONIGERR_INVALID_CODE_POINT_VALUE;
      return n;
    }

    if (IS_CODE_POINT_DIVIDE(c)) {
      /* skip the run of separators; c ends up as the first other char */
      while (! PEND) {
        PFETCH(c);
        if (! IS_CODE_POINT_DIVIDE(c)) break;
      }
      /* input ended while still inside the separators */
      if (IS_CODE_POINT_DIVIDE(c))
        return ONIGERR_INVALID_CODE_POINT_VALUE;
    }
    else if (c == '-') {
    range:
      /* '-' is only valid directly after a range start number */
      if (state != CPS_START) return ONIGERR_INVALID_CODE_POINT_VALUE;
      if (PEND) return ONIGERR_INVALID_CODE_POINT_VALUE;
      end_digit = FALSE;
      state = CPS_RANGE;
      goto start;
    }
    else if (end_digit == TRUE) {
      /* a number was just scanned and is followed directly by something
         that is neither '}', a separator nor '-': if it is another digit
         of the base, the number had too many digits */
      if (base == 16) {
        if (IS_CODE_XDIGIT_ASCII(enc, c))
          return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
      }
      else if (base == 8) {
        if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8')
          return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
      }

      return ONIGERR_INVALID_CODE_POINT_VALUE;
    }

    /* re-dispatch the character that ended a run of separators */
    if (c == '}') goto end_char;
    if (c == '-') goto range;

    /* c starts a number: put it back and scan the whole number */
    PUNFETCH;
    r = scan_number_of_base(&p, end, 1, enc, &code, base);
    if (r != 0) return r;
    n++;
    end_digit = TRUE;
    state = (state == CPS_RANGE) ? CPS_EMPTY : CPS_START;
  }

  return ONIGERR_INVALID_CODE_POINT_VALUE;
}
3652
3653 static int
check_code_point_sequence(UChar * p,UChar * end,int base,OnigEncoding enc)3654 check_code_point_sequence(UChar* p, UChar* end, int base, OnigEncoding enc)
3655 {
3656 int r;
3657 int n;
3658 int end_digit;
3659 OnigCodePoint code;
3660 OnigCodePoint c;
3661 PFETCH_READY;
3662
3663 end_digit = FALSE;
3664 n = 0;
3665 while (! PEND) {
3666 PFETCH(c);
3667 if (c == '}') {
3668 end_char:
3669 return n;
3670 }
3671
3672 if (IS_CODE_POINT_DIVIDE(c)) {
3673 while (! PEND) {
3674 PFETCH(c);
3675 if (! IS_CODE_POINT_DIVIDE(c)) break;
3676 }
3677 if (IS_CODE_POINT_DIVIDE(c))
3678 return ONIGERR_INVALID_CODE_POINT_VALUE;
3679 }
3680 else if (end_digit == TRUE) {
3681 if (base == 16) {
3682 if (IS_CODE_XDIGIT_ASCII(enc, c))
3683 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3684 }
3685 else if (base == 8) {
3686 if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8')
3687 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3688 }
3689
3690 return ONIGERR_INVALID_CODE_POINT_VALUE;
3691 }
3692
3693 if (c == '}') goto end_char;
3694
3695 PUNFETCH;
3696 r = scan_number_of_base(&p, end, 1, enc, &code, base);
3697 if (r != 0) return r;
3698 n++;
3699 end_digit = TRUE;
3700 }
3701
3702 return ONIGERR_INVALID_CODE_POINT_VALUE;
3703 }
3704
3705 static int
get_next_code_point(UChar ** src,UChar * end,int base,OnigEncoding enc,int in_cc,OnigCodePoint * rcode)3706 get_next_code_point(UChar** src, UChar* end, int base, OnigEncoding enc, int in_cc, OnigCodePoint* rcode)
3707 {
3708 int r;
3709 OnigCodePoint c;
3710 UChar* p = *src;
3711 PFETCH_READY;
3712
3713 while (! PEND) {
3714 PFETCH(c);
3715 if (! IS_CODE_POINT_DIVIDE(c)) {
3716 if (c == '}') {
3717 *src = p;
3718 return 1; /* end of sequence */
3719 }
3720 else if (c == '-' && in_cc == TRUE) {
3721 *src = p;
3722 return 2; /* range */
3723 }
3724 PUNFETCH;
3725 break;
3726 }
3727 else {
3728 if (PEND)
3729 return ONIGERR_INVALID_CODE_POINT_VALUE;
3730 }
3731 }
3732
3733 r = scan_number_of_base(&p, end, 1, enc, rcode, base);
3734 if (r != 0) return r;
3735
3736 *src = p;
3737 return ONIG_NORMAL;
3738 }
3739
3740
/* Write one code point value into bbuf at byte offset pos.
   `code` must be an lvalue because its address is taken. */
#define BB_WRITE_CODE_POINT(bbuf,pos,code) \
    BB_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
3743
3744 /* data format:
3745 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
3746 (all data size is OnigCodePoint)
3747 */
3748 static int
new_code_range(BBuf ** pbuf)3749 new_code_range(BBuf** pbuf)
3750 {
3751 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
3752 int r;
3753 OnigCodePoint n;
3754 BBuf* bbuf;
3755
3756 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
3757 CHECK_NULL_RETURN_MEMERR(bbuf);
3758 r = BB_INIT(bbuf, INIT_MULTI_BYTE_RANGE_SIZE);
3759 if (r != 0) {
3760 xfree(bbuf);
3761 *pbuf = 0;
3762 return r;
3763 }
3764
3765 n = 0;
3766 BB_WRITE_CODE_POINT(bbuf, 0, n);
3767 return 0;
3768 }
3769
/*
  Insert the range [from, to] into the sorted range list held in *pbuf,
  merging it with every stored range it overlaps or directly adjoins.

  from/to are swapped first when given in the wrong order.  When *pbuf is
  NULL a new buffer is created with new_code_range().

  Returns 0 on success, ONIGERR_TOO_MANY_MULTI_BYTE_RANGES when the list
  would exceed ONIG_MAX_MULTI_BYTE_RANGES_NUM, or the error from
  new_code_range().
*/
static int
add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
{
  int r, inc_n, pos;
  int low, high, bound, x;
  OnigCodePoint n, *data;
  BBuf* bbuf;

  if (from > to) {
    n = from; from = to; to = n;
  }

  if (IS_NULL(*pbuf)) {
    r = new_code_range(pbuf);
    if (r != 0) return r;
    bbuf = *pbuf;
    n = 0;
  }
  else {
    bbuf = *pbuf;
    GET_CODE_POINT(n, bbuf->p);
  }
  /* data[] addresses the (from, to) pairs that follow the count */
  data = (OnigCodePoint* )(bbuf->p);
  data++;

  /* low: index of the first stored range whose end is >= from */
  for (low = 0, bound = n; low < bound; ) {
    x = (low + bound) >> 1;
    if (from > data[x*2 + 1])
      low = x + 1;
    else
      bound = x;
  }

  /* high: index just past the last stored range whose start is
     <= to + 1.  When to is the maximum code point, to + 1 would wrap,
     so every remaining range is taken instead. */
  high = (to == ~((OnigCodePoint )0)) ? n : low;
  for (bound = n; high < bound; ) {
    x = (high + bound) >> 1;
    if (to + 1 >= data[x*2])
      high = x + 1;
    else
      bound = x;
  }

  /* ranges low..high-1 are replaced by one range: net change in count */
  inc_n = low + 1 - high;
  if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
    return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;

  if (inc_n != 1) {
    /* merging: widen the new range to cover the ranges it absorbs */
    if (from > data[low*2])
      from = data[low*2];
    if (to < data[(high - 1)*2 + 1])
      to = data[(high - 1)*2 + 1];
  }

  if (inc_n != 0 && (OnigCodePoint )high < n) {
    /* move the ranges from index high onward so that they start right
       after slot low */
    int from_pos = SIZE_CODE_POINT * (1 + high * 2);
    int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
    int size = (n - high) * 2 * SIZE_CODE_POINT;

    if (inc_n > 0) {
      BB_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
    }
    else {
      BB_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
    }
  }

  /* store the (possibly widened) range in slot low and update the count */
  pos = SIZE_CODE_POINT * (1 + low * 2);
  BB_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
  BB_WRITE_CODE_POINT(bbuf, pos, from);
  BB_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
  n += inc_n;
  BB_WRITE_CODE_POINT(bbuf, 0, n);

  return 0;
}
3845
3846 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)3847 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
3848 {
3849 if (from > to) {
3850 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
3851 return 0;
3852 else
3853 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
3854 }
3855
3856 return add_code_range_to_buf(pbuf, from, to);
3857 }
3858
3859 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)3860 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
3861 {
3862 int r, i, n;
3863 OnigCodePoint pre, from, *data, to = 0;
3864
3865 *pbuf = (BBuf* )NULL;
3866 if (IS_NULL(bbuf)) {
3867 set_all:
3868 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
3869 }
3870
3871 data = (OnigCodePoint* )(bbuf->p);
3872 GET_CODE_POINT(n, data);
3873 data++;
3874 if (n <= 0) goto set_all;
3875
3876 r = 0;
3877 pre = MBCODE_START_POS(enc);
3878 for (i = 0; i < n; i++) {
3879 from = data[i*2];
3880 to = data[i*2+1];
3881 if (pre <= from - 1) {
3882 r = add_code_range_to_buf(pbuf, pre, from - 1);
3883 if (r != 0) {
3884 bbuf_free(*pbuf);
3885 return r;
3886 }
3887 }
3888 if (to == ~((OnigCodePoint )0)) break;
3889 pre = to + 1;
3890 }
3891 if (to < ~((OnigCodePoint )0)) {
3892 r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
3893 if (r != 0) bbuf_free(*pbuf);
3894 }
3895 return r;
3896 }
3897
/* Exchange two (range buffer, negation flag) pairs in place.
   All four arguments must be assignable lvalues. */
#define SWAP_BB_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *tbuf; \
  int  tnot; \
  tnot = not1;  not1  = not2;  not2  = tnot; \
  tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
} while (0)
3904
3905 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)3906 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
3907 BBuf* bbuf2, int not2, BBuf** pbuf)
3908 {
3909 int r;
3910 OnigCodePoint i, n1, *data1;
3911 OnigCodePoint from, to;
3912
3913 *pbuf = (BBuf* )NULL;
3914 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
3915 if (not1 != 0 || not2 != 0)
3916 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
3917 return 0;
3918 }
3919
3920 r = 0;
3921 if (IS_NULL(bbuf2))
3922 SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);
3923
3924 if (IS_NULL(bbuf1)) {
3925 if (not1 != 0) {
3926 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
3927 }
3928 else {
3929 if (not2 == 0) {
3930 return bbuf_clone(pbuf, bbuf2);
3931 }
3932 else {
3933 return not_code_range_buf(enc, bbuf2, pbuf);
3934 }
3935 }
3936 }
3937
3938 if (not1 != 0)
3939 SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);
3940
3941 data1 = (OnigCodePoint* )(bbuf1->p);
3942 GET_CODE_POINT(n1, data1);
3943 data1++;
3944
3945 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
3946 r = bbuf_clone(pbuf, bbuf2);
3947 }
3948 else if (not1 == 0) { /* 1 OR (not 2) */
3949 r = not_code_range_buf(enc, bbuf2, pbuf);
3950 }
3951 if (r != 0) return r;
3952
3953 for (i = 0; i < n1; i++) {
3954 from = data1[i*2];
3955 to = data1[i*2+1];
3956 r = add_code_range_to_buf(pbuf, from, to);
3957 if (r != 0) return r;
3958 }
3959 return 0;
3960 }
3961
3962 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)3963 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
3964 OnigCodePoint* data, int n)
3965 {
3966 int i, r;
3967 OnigCodePoint from2, to2;
3968
3969 for (i = 0; i < n; i++) {
3970 from2 = data[i*2];
3971 to2 = data[i*2+1];
3972 if (from2 < from1) {
3973 if (to2 < from1) continue;
3974 else {
3975 from1 = to2 + 1;
3976 }
3977 }
3978 else if (from2 <= to1) {
3979 if (to2 < to1) {
3980 if (from1 <= from2 - 1) {
3981 r = add_code_range_to_buf(pbuf, from1, from2-1);
3982 if (r != 0) return r;
3983 }
3984 from1 = to2 + 1;
3985 }
3986 else {
3987 to1 = from2 - 1;
3988 }
3989 }
3990 else {
3991 from1 = from2;
3992 }
3993 if (from1 > to1) break;
3994 }
3995 if (from1 <= to1) {
3996 r = add_code_range_to_buf(pbuf, from1, to1);
3997 if (r != 0) return r;
3998 }
3999 return 0;
4000 }
4001
4002 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)4003 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
4004 {
4005 int r;
4006 OnigCodePoint i, j, n1, n2, *data1, *data2;
4007 OnigCodePoint from, to, from1, to1, from2, to2;
4008
4009 *pbuf = (BBuf* )NULL;
4010 if (IS_NULL(bbuf1)) {
4011 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
4012 return bbuf_clone(pbuf, bbuf2);
4013 return 0;
4014 }
4015 else if (IS_NULL(bbuf2)) {
4016 if (not2 != 0)
4017 return bbuf_clone(pbuf, bbuf1);
4018 return 0;
4019 }
4020
4021 if (not1 != 0)
4022 SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);
4023
4024 data1 = (OnigCodePoint* )(bbuf1->p);
4025 data2 = (OnigCodePoint* )(bbuf2->p);
4026 GET_CODE_POINT(n1, data1);
4027 GET_CODE_POINT(n2, data2);
4028 data1++;
4029 data2++;
4030
4031 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
4032 for (i = 0; i < n1; i++) {
4033 from1 = data1[i*2];
4034 to1 = data1[i*2+1];
4035 for (j = 0; j < n2; j++) {
4036 from2 = data2[j*2];
4037 to2 = data2[j*2+1];
4038 if (from2 > to1) break;
4039 if (to2 < from1) continue;
4040 from = MAX(from1, from2);
4041 to = MIN(to1, to2);
4042 r = add_code_range_to_buf(pbuf, from, to);
4043 if (r != 0) return r;
4044 }
4045 }
4046 }
4047 else if (not1 == 0) { /* 1 AND (not 2) */
4048 for (i = 0; i < n1; i++) {
4049 from1 = data1[i*2];
4050 to1 = data1[i*2+1];
4051 r = and_code_range1(pbuf, from1, to1, data2, n2);
4052 if (r != 0) return r;
4053 }
4054 }
4055
4056 return 0;
4057 }
4058
4059 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)4060 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
4061 {
4062 int r, not1, not2;
4063 BBuf *buf1, *buf2, *pbuf;
4064 BitSetRef bsr1, bsr2;
4065 BitSet bs1, bs2;
4066
4067 not1 = IS_NCCLASS_NOT(dest);
4068 bsr1 = dest->bs;
4069 buf1 = dest->mbuf;
4070 not2 = IS_NCCLASS_NOT(cc);
4071 bsr2 = cc->bs;
4072 buf2 = cc->mbuf;
4073
4074 if (not1 != 0) {
4075 bitset_invert_to(bsr1, bs1);
4076 bsr1 = bs1;
4077 }
4078 if (not2 != 0) {
4079 bitset_invert_to(bsr2, bs2);
4080 bsr2 = bs2;
4081 }
4082 bitset_and(bsr1, bsr2);
4083 if (bsr1 != dest->bs) {
4084 bitset_copy(dest->bs, bsr1);
4085 }
4086 if (not1 != 0) {
4087 bitset_invert(dest->bs);
4088 }
4089
4090 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4091 if (not1 != 0 && not2 != 0) {
4092 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
4093 }
4094 else {
4095 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
4096 if (r == 0 && not1 != 0) {
4097 BBuf *tbuf;
4098 r = not_code_range_buf(enc, pbuf, &tbuf);
4099 if (r != 0) {
4100 bbuf_free(pbuf);
4101 return r;
4102 }
4103 bbuf_free(pbuf);
4104 pbuf = tbuf;
4105 }
4106 }
4107 if (r != 0) return r;
4108
4109 dest->mbuf = pbuf;
4110 bbuf_free(buf1);
4111 return r;
4112 }
4113 return 0;
4114 }
4115
4116 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)4117 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
4118 {
4119 int r, not1, not2;
4120 BBuf *buf1, *buf2, *pbuf;
4121 BitSetRef bsr1, bsr2;
4122 BitSet bs1, bs2;
4123
4124 not1 = IS_NCCLASS_NOT(dest);
4125 bsr1 = dest->bs;
4126 buf1 = dest->mbuf;
4127 not2 = IS_NCCLASS_NOT(cc);
4128 bsr2 = cc->bs;
4129 buf2 = cc->mbuf;
4130
4131 if (not1 != 0) {
4132 bitset_invert_to(bsr1, bs1);
4133 bsr1 = bs1;
4134 }
4135 if (not2 != 0) {
4136 bitset_invert_to(bsr2, bs2);
4137 bsr2 = bs2;
4138 }
4139 bitset_or(bsr1, bsr2);
4140 if (bsr1 != dest->bs) {
4141 bitset_copy(dest->bs, bsr1);
4142 }
4143 if (not1 != 0) {
4144 bitset_invert(dest->bs);
4145 }
4146
4147 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4148 if (not1 != 0 && not2 != 0) {
4149 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
4150 }
4151 else {
4152 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
4153 if (r == 0 && not1 != 0) {
4154 BBuf *tbuf;
4155 r = not_code_range_buf(enc, pbuf, &tbuf);
4156 if (r != 0) {
4157 bbuf_free(pbuf);
4158 return r;
4159 }
4160 bbuf_free(pbuf);
4161 pbuf = tbuf;
4162 }
4163 }
4164 if (r != 0) return r;
4165
4166 dest->mbuf = pbuf;
4167 bbuf_free(buf1);
4168 return r;
4169 }
4170 else
4171 return 0;
4172 }
4173
4174 static OnigCodePoint
conv_backslash_value(OnigCodePoint c,ScanEnv * env)4175 conv_backslash_value(OnigCodePoint c, ScanEnv* env)
4176 {
4177 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
4178 switch (c) {
4179 case 'n': return '\n';
4180 case 't': return '\t';
4181 case 'r': return '\r';
4182 case 'f': return '\f';
4183 case 'a': return '\007';
4184 case 'b': return '\010';
4185 case 'e': return '\033';
4186 case 'v':
4187 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
4188 return '\v';
4189 break;
4190
4191 default:
4192 break;
4193 }
4194 }
4195 return c;
4196 }
4197
4198 static int
is_invalid_quantifier_target(Node * node)4199 is_invalid_quantifier_target(Node* node)
4200 {
4201 switch (NODE_TYPE(node)) {
4202 case NODE_ANCHOR:
4203 case NODE_GIMMICK:
4204 return 1;
4205 break;
4206
4207 case NODE_BAG:
4208 /* allow enclosed elements */
4209 /* return is_invalid_quantifier_target(NODE_BODY(node)); */
4210 break;
4211
4212 case NODE_LIST:
4213 do {
4214 if (! is_invalid_quantifier_target(NODE_CAR(node))) return 0;
4215 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4216 return 0;
4217 break;
4218
4219 case NODE_ALT:
4220 do {
4221 if (is_invalid_quantifier_target(NODE_CAR(node))) return 1;
4222 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4223 break;
4224
4225 default:
4226 break;
4227 }
4228 return 0;
4229 }
4230
4231 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
4232 static int
quantifier_type_num(QuantNode * q)4233 quantifier_type_num(QuantNode* q)
4234 {
4235 if (q->greedy) {
4236 if (q->lower == 0) {
4237 if (q->upper == 1) return 0;
4238 else if (IS_INFINITE_REPEAT(q->upper)) return 1;
4239 }
4240 else if (q->lower == 1) {
4241 if (IS_INFINITE_REPEAT(q->upper)) return 2;
4242 }
4243 }
4244 else {
4245 if (q->lower == 0) {
4246 if (q->upper == 1) return 3;
4247 else if (IS_INFINITE_REPEAT(q->upper)) return 4;
4248 }
4249 else if (q->lower == 1) {
4250 if (IS_INFINITE_REPEAT(q->upper)) return 5;
4251 }
4252 }
4253 return -1;
4254 }
4255
4256
/*
  Actions stored in ReduceTypeTable and applied by
  onig_reduce_nested_quantifier() to a quantifier (parent) whose body is
  itself a quantifier (child).
*/
enum ReduceType {
  RQ_ASIS = 0, /* as is */
  RQ_DEL = 1, /* delete parent */
  RQ_A, /* to '*' */
  RQ_AQ, /* to '*?' */
  RQ_QQ, /* to '??' */
  RQ_P_QQ, /* to '+)??' */
  RQ_PQ_Q /* to '+?)?' */
};
4266
/*
  Reduction to apply to a pair of nested quantifiers.
  onig_reduce_nested_quantifier() indexes it as [child][parent], both
  indices being quantifier_type_num() values
  (?:0, *:1, +:2, ??:3, *?:4, +?:5).
*/
static enum ReduceType ReduceTypeTable[6][6] = {
  {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */
  {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */
  {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */
  {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */
  {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */
  {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */
};
4275
4276 extern int
onig_reduce_nested_quantifier(Node * pnode)4277 onig_reduce_nested_quantifier(Node* pnode)
4278 {
4279 int pnum, cnum;
4280 QuantNode *p, *c;
4281 Node* cnode;
4282
4283 cnode = NODE_BODY(pnode);
4284
4285 p = QUANT_(pnode);
4286 c = QUANT_(cnode);
4287 pnum = quantifier_type_num(p);
4288 cnum = quantifier_type_num(c);
4289 if (pnum < 0 || cnum < 0) {
4290 if (p->lower == p->upper && c->lower == c->upper) {
4291 int n = onig_positive_int_multiply(p->lower, c->lower);
4292 if (n < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
4293
4294 p->lower = p->upper = n;
4295 NODE_BODY(pnode) = NODE_BODY(cnode);
4296 goto remove_cnode;
4297 }
4298
4299 return 0;
4300 }
4301
4302 switch(ReduceTypeTable[cnum][pnum]) {
4303 case RQ_DEL:
4304 *pnode = *cnode;
4305 goto remove_cnode;
4306 break;
4307 case RQ_A:
4308 NODE_BODY(pnode) = NODE_BODY(cnode);
4309 p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1;
4310 goto remove_cnode;
4311 break;
4312 case RQ_AQ:
4313 NODE_BODY(pnode) = NODE_BODY(cnode);
4314 p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0;
4315 goto remove_cnode;
4316 break;
4317 case RQ_QQ:
4318 NODE_BODY(pnode) = NODE_BODY(cnode);
4319 p->lower = 0; p->upper = 1; p->greedy = 0;
4320 goto remove_cnode;
4321 break;
4322 case RQ_P_QQ:
4323 p->lower = 0; p->upper = 1; p->greedy = 0;
4324 c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1;
4325 break;
4326 case RQ_PQ_Q:
4327 p->lower = 0; p->upper = 1; p->greedy = 1;
4328 c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0;
4329 break;
4330 case RQ_ASIS:
4331 break;
4332 }
4333
4334 return 0;
4335
4336 remove_cnode:
4337 NODE_BODY(cnode) = NULL_NODE;
4338 onig_node_free(cnode);
4339 return 0;
4340 }
4341
4342 static int
node_new_general_newline(Node ** node,ScanEnv * env)4343 node_new_general_newline(Node** node, ScanEnv* env)
4344 {
4345 int r;
4346 int dlen, alen;
4347 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
4348 Node* crnl;
4349 Node* ncc;
4350 Node* x;
4351 CClassNode* cc;
4352
4353 dlen = ONIGENC_CODE_TO_MBC(env->enc, 0x0d, buf);
4354 if (dlen < 0) return dlen;
4355 alen = ONIGENC_CODE_TO_MBC(env->enc, NEWLINE_CODE, buf + dlen);
4356 if (alen < 0) return alen;
4357
4358 crnl = node_new_str_crude(buf, buf + dlen + alen, ONIG_OPTION_NONE);
4359 CHECK_NULL_RETURN_MEMERR(crnl);
4360
4361 ncc = node_new_cclass();
4362 if (IS_NULL(ncc)) goto err2;
4363
4364 cc = CCLASS_(ncc);
4365 if (dlen == 1) {
4366 bitset_set_range(cc->bs, NEWLINE_CODE, 0x0d);
4367 }
4368 else {
4369 r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, 0x0d);
4370 if (r != 0) {
4371 err1:
4372 onig_node_free(ncc);
4373 err2:
4374 onig_node_free(crnl);
4375 return ONIGERR_MEMORY;
4376 }
4377 }
4378
4379 if (ONIGENC_IS_UNICODE_ENCODING(env->enc)) {
4380 r = add_code_range(&(cc->mbuf), env, 0x85, 0x85);
4381 if (r != 0) goto err1;
4382 r = add_code_range(&(cc->mbuf), env, 0x2028, 0x2029);
4383 if (r != 0) goto err1;
4384 }
4385
4386 x = node_new_bag_if_else(crnl, NULL_NODE, ncc);
4387 if (IS_NULL(x)) goto err1;
4388
4389 *node = x;
4390 return 0;
4391 }
4392
/* Kinds of token; the value is stored in PToken.type. */
enum TokenSyms {
  TK_EOT = 0, /* end of token */
  TK_CRUDE_BYTE = 1,
  TK_CHAR,
  TK_STRING,
  TK_CODE_POINT,
  TK_ANYCHAR,
  TK_CHAR_TYPE,
  TK_BACKREF,
  TK_CALL,
  TK_ANCHOR,
  TK_REPEAT,
  TK_INTERVAL,
  TK_ANYCHAR_ANYTIME, /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_OPEN_CC,
  TK_QUOTE_OPEN,
  TK_CHAR_PROPERTY, /* \p{...}, \P{...} */
  TK_KEEP, /* \K */
  TK_GENERAL_NEWLINE, /* \R */
  TK_NO_NEWLINE, /* \N */
  TK_TRUE_ANYCHAR, /* \O */
  TK_TEXT_SEGMENT, /* \X */

  /* in cc */
  TK_CC_CLOSE,
  TK_CC_RANGE,
  TK_CC_POSIX_BRACKET_OPEN,
  TK_CC_AND, /* && */
  TK_CC_OPEN_CC /* [ */
};
4426
/*
  One pattern token.  `type` tells which member of the union `u` carries
  the token's payload.
*/
typedef struct {
  enum TokenSyms type;
  int code_point_continue; /* reset to 0 by ptoken_init() */
  int escaped;
  int base_num; /* is number: 8, 16 (used in [....]) */
  UChar* backp; /* NOTE(review): presumably a saved pattern position - confirm */
  union {
    UChar* s;
    UChar byte;
    OnigCodePoint code;
    int anchor;
    int subtype;
    /* filled by fetch_interval() for TK_INTERVAL (lower, upper, possessive) */
    struct {
      int lower;
      int upper;
      int greedy;
      int possessive;
    } repeat;
    struct {
      int num;
      int ref1;
      int* refs;
      int by_name;
#ifdef USE_BACKREF_WITH_LEVEL
      int exist_level;
      int level; /* \k<name+n> */
#endif
    } backref;
    struct {
      UChar* name;
      UChar* name_end;
      int gnum;
      int by_number;
    } call;
    struct {
      int ctype;
      int not;
    } prop;
  } u;
} PToken;
4467
4468 static void
ptoken_init(PToken * tok)4469 ptoken_init(PToken* tok)
4470 {
4471 tok->code_point_continue = 0;
4472 }
4473
/*
  Parse the body of an interval quantifier; *src points just past '{'.
  Accepted forms are {n}, {n,}, {n,m} and, when the syntax has
  ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV, {,m}.

  On success tok is filled as a TK_INTERVAL token and *src is advanced
  past the closing brace.

  Returns
    0  normal {n,m}
    2  fixed {n}
    1  not a valid interval, but ONIG_SYN_ALLOW_INVALID_INTERVAL is set;
       *src and tok are left untouched
   <0  an ONIGERR_* code
*/
static int
fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env)
{
  int low, up, syn_allow, non_low = 0;
  int r = 0;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;
  PFETCH_READY;

  syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);

  if (PEND) {
    if (syn_allow)
      return 1; /* "....{" : OK! */
    else
      return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
  }

  if (! syn_allow) {
    c = PPEEK;
    if (c == ')' || c == '(' || c == '|') {
      return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
    }
  }

  low = scan_number(&p, end, env->enc);
  if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
  if (low > ONIG_MAX_REPEAT_NUM)
    return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

  if (p == *src) { /* can't read low */
    if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
      /* allow {,n} as {0,n} */
      low = 0;
      non_low = 1;
    }
    else
      goto invalid;
  }

  if (PEND) goto invalid;
  PFETCH(c);
  if (c == ',') {
    UChar* prev = p;
    up = scan_number(&p, end, env->enc);
    if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
    if (up > ONIG_MAX_REPEAT_NUM)
      return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

    if (p == prev) {
      /* no upper bound given; "{,}" (neither bound) is invalid */
      if (non_low != 0)
        goto invalid;
      up = INFINITE_REPEAT; /* {n,} : {n,infinite} */
    }
  }
  else {
    /* no ',': a lower bound is mandatory */
    if (non_low != 0)
      goto invalid;

    PUNFETCH;
    up = low; /* {n} : exact n times */
    r = 2; /* fixed */
  }

  if (PEND) goto invalid;
  PFETCH(c);
  if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
    /* in this syntax the closing brace must be escaped */
    if (c != MC_ESC(env->syntax) || PEND) goto invalid;
    PFETCH(c);
  }
  if (c != '}') goto invalid;

  if (!IS_INFINITE_REPEAT(up) && low > up) {
    /* {n,m}+ supported case */
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL))
      return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;

    /* otherwise a reversed interval is accepted: the bounds are swapped
       and the token is marked possessive */
    tok->u.repeat.possessive = 1;
    {
      int tmp;
      tmp = low; low = up; up = tmp;
    }
  }
  else
    tok->u.repeat.possessive = 0;

  tok->type = TK_INTERVAL;
  tok->u.repeat.lower = low;
  tok->u.repeat.upper = up;
  *src = p;
  return r; /* 0: normal {n,m}, 2: fixed {n} */

 invalid:
  if (syn_allow) {
    /* *src = p; */ /* !!! Don't do this line !!! */
    return 1; /* OK */
  }
  else
    return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
}
4575
/* \M-, \C-, \c, or \... */
/*
  Decode the escape whose first character (after the escape character)
  is at *src, and store the resulting value in *val.

    \M-x   (x & 0xff) | 0x80      with ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META
    \C-x   x & 0x9f, \C-? = 0177  with ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL
    \cx    x & 0x9f, \c?  = 0177  with ONIG_SYN_OP_ESC_C_CONTROL
    other  conv_backslash_value()

  x may itself be an escape, which is decoded recursively.
  On success *src is advanced past the escape and 0 is returned;
  otherwise an ONIGERR_* code is returned and *src is unchanged.
*/
static int
fetch_escaped_value_raw(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
{
  int v;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;

  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

  PFETCH_S(c);
  switch (c) {
  case 'M':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_META_CODE_SYNTAX;
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c == MC_ESC(env->syntax)) {
        /* nested escape, e.g. \M-\C-x */
        v = fetch_escaped_value_raw(&p, end, env, &c);
        if (v < 0) return v;
      }
      c = ((c & 0xff) | 0x80);
    }
    else
      goto backslash;
    break;

  case 'C':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
      /* the rest is shared with \c */
      goto control;
    }
    else
      goto backslash;

  case 'c':
    if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
    control:
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c == '?') {
        c = 0177;
      }
      else {
        if (c == MC_ESC(env->syntax)) {
          /* nested escape, e.g. \C-\M-x */
          v = fetch_escaped_value_raw(&p, end, env, &c);
          if (v < 0) return v;
        }
        c &= 0x9f;
      }
      break;
    }
    /* fall through */

  default:
    {
    backslash:
      c = conv_backslash_value(c, env);
    }
    break;
  }

  *src = p;
  *val = c;
  return 0;
}
4647
4648 static int
fetch_escaped_value(UChar ** src,UChar * end,ScanEnv * env,OnigCodePoint * val)4649 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
4650 {
4651 int r;
4652 int len;
4653
4654 r = fetch_escaped_value_raw(src, end, env, val);
4655 if (r != 0) return r;
4656
4657 len = ONIGENC_CODE_TO_MBCLEN(env->enc, *val);
4658 if (len < 0) return len;
4659
4660 return 0;
4661 }
4662
/* forward declaration; the definition follows later in this file */
static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env);
4664
/*
  Return the character that closes a name opened by `start`:
  '<' -> '>', '\'' -> '\'', '(' -> ')'.
  Returns 0 for any other opening character.
*/
static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)
{
  if (start == '<')  return (OnigCodePoint )'>';
  if (start == '\'') return (OnigCodePoint )'\'';
  if (start == '(')  return (OnigCodePoint )')';

  return (OnigCodePoint )0;
}
4678
/* How a group reference was written, as reported by fetch_name() and
   fetch_name_with_level(). */
enum REF_NUM {
  IS_NOT_NUM = 0,  /* a name, not a number */
  IS_ABS_NUM = 1,  /* starts with a digit: absolute group number */
  IS_REL_NUM = 2   /* starts with '+' or '-': relative group number */
};
4684
#ifdef USE_BACKREF_WITH_LEVEL
/*
   \k<name+n>, \k<name-n>
   \k<num+n>,  \k<num-n>
   \k<-num+n>, \k<-num-n>
   \k<+num+n>, \k<+num-n>
*/
/*
  Parse a group name or number with an optional "+n" / "-n" level suffix.
  *src points just past the opening delimiter start_code.

  On success:
    *rname_end  end of the name/number part
    *num_type   IS_NOT_NUM, IS_ABS_NUM or IS_REL_NUM
    *rback_num  the signed group number when *num_type != IS_NOT_NUM
    *rlevel     the signed level, only written when a level is present
    *src        advanced past the closing delimiter
  and the return value is 1 when a level is present, 0 otherwise.

  On failure an ONIGERR_* code is returned; except for the "empty name"
  and "number too big" cases the offending text is also recorded with
  onig_scan_env_set_error_string().
*/
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
                      UChar** rname_end, ScanEnv* env,
                      int* rback_num, int* rlevel, enum REF_NUM* num_type)
{
  int r, sign, exist_level;
  int digit_count;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;
  PFETCH_READY;

  *rback_num = 0;
  exist_level = 0;
  *num_type = IS_NOT_NUM;
  sign = 1;
  pnum_head = *src;

  end_code = get_name_end_code_point(start_code);

  digit_count = 0;
  name_end = end;
  r = 0;
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    /* the first character decides between name / absolute / relative */
    PFETCH(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (IS_CODE_DIGIT_ASCII(enc, c)) {
      *num_type = IS_ABS_NUM;
      digit_count++;
    }
    else if (c == '-') {
      *num_type = IS_REL_NUM;
      sign = -1;
      pnum_head = p;  /* digits start after the sign */
    }
    else if (c == '+') {
      *num_type = IS_REL_NUM;
      sign = 1;
      pnum_head = p;  /* digits start after the sign */
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  /* read the rest of the name; '+' or '-' starts the level suffix */
  while (!PEND) {
    name_end = p;
    PFETCH(c);
    if (c == end_code || c == ')' || c == '+' || c == '-') {
      if (*num_type != IS_NOT_NUM && digit_count == 0)
        r = ONIGERR_INVALID_GROUP_NAME;
      break;
    }

    if (*num_type != IS_NOT_NUM) {
      if (IS_CODE_DIGIT_ASCII(enc, c)) {
        digit_count++;
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        *num_type = IS_NOT_NUM;
      }
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0 && c != end_code) {
    if (c == '+' || c == '-') {
      int level;
      int flag = (c == '-' ? -1 : 1);

      if (PEND) {
        r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
        goto end;
      }
      /* the sign must be followed directly by a digit */
      PFETCH(c);
      if (! IS_CODE_DIGIT_ASCII(enc, c)) goto err;
      PUNFETCH;
      level = scan_number(&p, end, enc);
      if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
      *rlevel = (level * flag);
      exist_level = 1;

      if (!PEND) {
        PFETCH(c);
        if (c == end_code)
          goto end;
      }
    }

    /* reached when the closing delimiter is missing */
  err:
    name_end = end;
  err2:
    r = ONIGERR_INVALID_GROUP_NAME;
  }

 end:
  if (r == 0) {
    if (*num_type != IS_NOT_NUM) {
      *rback_num = scan_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) {
        /* a relative reference of 0 is not allowed */
        if (*num_type == IS_REL_NUM)
          goto err2;
      }

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return (exist_level ? 1 : 0);
  }
  else {
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
#endif /* USE_BACKREF_WITH_LEVEL */
4821
/*
  ref: 0 -> define name    (don't allow number name)
       1 -> reference name (allow number name)
*/
/*
  Parse a group name or, when is_ref is TRUE, a group number.
  *src points just past the opening delimiter start_code.

  On success (return value 0):
    *rname_end  end of the name/number
    *num_type   IS_NOT_NUM, IS_ABS_NUM or IS_REL_NUM
    *rback_num  the signed group number when *num_type != IS_NOT_NUM
    *src        advanced past the closing delimiter

  On failure an ONIGERR_* code is returned; except for the "empty name"
  and "number too big" cases the offending text is also recorded with
  onig_scan_env_set_error_string().
*/
static int
fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
           UChar** rname_end, ScanEnv* env, int* rback_num,
           enum REF_NUM* num_type, int is_ref)
{
  int r, sign;
  int digit_count;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;

  *rback_num = 0;

  end_code = get_name_end_code_point(start_code);

  digit_count = 0;
  name_end = end;
  pnum_head = *src;
  r = 0;
  *num_type = IS_NOT_NUM;
  sign = 1;
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    /* the first character decides between name / absolute / relative;
       digits and signs are only acceptable for a reference */
    PFETCH_S(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (IS_CODE_DIGIT_ASCII(enc, c)) {
      if (is_ref == TRUE)
        *num_type = IS_ABS_NUM;
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
      }
      digit_count++;
    }
    else if (c == '-') {
      if (is_ref == TRUE) {
        *num_type = IS_REL_NUM;
        sign = -1;
        pnum_head = p;  /* digits start after the sign */
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
      }
    }
    else if (c == '+') {
      if (is_ref == TRUE) {
        *num_type = IS_REL_NUM;
        sign = 1;
        pnum_head = p;  /* digits start after the sign */
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
      }
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0) {
    /* read up to the closing delimiter, validating each character */
    while (!PEND) {
      name_end = p;
      PFETCH_S(c);
      if (c == end_code || c == ')') {
        if (*num_type != IS_NOT_NUM && digit_count == 0)
          r = ONIGERR_INVALID_GROUP_NAME;
        break;
      }

      if (*num_type != IS_NOT_NUM) {
        if (IS_CODE_DIGIT_ASCII(enc, c)) {
          digit_count++;
        }
        else {
          if (!ONIGENC_IS_CODE_WORD(enc, c))
            r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
          else
            r = ONIGERR_INVALID_GROUP_NAME;

          *num_type = IS_NOT_NUM;
        }
      }
      else {
        if (!ONIGENC_IS_CODE_WORD(enc, c)) {
          r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
        }
      }
    }

    /* the name must end with the matching delimiter; this also replaces
       any error recorded inside the loop */
    if (c != end_code) {
      r = ONIGERR_INVALID_GROUP_NAME;
      goto err;
    }

    if (*num_type != IS_NOT_NUM) {
      *rback_num = scan_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) {
        /* a relative reference of 0 is not allowed */
        if (*num_type == IS_REL_NUM) {
          r = ONIGERR_INVALID_GROUP_NAME;
          goto err;
        }
      }

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return 0;
  }
  else {
    /* the first character was already wrong: only locate the end of the
       name so that it can be quoted in the error string */
    while (!PEND) {
      name_end = p;
      PFETCH_S(c);
      if (c == end_code || c == ')')
        break;
    }
    if (PEND)
      name_end = end;

  err:
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
4958
4959 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)4960 CC_ESC_WARN(ScanEnv* env, UChar *c)
4961 {
4962 if (onig_warn == onig_null_warn) return ;
4963
4964 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
4965 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
4966 UChar buf[WARN_BUFSIZE];
4967 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4968 env->pattern, env->pattern_end,
4969 (UChar* )"character class has '%s' without escape",
4970 c);
4971 (*onig_warn)((char* )buf);
4972 }
4973 }
4974
4975 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)4976 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
4977 {
4978 if (onig_warn == onig_null_warn) return ;
4979
4980 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
4981 UChar buf[WARN_BUFSIZE];
4982 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
4983 (env)->pattern, (env)->pattern_end,
4984 (UChar* )"regular expression has '%s' without escape", c);
4985 (*onig_warn)((char* )buf);
4986 }
4987 }
4988
4989 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)4990 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
4991 UChar **next, OnigEncoding enc)
4992 {
4993 int i;
4994 OnigCodePoint x;
4995 UChar *q;
4996 UChar *p = from;
4997
4998 while (p < to) {
4999 x = ONIGENC_MBC_TO_CODE(enc, p, to);
5000 q = p + enclen(enc, p);
5001 if (x == s[0]) {
5002 for (i = 1; i < n && q < to; i++) {
5003 x = ONIGENC_MBC_TO_CODE(enc, q, to);
5004 if (x != s[i]) break;
5005 q += enclen(enc, q);
5006 }
5007 if (i >= n) {
5008 if (IS_NOT_NULL(next))
5009 *next = q;
5010 return p;
5011 }
5012 }
5013 p = q;
5014 }
5015 return NULL_UCHARP;
5016 }
5017
5018 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)5019 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
5020 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
5021 {
5022 int i, in_esc;
5023 OnigCodePoint x;
5024 UChar *q;
5025 UChar *p = from;
5026
5027 in_esc = 0;
5028 while (p < to) {
5029 if (in_esc) {
5030 in_esc = 0;
5031 p += enclen(enc, p);
5032 }
5033 else {
5034 x = ONIGENC_MBC_TO_CODE(enc, p, to);
5035 q = p + enclen(enc, p);
5036 if (x == s[0]) {
5037 for (i = 1; i < n && q < to; i++) {
5038 x = ONIGENC_MBC_TO_CODE(enc, q, to);
5039 if (x != s[i]) break;
5040 q += enclen(enc, q);
5041 }
5042 if (i >= n) return 1;
5043 p += enclen(enc, p);
5044 }
5045 else {
5046 x = ONIGENC_MBC_TO_CODE(enc, p, to);
5047 if (x == bad) return 0;
5048 else if (x == MC_ESC(syn)) in_esc = 1;
5049 p = q;
5050 }
5051 }
5052 }
5053 return 0;
5054 }
5055
5056 static int
fetch_token_cc(PToken * tok,UChar ** src,UChar * end,ScanEnv * env,int state)5057 fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state)
5058 {
5059 int r;
5060 OnigCodePoint code;
5061 OnigCodePoint c, c2;
5062 OnigSyntaxType* syn = env->syntax;
5063 OnigEncoding enc = env->enc;
5064 UChar* prev;
5065 UChar* p = *src;
5066 PFETCH_READY;
5067
5068 if (tok->code_point_continue != 0) {
5069 r = get_next_code_point(&p, end, tok->base_num, enc, TRUE, &code);
5070 if (r == 1) {
5071 tok->code_point_continue = 0;
5072 }
5073 else if (r == 2) {
5074 tok->type = TK_CC_RANGE;
5075 goto end;
5076 }
5077 else if (r == 0) {
5078 tok->type = TK_CODE_POINT;
5079 tok->u.code = code;
5080 goto end;
5081 }
5082 else
5083 return r; /* error */
5084 }
5085
5086 if (PEND) {
5087 tok->type = TK_EOT;
5088 return tok->type;
5089 }
5090
5091 PFETCH(c);
5092 tok->type = TK_CHAR;
5093 tok->base_num = 0;
5094 tok->u.code = c;
5095 tok->escaped = 0;
5096
5097 if (c == ']') {
5098 tok->type = TK_CC_CLOSE;
5099 }
5100 else if (c == '-') {
5101 tok->type = TK_CC_RANGE;
5102 }
5103 else if (c == MC_ESC(syn)) {
5104 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
5105 goto end;
5106
5107 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
5108
5109 PFETCH(c);
5110 tok->escaped = 1;
5111 tok->u.code = c;
5112 switch (c) {
5113 case 'w':
5114 tok->type = TK_CHAR_TYPE;
5115 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
5116 tok->u.prop.not = 0;
5117 break;
5118 case 'W':
5119 tok->type = TK_CHAR_TYPE;
5120 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
5121 tok->u.prop.not = 1;
5122 break;
5123 case 'd':
5124 tok->type = TK_CHAR_TYPE;
5125 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
5126 tok->u.prop.not = 0;
5127 break;
5128 case 'D':
5129 tok->type = TK_CHAR_TYPE;
5130 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
5131 tok->u.prop.not = 1;
5132 break;
5133 case 's':
5134 tok->type = TK_CHAR_TYPE;
5135 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
5136 tok->u.prop.not = 0;
5137 break;
5138 case 'S':
5139 tok->type = TK_CHAR_TYPE;
5140 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
5141 tok->u.prop.not = 1;
5142 break;
5143 case 'h':
5144 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
5145 tok->type = TK_CHAR_TYPE;
5146 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
5147 tok->u.prop.not = 0;
5148 break;
5149 case 'H':
5150 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
5151 tok->type = TK_CHAR_TYPE;
5152 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
5153 tok->u.prop.not = 1;
5154 break;
5155
5156 case 'p':
5157 case 'P':
5158 if (PEND) break;
5159
5160 c2 = PPEEK;
5161 if (c2 == '{' &&
5162 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
5163 PINC;
5164 tok->type = TK_CHAR_PROPERTY;
5165 tok->u.prop.not = c == 'P';
5166
5167 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
5168 PFETCH(c2);
5169 if (c2 == '^') {
5170 tok->u.prop.not = tok->u.prop.not == 0;
5171 }
5172 else
5173 PUNFETCH;
5174 }
5175 }
5176 break;
5177
5178 case 'o':
5179 if (PEND) break;
5180
5181 prev = p;
5182 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
5183 PINC;
5184 r = scan_octal_number(&p, end, 0, 11, enc, &code);
5185 if (r < 0) return r;
5186 if (!PEND) {
5187 c2 = PPEEK;
5188 if (IS_CODE_DIGIT_ASCII(enc, c2))
5189 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
5190 }
5191
5192 tok->base_num = 8;
5193 goto brace_code_point_entry;
5194 }
5195 break;
5196
5197 case 'x':
5198 if (PEND) break;
5199
5200 prev = p;
5201 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
5202 PINC;
5203 r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code);
5204 if (r < 0) return r;
5205 if (!PEND) {
5206 c2 = PPEEK;
5207 if (IS_CODE_XDIGIT_ASCII(enc, c2))
5208 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
5209 }
5210
5211 tok->base_num = 16;
5212 brace_code_point_entry:
5213 if ((p > prev + enclen(enc, prev))) {
5214 if (PEND) return ONIGERR_INVALID_CODE_POINT_VALUE;
5215 if (PPEEK_IS('}')) {
5216 PINC;
5217 }
5218 else {
5219 int curr_state;
5220
5221 curr_state = (state == CS_RANGE) ? CPS_EMPTY : CPS_START;
5222 r = check_code_point_sequence_cc(p, end, tok->base_num, enc,
5223 curr_state);
5224 if (r < 0) return r;
5225 if (r == 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
5226 tok->code_point_continue = TRUE;
5227 }
5228 tok->type = TK_CODE_POINT;
5229 tok->u.code = code;
5230 }
5231 else {
5232 /* can't read nothing or invalid format */
5233 p = prev;
5234 }
5235 }
5236 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
5237 r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code);
5238 if (r < 0) return r;
5239 if (p == prev) { /* can't read nothing. */
5240 code = 0; /* but, it's not error */
5241 }
5242 tok->type = TK_CRUDE_BYTE;
5243 tok->base_num = 16;
5244 tok->u.byte = (UChar )code;
5245 }
5246 break;
5247
5248 case 'u':
5249 if (PEND) break;
5250
5251 prev = p;
5252 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
5253 r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code);
5254 if (r < 0) return r;
5255 if (p == prev) { /* can't read nothing. */
5256 code = 0; /* but, it's not error */
5257 }
5258 tok->type = TK_CODE_POINT;
5259 tok->base_num = 16;
5260 tok->u.code = code;
5261 }
5262 break;
5263
5264 case '0':
5265 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
5266 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
5267 PUNFETCH;
5268 prev = p;
5269 r = scan_octal_number(&p, end, 0, 3, enc, &code);
5270 if (r < 0) return r;
5271 if (code >= 256) return ONIGERR_TOO_BIG_NUMBER;
5272 if (p == prev) { /* can't read nothing. */
5273 code = 0; /* but, it's not error */
5274 }
5275 tok->type = TK_CRUDE_BYTE;
5276 tok->base_num = 8;
5277 tok->u.byte = (UChar )code;
5278 }
5279 break;
5280
5281 default:
5282 PUNFETCH;
5283 r = fetch_escaped_value(&p, end, env, &c2);
5284 if (r < 0) return r;
5285 if (tok->u.code != c2) {
5286 tok->u.code = c2;
5287 tok->type = TK_CODE_POINT;
5288 }
5289 break;
5290 }
5291 }
5292 else if (c == '[') {
5293 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
5294 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
5295 tok->backp = p; /* point at '[' is read */
5296 PINC;
5297 if (str_exist_check_with_esc(send, 2, p, end,
5298 (OnigCodePoint )']', enc, syn)) {
5299 tok->type = TK_CC_POSIX_BRACKET_OPEN;
5300 }
5301 else {
5302 PUNFETCH;
5303 goto cc_in_cc;
5304 }
5305 }
5306 else {
5307 cc_in_cc:
5308 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
5309 tok->type = TK_CC_OPEN_CC;
5310 }
5311 else {
5312 CC_ESC_WARN(env, (UChar* )"[");
5313 }
5314 }
5315 }
5316 else if (c == '&') {
5317 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
5318 !PEND && (PPEEK_IS('&'))) {
5319 PINC;
5320 tok->type = TK_CC_AND;
5321 }
5322 }
5323
5324 end:
5325 *src = p;
5326 return tok->type;
5327 }
5328
5329 static int
fetch_token(PToken * tok,UChar ** src,UChar * end,ScanEnv * env)5330 fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
5331 {
5332 int r;
5333 OnigCodePoint code;
5334 OnigCodePoint c;
5335 OnigEncoding enc = env->enc;
5336 OnigSyntaxType* syn = env->syntax;
5337 UChar* prev;
5338 UChar* p = *src;
5339 PFETCH_READY;
5340
5341 if (tok->code_point_continue != 0) {
5342 r = get_next_code_point(&p, end, tok->base_num, enc, FALSE, &code);
5343 if (r == 1) {
5344 tok->code_point_continue = 0;
5345 }
5346 else if (r == 0) {
5347 tok->type = TK_CODE_POINT;
5348 tok->u.code = code;
5349 goto out;
5350 }
5351 else
5352 return r; /* error */
5353 }
5354
5355 start:
5356 if (PEND) {
5357 tok->type = TK_EOT;
5358 return tok->type;
5359 }
5360
5361 tok->type = TK_STRING;
5362 tok->base_num = 0;
5363 tok->backp = p;
5364
5365 PFETCH(c);
5366 if (IS_MC_ESC_CODE(c, syn)) {
5367 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
5368
5369 tok->backp = p;
5370 PFETCH(c);
5371
5372 tok->u.code = c;
5373 tok->escaped = 1;
5374 switch (c) {
5375 case '*':
5376 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
5377 tok->type = TK_REPEAT;
5378 tok->u.repeat.lower = 0;
5379 tok->u.repeat.upper = INFINITE_REPEAT;
5380 goto greedy_check;
5381 break;
5382
5383 case '+':
5384 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
5385 tok->type = TK_REPEAT;
5386 tok->u.repeat.lower = 1;
5387 tok->u.repeat.upper = INFINITE_REPEAT;
5388 goto greedy_check;
5389 break;
5390
5391 case '?':
5392 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
5393 tok->type = TK_REPEAT;
5394 tok->u.repeat.lower = 0;
5395 tok->u.repeat.upper = 1;
5396 greedy_check:
5397 tok->u.repeat.possessive = 0;
5398 greedy_check2:
5399 if (!PEND && PPEEK_IS('?') &&
5400 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY) &&
5401 tok->u.repeat.possessive == 0) {
5402 PFETCH(c);
5403 tok->u.repeat.greedy = 0;
5404 tok->u.repeat.possessive = 0;
5405 }
5406 else {
5407 possessive_check:
5408 tok->u.repeat.greedy = 1;
5409 if (!PEND && PPEEK_IS('+') &&
5410 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
5411 tok->type != TK_INTERVAL) ||
5412 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
5413 tok->type == TK_INTERVAL)) &&
5414 tok->u.repeat.possessive == 0) {
5415 PFETCH(c);
5416 tok->u.repeat.possessive = 1;
5417 }
5418 }
5419 break;
5420
5421 case '{':
5422 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
5423 r = fetch_interval(&p, end, tok, env);
5424 if (r < 0) return r; /* error */
5425 if (r == 0) goto greedy_check2;
5426 else if (r == 2) { /* {n} */
5427 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
5428 goto possessive_check;
5429
5430 goto greedy_check2;
5431 }
5432 /* r == 1 : normal char */
5433 break;
5434
5435 case '|':
5436 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
5437 tok->type = TK_ALT;
5438 break;
5439
5440 case '(':
5441 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
5442 tok->type = TK_SUBEXP_OPEN;
5443 break;
5444
5445 case ')':
5446 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
5447 tok->type = TK_SUBEXP_CLOSE;
5448 break;
5449
5450 case 'w':
5451 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
5452 tok->type = TK_CHAR_TYPE;
5453 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
5454 tok->u.prop.not = 0;
5455 break;
5456
5457 case 'W':
5458 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
5459 tok->type = TK_CHAR_TYPE;
5460 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
5461 tok->u.prop.not = 1;
5462 break;
5463
5464 case 'b':
5465 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
5466 tok->type = TK_ANCHOR;
5467 tok->u.anchor = ANCR_WORD_BOUNDARY;
5468 break;
5469
5470 case 'B':
5471 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
5472 tok->type = TK_ANCHOR;
5473 tok->u.anchor = ANCR_NO_WORD_BOUNDARY;
5474 break;
5475
5476 case 'y':
5477 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break;
5478 tok->type = TK_ANCHOR;
5479 tok->u.anchor = ANCR_TEXT_SEGMENT_BOUNDARY;
5480 break;
5481
5482 case 'Y':
5483 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break;
5484 tok->type = TK_ANCHOR;
5485 tok->u.anchor = ANCR_NO_TEXT_SEGMENT_BOUNDARY;
5486 break;
5487
5488 #ifdef USE_WORD_BEGIN_END
5489 case '<':
5490 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
5491 tok->type = TK_ANCHOR;
5492 tok->u.anchor = ANCR_WORD_BEGIN;
5493 break;
5494
5495 case '>':
5496 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
5497 tok->type = TK_ANCHOR;
5498 tok->u.anchor = ANCR_WORD_END;
5499 break;
5500 #endif
5501
5502 case 's':
5503 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
5504 tok->type = TK_CHAR_TYPE;
5505 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
5506 tok->u.prop.not = 0;
5507 break;
5508
5509 case 'S':
5510 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
5511 tok->type = TK_CHAR_TYPE;
5512 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
5513 tok->u.prop.not = 1;
5514 break;
5515
5516 case 'd':
5517 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
5518 tok->type = TK_CHAR_TYPE;
5519 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
5520 tok->u.prop.not = 0;
5521 break;
5522
5523 case 'D':
5524 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
5525 tok->type = TK_CHAR_TYPE;
5526 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
5527 tok->u.prop.not = 1;
5528 break;
5529
5530 case 'h':
5531 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
5532 tok->type = TK_CHAR_TYPE;
5533 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
5534 tok->u.prop.not = 0;
5535 break;
5536
5537 case 'H':
5538 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
5539 tok->type = TK_CHAR_TYPE;
5540 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
5541 tok->u.prop.not = 1;
5542 break;
5543
5544 case 'K':
5545 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) break;
5546 tok->type = TK_KEEP;
5547 break;
5548
5549 case 'R':
5550 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE)) break;
5551 tok->type = TK_GENERAL_NEWLINE;
5552 break;
5553
5554 case 'N':
5555 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT)) break;
5556 tok->type = TK_NO_NEWLINE;
5557 break;
5558
5559 case 'O':
5560 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT)) break;
5561 tok->type = TK_TRUE_ANYCHAR;
5562 break;
5563
5564 case 'X':
5565 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break;
5566 tok->type = TK_TEXT_SEGMENT;
5567 break;
5568
5569 case 'A':
5570 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
5571 begin_buf:
5572 tok->type = TK_ANCHOR;
5573 tok->u.subtype = ANCR_BEGIN_BUF;
5574 break;
5575
5576 case 'Z':
5577 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
5578 tok->type = TK_ANCHOR;
5579 tok->u.subtype = ANCR_SEMI_END_BUF;
5580 break;
5581
5582 case 'z':
5583 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
5584 end_buf:
5585 tok->type = TK_ANCHOR;
5586 tok->u.subtype = ANCR_END_BUF;
5587 break;
5588
5589 case 'G':
5590 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
5591 tok->type = TK_ANCHOR;
5592 tok->u.subtype = ANCR_BEGIN_POSITION;
5593 break;
5594
5595 case '`':
5596 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
5597 goto begin_buf;
5598 break;
5599
5600 case '\'':
5601 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
5602 goto end_buf;
5603 break;
5604
5605 case 'o':
5606 if (PEND) break;
5607
5608 prev = p;
5609 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
5610 PINC;
5611 r = scan_octal_number(&p, end, 0, 11, enc, &code);
5612 if (r < 0) return r;
5613 if (!PEND) {
5614 if (IS_CODE_DIGIT_ASCII(enc, PPEEK))
5615 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
5616 }
5617
5618 tok->base_num = 8;
5619 goto brace_code_point_entry;
5620 }
5621 break;
5622
5623 case 'x':
5624 if (PEND) break;
5625
5626 prev = p;
5627 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
5628 PINC;
5629 r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code);
5630 if (r < 0) return r;
5631 if (!PEND) {
5632 if (IS_CODE_XDIGIT_ASCII(enc, PPEEK))
5633 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
5634 }
5635
5636 tok->base_num = 16;
5637 brace_code_point_entry:
5638 if ((p > prev + enclen(enc, prev))) {
5639 if (PEND) return ONIGERR_INVALID_CODE_POINT_VALUE;
5640 if (PPEEK_IS('}')) {
5641 PINC;
5642 }
5643 else {
5644 r = check_code_point_sequence(p, end, tok->base_num, enc);
5645 if (r < 0) return r;
5646 if (r == 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
5647 tok->code_point_continue = TRUE;
5648 }
5649 tok->type = TK_CODE_POINT;
5650 tok->u.code = code;
5651 }
5652 else {
5653 /* can't read nothing or invalid format */
5654 p = prev;
5655 }
5656 }
5657 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
5658 r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code);
5659 if (r < 0) return r;
5660 if (p == prev) { /* can't read nothing. */
5661 code = 0; /* but, it's not error */
5662 }
5663 tok->type = TK_CRUDE_BYTE;
5664 tok->base_num = 16;
5665 tok->u.byte = (UChar )code;
5666 }
5667 break;
5668
5669 case 'u':
5670 if (PEND) break;
5671
5672 prev = p;
5673 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
5674 r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code);
5675 if (r < 0) return r;
5676 if (p == prev) { /* can't read nothing. */
5677 code = 0; /* but, it's not error */
5678 }
5679 tok->type = TK_CODE_POINT;
5680 tok->base_num = 16;
5681 tok->u.code = code;
5682 }
5683 break;
5684
5685 case '1': case '2': case '3': case '4':
5686 case '5': case '6': case '7': case '8': case '9':
5687 PUNFETCH;
5688 prev = p;
5689 r = scan_number(&p, end, enc);
5690 if (r < 0 || r > ONIG_MAX_BACKREF_NUM) {
5691 goto skip_backref;
5692 }
5693
5694 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
5695 (r <= env->num_mem || r <= 9)) { /* This spec. from GNU regex */
5696 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5697 if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node))
5698 return ONIGERR_INVALID_BACKREF;
5699 }
5700
5701 tok->type = TK_BACKREF;
5702 tok->u.backref.num = 1;
5703 tok->u.backref.ref1 = r;
5704 tok->u.backref.by_name = 0;
5705 #ifdef USE_BACKREF_WITH_LEVEL
5706 tok->u.backref.exist_level = 0;
5707 #endif
5708 break;
5709 }
5710
5711 skip_backref:
5712 if (c == '8' || c == '9') {
5713 /* normal char */
5714 p = prev; PINC;
5715 break;
5716 }
5717
5718 p = prev;
5719 /* fall through */
5720 case '0':
5721 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
5722 prev = p;
5723 r = scan_octal_number(&p, end, 0, (c == '0' ? 2:3), enc, &code);
5724 if (r < 0 || r >= 256) return ONIGERR_TOO_BIG_NUMBER;
5725 if (p == prev) { /* can't read nothing. */
5726 code = 0; /* but, it's not error */
5727 }
5728 tok->type = TK_CRUDE_BYTE;
5729 tok->base_num = 8;
5730 tok->u.byte = (UChar )code;
5731 }
5732 else if (c != '0') {
5733 PINC;
5734 }
5735 break;
5736
5737 case 'k':
5738 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
5739 PFETCH(c);
5740 if (c == '<' || c == '\'') {
5741 UChar* name_end;
5742 int* backs;
5743 int back_num;
5744 enum REF_NUM num_type;
5745
5746 prev = p;
5747
5748 #ifdef USE_BACKREF_WITH_LEVEL
5749 name_end = NULL_UCHARP; /* no need. escape gcc warning. */
5750 r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
5751 env, &back_num, &tok->u.backref.level, &num_type);
5752 if (r == 1) tok->u.backref.exist_level = 1;
5753 else tok->u.backref.exist_level = 0;
5754 #else
5755 r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, TRUE);
5756 #endif
5757 if (r < 0) return r;
5758
5759 if (num_type != IS_NOT_NUM) {
5760 if (num_type == IS_REL_NUM) {
5761 back_num = backref_rel_to_abs(back_num, env);
5762 }
5763 if (back_num <= 0)
5764 return ONIGERR_INVALID_BACKREF;
5765
5766 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5767 if (back_num > env->num_mem ||
5768 IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node))
5769 return ONIGERR_INVALID_BACKREF;
5770 }
5771 tok->type = TK_BACKREF;
5772 tok->u.backref.by_name = 0;
5773 tok->u.backref.num = 1;
5774 tok->u.backref.ref1 = back_num;
5775 }
5776 else {
5777 int num = name_to_group_numbers(env, prev, name_end, &backs);
5778 if (num <= 0) {
5779 return ONIGERR_UNDEFINED_NAME_REFERENCE;
5780 }
5781 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5782 int i;
5783 for (i = 0; i < num; i++) {
5784 if (backs[i] > env->num_mem ||
5785 IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node))
5786 return ONIGERR_INVALID_BACKREF;
5787 }
5788 }
5789
5790 tok->type = TK_BACKREF;
5791 tok->u.backref.by_name = 1;
5792 if (num == 1) {
5793 tok->u.backref.num = 1;
5794 tok->u.backref.ref1 = backs[0];
5795 }
5796 else {
5797 tok->u.backref.num = num;
5798 tok->u.backref.refs = backs;
5799 }
5800 }
5801 }
5802 else
5803 PUNFETCH;
5804 }
5805 break;
5806
5807 #ifdef USE_CALL
5808 case 'g':
5809 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
5810 PFETCH(c);
5811 if (c == '<' || c == '\'') {
5812 int gnum;
5813 UChar* name_end;
5814 enum REF_NUM num_type;
5815
5816 prev = p;
5817 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env,
5818 &gnum, &num_type, TRUE);
5819 if (r < 0) return r;
5820
5821 if (num_type != IS_NOT_NUM) {
5822 if (num_type == IS_REL_NUM) {
5823 gnum = backref_rel_to_abs(gnum, env);
5824 if (gnum < 0) {
5825 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
5826 prev, name_end);
5827 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
5828 }
5829 }
5830 tok->u.call.by_number = 1;
5831 tok->u.call.gnum = gnum;
5832 }
5833 else {
5834 tok->u.call.by_number = 0;
5835 tok->u.call.gnum = 0;
5836 }
5837
5838 tok->type = TK_CALL;
5839 tok->u.call.name = prev;
5840 tok->u.call.name_end = name_end;
5841 }
5842 else
5843 PUNFETCH;
5844 }
5845 break;
5846 #endif
5847
5848 case 'Q':
5849 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
5850 tok->type = TK_QUOTE_OPEN;
5851 }
5852 break;
5853
5854 case 'p':
5855 case 'P':
5856 if (!PEND && PPEEK_IS('{') &&
5857 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
5858 PINC;
5859 tok->type = TK_CHAR_PROPERTY;
5860 tok->u.prop.not = c == 'P';
5861
5862 if (!PEND &&
5863 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
5864 PFETCH(c);
5865 if (c == '^') {
5866 tok->u.prop.not = tok->u.prop.not == 0;
5867 }
5868 else
5869 PUNFETCH;
5870 }
5871 }
5872 break;
5873
5874 default:
5875 {
5876 OnigCodePoint c2;
5877
5878 PUNFETCH;
5879 r = fetch_escaped_value(&p, end, env, &c2);
5880 if (r < 0) return r;
5881 if (tok->u.code != c2) {
5882 tok->type = TK_CODE_POINT;
5883 tok->u.code = c2;
5884 }
5885 else { /* string */
5886 p = tok->backp + enclen(enc, tok->backp);
5887 }
5888 }
5889 break;
5890 }
5891 }
5892 else {
5893 tok->u.code = c;
5894 tok->escaped = 0;
5895
5896 #ifdef USE_VARIABLE_META_CHARS
5897 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
5898 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
5899 if (c == MC_ANYCHAR(syn))
5900 goto any_char;
5901 else if (c == MC_ANYTIME(syn))
5902 goto any_time;
5903 else if (c == MC_ZERO_OR_ONE_TIME(syn))
5904 goto zero_or_one_time;
5905 else if (c == MC_ONE_OR_MORE_TIME(syn))
5906 goto one_or_more_time;
5907 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
5908 tok->type = TK_ANYCHAR_ANYTIME;
5909 goto out;
5910 }
5911 }
5912 #endif
5913
5914 switch (c) {
5915 case '.':
5916 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
5917 #ifdef USE_VARIABLE_META_CHARS
5918 any_char:
5919 #endif
5920 tok->type = TK_ANYCHAR;
5921 break;
5922
5923 case '*':
5924 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
5925 #ifdef USE_VARIABLE_META_CHARS
5926 any_time:
5927 #endif
5928 tok->type = TK_REPEAT;
5929 tok->u.repeat.lower = 0;
5930 tok->u.repeat.upper = INFINITE_REPEAT;
5931 goto greedy_check;
5932 break;
5933
5934 case '+':
5935 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
5936 #ifdef USE_VARIABLE_META_CHARS
5937 one_or_more_time:
5938 #endif
5939 tok->type = TK_REPEAT;
5940 tok->u.repeat.lower = 1;
5941 tok->u.repeat.upper = INFINITE_REPEAT;
5942 goto greedy_check;
5943 break;
5944
5945 case '?':
5946 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
5947 #ifdef USE_VARIABLE_META_CHARS
5948 zero_or_one_time:
5949 #endif
5950 tok->type = TK_REPEAT;
5951 tok->u.repeat.lower = 0;
5952 tok->u.repeat.upper = 1;
5953 goto greedy_check;
5954 break;
5955
5956 case '{':
5957 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
5958 r = fetch_interval(&p, end, tok, env);
5959 if (r < 0) return r; /* error */
5960 if (r == 0) goto greedy_check2;
5961 else if (r == 2) { /* {n} */
5962 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
5963 goto possessive_check;
5964
5965 goto greedy_check2;
5966 }
5967 /* r == 1 : normal char */
5968 break;
5969
5970 case '|':
5971 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
5972 tok->type = TK_ALT;
5973 break;
5974
5975 case '(':
5976 if (!PEND && PPEEK_IS('?') &&
5977 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
5978 PINC;
5979 if (! PEND) {
5980 c = PPEEK;
5981 if (c == '#') {
5982 PFETCH(c);
5983 while (1) {
5984 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
5985 PFETCH(c);
5986 if (c == MC_ESC(syn)) {
5987 if (! PEND) PFETCH(c);
5988 }
5989 else {
5990 if (c == ')') break;
5991 }
5992 }
5993 goto start;
5994 }
5995 else if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_PERL_SUBEXP_CALL)) {
5996 int gnum;
5997 UChar* name;
5998 UChar* name_end;
5999 enum REF_NUM num_type;
6000
6001 switch (c) {
6002 case '&':
6003 {
6004 PINC;
6005 name = p;
6006 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env,
6007 &gnum, &num_type, FALSE);
6008 if (r < 0) return r;
6009
6010 tok->type = TK_CALL;
6011 tok->u.call.by_number = 0;
6012 tok->u.call.gnum = 0;
6013 tok->u.call.name = name;
6014 tok->u.call.name_end = name_end;
6015 }
6016 break;
6017
6018 case 'R':
6019 tok->type = TK_CALL;
6020 tok->u.call.by_number = 1;
6021 tok->u.call.gnum = 0;
6022 tok->u.call.name = p;
6023 PINC;
6024 if (! PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION;
6025 tok->u.call.name_end = p;
6026 break;
6027
6028 case '-':
6029 case '+':
6030 goto lparen_qmark_num;
6031 break;
6032 default:
6033 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto lparen_qmark_end;
6034
6035 lparen_qmark_num:
6036 {
6037 name = p;
6038 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env,
6039 &gnum, &num_type, TRUE);
6040 if (r < 0) return r;
6041
6042 if (num_type == IS_NOT_NUM) {
6043 return ONIGERR_INVALID_GROUP_NAME;
6044 }
6045 else {
6046 if (num_type == IS_REL_NUM) {
6047 gnum = backref_rel_to_abs(gnum, env);
6048 if (gnum < 0) {
6049 onig_scan_env_set_error_string(env,
6050 ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end);
6051 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
6052 }
6053 }
6054 tok->u.call.by_number = 1;
6055 tok->u.call.gnum = gnum;
6056 }
6057
6058 tok->type = TK_CALL;
6059 tok->u.call.name = name;
6060 tok->u.call.name_end = name_end;
6061 }
6062 break;
6063 }
6064 }
6065 }
6066 lparen_qmark_end:
6067 PUNFETCH;
6068 }
6069
6070 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
6071 tok->type = TK_SUBEXP_OPEN;
6072 break;
6073
6074 case ')':
6075 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
6076 tok->type = TK_SUBEXP_CLOSE;
6077 break;
6078
6079 case '^':
6080 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
6081 tok->type = TK_ANCHOR;
6082 tok->u.subtype = (OPTON_SINGLELINE(env->options)
6083 ? ANCR_BEGIN_BUF : ANCR_BEGIN_LINE);
6084 break;
6085
6086 case '$':
6087 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
6088 tok->type = TK_ANCHOR;
6089 tok->u.subtype = (OPTON_SINGLELINE(env->options)
6090 ? ANCR_SEMI_END_BUF : ANCR_END_LINE);
6091 break;
6092
6093 case '[':
6094 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
6095 tok->type = TK_OPEN_CC;
6096 break;
6097
6098 case ']':
6099 if (*src > env->pattern) /* /].../ is allowed. */
6100 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
6101 break;
6102
6103 case '#':
6104 if (OPTON_EXTEND(env->options)) {
6105 while (!PEND) {
6106 PFETCH(c);
6107 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
6108 break;
6109 }
6110 goto start;
6111 break;
6112 }
6113 break;
6114
6115 case ' ': case '\t': case '\n': case '\r': case '\f':
6116 if (OPTON_EXTEND(env->options))
6117 goto start;
6118 break;
6119
6120 default:
6121 /* string */
6122 break;
6123 }
6124 }
6125
6126 out:
6127 *src = p;
6128 return tok->type;
6129 }
6130
6131 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])6132 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
6133 OnigEncoding enc ARG_UNUSED, OnigCodePoint sb_out,
6134 const OnigCodePoint mbr[])
6135 {
6136 int i, r;
6137 OnigCodePoint j;
6138
6139 int n = ONIGENC_CODE_RANGE_NUM(mbr);
6140
6141 if (not == 0) {
6142 for (i = 0; i < n; i++) {
6143 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
6144 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
6145 if (j >= sb_out) {
6146 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
6147 r = add_code_range_to_buf(&(cc->mbuf), j,
6148 ONIGENC_CODE_RANGE_TO(mbr, i));
6149 if (r != 0) return r;
6150 i++;
6151 }
6152
6153 goto sb_end;
6154 }
6155 BITSET_SET_BIT(cc->bs, j);
6156 }
6157 }
6158
6159 sb_end:
6160 for ( ; i < n; i++) {
6161 r = add_code_range_to_buf(&(cc->mbuf),
6162 ONIGENC_CODE_RANGE_FROM(mbr, i),
6163 ONIGENC_CODE_RANGE_TO(mbr, i));
6164 if (r != 0) return r;
6165 }
6166 }
6167 else {
6168 OnigCodePoint prev = 0;
6169
6170 for (i = 0; i < n; i++) {
6171 for (j = prev; j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
6172 if (j >= sb_out) {
6173 goto sb_end2;
6174 }
6175 BITSET_SET_BIT(cc->bs, j);
6176 }
6177 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
6178 }
6179 for (j = prev; j < sb_out; j++) {
6180 BITSET_SET_BIT(cc->bs, j);
6181 }
6182
6183 sb_end2:
6184 prev = sb_out;
6185
6186 for (i = 0; i < n; i++) {
6187 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
6188 r = add_code_range_to_buf(&(cc->mbuf), prev,
6189 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
6190 if (r != 0) return r;
6191 }
6192 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
6193 if (prev == 0) goto end;
6194 }
6195
6196 r = add_code_range_to_buf(&(cc->mbuf), prev, MAX_CODE_POINT);
6197 if (r != 0) return r;
6198 }
6199
6200 end:
6201 return 0;
6202 }
6203
6204 static int
add_ctype_to_cc_by_range_limit(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[],OnigCodePoint limit)6205 add_ctype_to_cc_by_range_limit(CClassNode* cc, int ctype ARG_UNUSED, int not,
6206 OnigEncoding enc ARG_UNUSED,
6207 OnigCodePoint sb_out,
6208 const OnigCodePoint mbr[], OnigCodePoint limit)
6209 {
6210 int i, r;
6211 OnigCodePoint j;
6212 OnigCodePoint from;
6213 OnigCodePoint to;
6214
6215 int n = ONIGENC_CODE_RANGE_NUM(mbr);
6216
6217 if (not == 0) {
6218 for (i = 0; i < n; i++) {
6219 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
6220 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
6221 if (j > limit) goto end;
6222 if (j >= sb_out) {
6223 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
6224 to = ONIGENC_CODE_RANGE_TO(mbr, i);
6225 if (to > limit) to = limit;
6226 r = add_code_range_to_buf(&(cc->mbuf), j, to);
6227 if (r != 0) return r;
6228 i++;
6229 }
6230
6231 goto sb_end;
6232 }
6233 BITSET_SET_BIT(cc->bs, j);
6234 }
6235 }
6236
6237 sb_end:
6238 for ( ; i < n; i++) {
6239 from = ONIGENC_CODE_RANGE_FROM(mbr, i);
6240 to = ONIGENC_CODE_RANGE_TO(mbr, i);
6241 if (from > limit) break;
6242 if (to > limit) to = limit;
6243 r = add_code_range_to_buf(&(cc->mbuf), from, to);
6244 if (r != 0) return r;
6245 }
6246 }
6247 else {
6248 OnigCodePoint prev = 0;
6249
6250 for (i = 0; i < n; i++) {
6251 from = ONIGENC_CODE_RANGE_FROM(mbr, i);
6252 if (from > limit) {
6253 for (j = prev; j < sb_out; j++) {
6254 BITSET_SET_BIT(cc->bs, j);
6255 }
6256 goto sb_end2;
6257 }
6258 for (j = prev; j < from; j++) {
6259 if (j >= sb_out) goto sb_end2;
6260 BITSET_SET_BIT(cc->bs, j);
6261 }
6262 prev = ONIGENC_CODE_RANGE_TO(mbr, i);
6263 if (prev > limit) prev = limit;
6264 prev++;
6265 if (prev == 0) goto end;
6266 }
6267 for (j = prev; j < sb_out; j++) {
6268 BITSET_SET_BIT(cc->bs, j);
6269 }
6270
6271 sb_end2:
6272 prev = sb_out;
6273
6274 for (i = 0; i < n; i++) {
6275 from = ONIGENC_CODE_RANGE_FROM(mbr, i);
6276 if (from > limit) goto last;
6277
6278 if (prev < from) {
6279 r = add_code_range_to_buf(&(cc->mbuf), prev, from - 1);
6280 if (r != 0) return r;
6281 }
6282 prev = ONIGENC_CODE_RANGE_TO(mbr, i);
6283 if (prev > limit) prev = limit;
6284 prev++;
6285 if (prev == 0) goto end;
6286 }
6287
6288 last:
6289 r = add_code_range_to_buf(&(cc->mbuf), prev, MAX_CODE_POINT);
6290 if (r != 0) return r;
6291 }
6292
6293 end:
6294 return 0;
6295 }
6296
6297 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)6298 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
6299 {
6300 int c, r;
6301 int ascii_mode;
6302 int is_single;
6303 const OnigCodePoint *ranges;
6304 OnigCodePoint limit;
6305 OnigCodePoint sb_out;
6306 OnigEncoding enc = env->enc;
6307
6308 ascii_mode = OPTON_IS_ASCII_MODE_CTYPE(ctype, env->options);
6309
6310 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
6311 if (r == 0) {
6312 if (ascii_mode == 0)
6313 r = add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
6314 else
6315 r = add_ctype_to_cc_by_range_limit(cc, ctype, not, env->enc, sb_out,
6316 ranges, ASCII_LIMIT);
6317 return r;
6318 }
6319 else if (r != ONIG_NO_SUPPORT_CONFIG) {
6320 return r;
6321 }
6322
6323 r = 0;
6324 is_single = ONIGENC_IS_SINGLEBYTE(enc);
6325 limit = ascii_mode ? ASCII_LIMIT : SINGLE_BYTE_SIZE;
6326
6327 switch (ctype) {
6328 case ONIGENC_CTYPE_ALPHA:
6329 case ONIGENC_CTYPE_BLANK:
6330 case ONIGENC_CTYPE_CNTRL:
6331 case ONIGENC_CTYPE_DIGIT:
6332 case ONIGENC_CTYPE_LOWER:
6333 case ONIGENC_CTYPE_PUNCT:
6334 case ONIGENC_CTYPE_SPACE:
6335 case ONIGENC_CTYPE_UPPER:
6336 case ONIGENC_CTYPE_XDIGIT:
6337 case ONIGENC_CTYPE_ASCII:
6338 case ONIGENC_CTYPE_ALNUM:
6339 if (not != 0) {
6340 for (c = 0; c < (int )limit; c++) {
6341 if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) {
6342 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
6343 BITSET_SET_BIT(cc->bs, c);
6344 }
6345 }
6346 for (c = limit; c < SINGLE_BYTE_SIZE; c++) {
6347 if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
6348 BITSET_SET_BIT(cc->bs, c);
6349 }
6350
6351 if (is_single == 0)
6352 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
6353 }
6354 else {
6355 for (c = 0; c < (int )limit; c++) {
6356 if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) {
6357 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
6358 BITSET_SET_BIT(cc->bs, c);
6359 }
6360 }
6361 }
6362 break;
6363
6364 case ONIGENC_CTYPE_GRAPH:
6365 case ONIGENC_CTYPE_PRINT:
6366 case ONIGENC_CTYPE_WORD:
6367 if (not != 0) {
6368 for (c = 0; c < (int )limit; c++) {
6369 /* check invalid code point */
6370 if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
6371 && ! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
6372 BITSET_SET_BIT(cc->bs, c);
6373 }
6374 for (c = limit; c < SINGLE_BYTE_SIZE; c++) {
6375 if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
6376 BITSET_SET_BIT(cc->bs, c);
6377 }
6378 if (ascii_mode != 0 && is_single == 0)
6379 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
6380 }
6381 else {
6382 for (c = 0; c < (int )limit; c++) {
6383 if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
6384 && ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
6385 BITSET_SET_BIT(cc->bs, c);
6386 }
6387 if (ascii_mode == 0 && is_single == 0)
6388 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
6389 }
6390 break;
6391
6392 default:
6393 return ONIGERR_PARSER_BUG;
6394 break;
6395 }
6396
6397 return r;
6398 }
6399
6400 static int
prs_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)6401 prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
6402 {
6403 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
6404 #define POSIX_BRACKET_NAME_MIN_LEN 4
6405
6406 static PosixBracketEntryType PBS[] = {
6407 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
6408 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
6409 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
6410 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
6411 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
6412 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
6413 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
6414 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
6415 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
6416 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
6417 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
6418 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
6419 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
6420 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 },
6421 { (UChar* )NULL, -1, 0 }
6422 };
6423
6424 PosixBracketEntryType *pb;
6425 int not, i, r;
6426 OnigCodePoint c;
6427 OnigEncoding enc = env->enc;
6428 UChar *p = *src;
6429
6430 if (PPEEK_IS('^')) {
6431 PINC_S;
6432 not = 1;
6433 }
6434 else
6435 not = 0;
6436
6437 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
6438 goto not_posix_bracket;
6439
6440 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
6441 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
6442 p = (UChar* )onigenc_step(enc, p, end, pb->len);
6443 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
6444 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
6445
6446 r = add_ctype_to_cc(cc, pb->ctype, not, env);
6447 if (r != 0) return r;
6448
6449 PINC_S; PINC_S;
6450 *src = p;
6451 return 0;
6452 }
6453 }
6454
6455 not_posix_bracket:
6456 c = 0;
6457 i = 0;
6458 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
6459 PINC_S;
6460 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
6461 }
6462 if (c == ':' && ! PEND) {
6463 PINC_S;
6464 if (! PEND) {
6465 PFETCH_S(c);
6466 if (c == ']')
6467 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
6468 }
6469 }
6470
6471 return 1; /* 1: is not POSIX bracket, but no error. */
6472 }
6473
6474 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)6475 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
6476 {
6477 int r;
6478 OnigCodePoint c;
6479 OnigEncoding enc;
6480 UChar *prev, *start, *p;
6481
6482 p = *src;
6483 enc = env->enc;
6484 r = ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
6485 start = prev = p;
6486
6487 while (!PEND) {
6488 prev = p;
6489 PFETCH_S(c);
6490 if (c == '}') {
6491 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
6492 if (r >= 0) {
6493 *src = p;
6494 }
6495 else {
6496 onig_scan_env_set_error_string(env, r, *src, prev);
6497 }
6498
6499 return r;
6500 }
6501 else if (c == '(' || c == ')' || c == '{' || c == '|') {
6502 break;
6503 }
6504 }
6505
6506 return r;
6507 }
6508
6509 static int
prs_char_property(Node ** np,PToken * tok,UChar ** src,UChar * end,ScanEnv * env)6510 prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)
6511 {
6512 int r, ctype;
6513 CClassNode* cc;
6514
6515 ctype = fetch_char_property_to_ctype(src, end, env);
6516 if (ctype < 0) return ctype;
6517
6518 *np = node_new_cclass();
6519 CHECK_NULL_RETURN_MEMERR(*np);
6520 cc = CCLASS_(*np);
6521 r = add_ctype_to_cc(cc, ctype, FALSE, env);
6522 if (r != 0) return r;
6523 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
6524
6525 return 0;
6526 }
6527
6528
6529 static int
cc_cprop_next(CClassNode * cc,OnigCodePoint * pcode,CVAL * val,CSTATE * state,ScanEnv * env)6530 cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state,
6531 ScanEnv* env)
6532 {
6533 int r;
6534
6535 if (*state == CS_RANGE)
6536 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
6537
6538 if (*state == CS_VALUE) {
6539 if (*val == CV_SB)
6540 BITSET_SET_BIT(cc->bs, (int )(*pcode));
6541 else if (*val == CV_MB) {
6542 r = add_code_range(&(cc->mbuf), env, *pcode, *pcode);
6543 if (r < 0) return r;
6544 }
6545 }
6546
6547 *state = CS_VALUE;
6548 *val = CV_CPROP;
6549 return 0;
6550 }
6551
6552 static int
cc_char_next(CClassNode * cc,OnigCodePoint * from,OnigCodePoint to,int * from_raw,int to_raw,CVAL intype,CVAL * type,CSTATE * state,ScanEnv * env)6553 cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to,
6554 int* from_raw, int to_raw, CVAL intype, CVAL* type,
6555 CSTATE* state, ScanEnv* env)
6556 {
6557 int r;
6558
6559 switch (*state) {
6560 case CS_VALUE:
6561 if (*type == CV_SB) {
6562 if (*from > 0xff)
6563 return ONIGERR_INVALID_CODE_POINT_VALUE;
6564
6565 BITSET_SET_BIT(cc->bs, (int )(*from));
6566 }
6567 else if (*type == CV_MB) {
6568 r = add_code_range(&(cc->mbuf), env, *from, *from);
6569 if (r < 0) return r;
6570 }
6571 break;
6572
6573 case CS_RANGE:
6574 if (intype == *type) {
6575 if (intype == CV_SB) {
6576 if (*from > 0xff || to > 0xff)
6577 return ONIGERR_INVALID_CODE_POINT_VALUE;
6578
6579 if (*from > to) {
6580 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
6581 goto ccs_range_end;
6582 else
6583 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
6584 }
6585 bitset_set_range(cc->bs, (int )*from, (int )to);
6586 }
6587 else {
6588 r = add_code_range(&(cc->mbuf), env, *from, to);
6589 if (r < 0) return r;
6590 }
6591 }
6592 else {
6593 if (*from > to) {
6594 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
6595 goto ccs_range_end;
6596 else
6597 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
6598 }
6599 bitset_set_range(cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff));
6600 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*from, to);
6601 if (r < 0) return r;
6602 }
6603 ccs_range_end:
6604 *state = CS_COMPLETE;
6605 break;
6606
6607 case CS_COMPLETE:
6608 case CS_START:
6609 *state = CS_VALUE;
6610 break;
6611
6612 default:
6613 break;
6614 }
6615
6616 *from_raw = to_raw;
6617 *from = to;
6618 *type = intype;
6619 return 0;
6620 }
6621
6622 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)6623 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
6624 ScanEnv* env)
6625 {
6626 int in_esc;
6627 OnigCodePoint code;
6628 OnigEncoding enc = env->enc;
6629 UChar* p = from;
6630
6631 in_esc = 0;
6632 while (! PEND) {
6633 if (ignore_escaped && in_esc) {
6634 in_esc = 0;
6635 }
6636 else {
6637 PFETCH_S(code);
6638 if (code == c) return 1;
6639 if (code == MC_ESC(env->syntax)) in_esc = 1;
6640 }
6641 }
6642 return 0;
6643 }
6644
6645 static int
prs_cc(Node ** np,PToken * tok,UChar ** src,UChar * end,ScanEnv * env)6646 prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)
6647 {
6648 int r, neg, len, fetched, and_start;
6649 OnigCodePoint in_code, curr_code;
6650 UChar *p;
6651 Node* node;
6652 CClassNode *cc, *prev_cc;
6653 CClassNode work_cc;
6654 int curr_raw, in_raw;
6655 CSTATE state;
6656 CVAL in_type;
6657 CVAL curr_type;
6658
6659 *np = NULL_NODE;
6660 INC_PARSE_DEPTH(env->parse_depth);
6661
6662 state = CS_START;
6663 prev_cc = (CClassNode* )NULL;
6664 r = fetch_token_cc(tok, src, end, env, state);
6665 if (r == TK_CHAR && tok->u.code == (OnigCodePoint )'^' && tok->escaped == 0) {
6666 neg = 1;
6667 r = fetch_token_cc(tok, src, end, env, state);
6668 }
6669 else {
6670 neg = 0;
6671 }
6672
6673 if (r < 0) return r;
6674 if (r == TK_CC_CLOSE) {
6675 if (! code_exist_check((OnigCodePoint )']',
6676 *src, env->pattern_end, 1, env))
6677 return ONIGERR_EMPTY_CHAR_CLASS;
6678
6679 CC_ESC_WARN(env, (UChar* )"]");
6680 r = tok->type = TK_CHAR; /* allow []...] */
6681 }
6682
6683 *np = node = node_new_cclass();
6684 CHECK_NULL_RETURN_MEMERR(node);
6685 cc = CCLASS_(node);
6686
6687 and_start = 0;
6688 curr_type = CV_UNDEF;
6689
6690 p = *src;
6691 while (r != TK_CC_CLOSE) {
6692 fetched = 0;
6693 switch (r) {
6694 case TK_CHAR:
6695 any_char_in:
6696 len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code);
6697 if (len < 0) {
6698 r = len;
6699 goto err;
6700 }
6701 in_type = (len == 1) ? CV_SB : CV_MB;
6702 in_code = tok->u.code;
6703 in_raw = 0;
6704 goto val_entry2;
6705 break;
6706
6707 case TK_CRUDE_BYTE:
6708 /* tok->base_num != 0 : octal or hexadec. */
6709 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base_num != 0) {
6710 int i, j;
6711 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
6712 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
6713 UChar* psave = p;
6714 int base_num = tok->base_num;
6715
6716 buf[0] = tok->u.byte;
6717 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
6718 r = fetch_token_cc(tok, &p, end, env, CS_COMPLETE);
6719 if (r < 0) goto err;
6720 if (r != TK_CRUDE_BYTE || tok->base_num != base_num) {
6721 fetched = 1;
6722 break;
6723 }
6724 buf[i] = tok->u.byte;
6725 }
6726
6727 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
6728 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
6729 goto err;
6730 }
6731
6732 /* clear buf tail */
6733 for (j = i; j < ONIGENC_CODE_TO_MBC_MAXLEN; j++) buf[j] = '\0';
6734
6735 len = enclen(env->enc, buf);
6736 if (i < len) {
6737 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
6738 goto err;
6739 }
6740 else if (i > len) { /* fetch back */
6741 p = psave;
6742 for (i = 1; i < len; i++) {
6743 r = fetch_token_cc(tok, &p, end, env, CS_COMPLETE);
6744 if (r < 0) goto err;
6745 }
6746 fetched = 0;
6747 }
6748
6749 if (i == 1) {
6750 in_code = (OnigCodePoint )buf[0];
6751 goto crude_single;
6752 }
6753 else {
6754 in_code = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
6755 in_type = CV_MB;
6756 }
6757 }
6758 else {
6759 in_code = (OnigCodePoint )tok->u.byte;
6760 crude_single:
6761 in_type = CV_SB;
6762 }
6763 in_raw = 1;
6764 goto val_entry2;
6765 break;
6766
6767 case TK_CODE_POINT:
6768 in_code = tok->u.code;
6769 in_raw = 1;
6770 val_entry:
6771 len = ONIGENC_CODE_TO_MBCLEN(env->enc, in_code);
6772 if (len < 0) {
6773 if (state != CS_RANGE ||
6774 ! IS_SYNTAX_BV(env->syntax,
6775 ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) ||
6776 in_code < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) {
6777 r = len;
6778 goto err;
6779 }
6780 }
6781 in_type = (len == 1 ? CV_SB : CV_MB);
6782 val_entry2:
6783 r = cc_char_next(cc, &curr_code, in_code, &curr_raw, in_raw, in_type,
6784 &curr_type, &state, env);
6785 if (r != 0) goto err;
6786 break;
6787
6788 case TK_CC_POSIX_BRACKET_OPEN:
6789 r = prs_posix_bracket(cc, &p, end, env);
6790 if (r < 0) goto err;
6791 if (r == 1) { /* is not POSIX bracket */
6792 CC_ESC_WARN(env, (UChar* )"[");
6793 p = tok->backp;
6794 in_code = tok->u.code;
6795 in_raw = 0;
6796 goto val_entry;
6797 }
6798 goto next_cprop;
6799 break;
6800
6801 case TK_CHAR_TYPE:
6802 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
6803 if (r != 0) goto err;
6804
6805 next_cprop:
6806 r = cc_cprop_next(cc, &curr_code, &curr_type, &state, env);
6807 if (r != 0) goto err;
6808 break;
6809
6810 case TK_CHAR_PROPERTY:
6811 {
6812 int ctype = fetch_char_property_to_ctype(&p, end, env);
6813 if (ctype < 0) {
6814 r = ctype;
6815 goto err;
6816 }
6817 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
6818 if (r != 0) goto err;
6819 goto next_cprop;
6820 }
6821 break;
6822
6823 case TK_CC_RANGE:
6824 if (state == CS_VALUE) {
6825 r = fetch_token_cc(tok, &p, end, env, CS_RANGE);
6826 if (r < 0) goto err;
6827
6828 fetched = 1;
6829 if (r == TK_CC_CLOSE) { /* allow [x-] */
6830 range_end_val:
6831 in_code = (OnigCodePoint )'-';
6832 in_raw = 0;
6833 goto val_entry;
6834 }
6835 else if (r == TK_CC_AND) {
6836 CC_ESC_WARN(env, (UChar* )"-");
6837 goto range_end_val;
6838 }
6839
6840 if (curr_type == CV_CPROP) {
6841 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
6842 goto err;
6843 }
6844
6845 state = CS_RANGE;
6846 }
6847 else if (state == CS_START) {
6848 /* [-xa] is allowed */
6849 in_code = tok->u.code;
6850 in_raw = 0;
6851
6852 r = fetch_token_cc(tok, &p, end, env, CS_VALUE);
6853 if (r < 0) goto err;
6854
6855 fetched = 1;
6856 /* [--x] or [a&&-x] is warned. */
6857 if (r == TK_CC_RANGE || and_start != 0)
6858 CC_ESC_WARN(env, (UChar* )"-");
6859
6860 goto val_entry;
6861 }
6862 else if (state == CS_RANGE) {
6863 CC_ESC_WARN(env, (UChar* )"-");
6864 goto any_char_in; /* [!--] is allowed */
6865 }
6866 else { /* CS_COMPLETE */
6867 r = fetch_token_cc(tok, &p, end, env, CS_VALUE);
6868 if (r < 0) goto err;
6869
6870 fetched = 1;
6871 if (r == TK_CC_CLOSE)
6872 goto range_end_val; /* allow [a-b-] */
6873 else if (r == TK_CC_AND) {
6874 CC_ESC_WARN(env, (UChar* )"-");
6875 goto range_end_val;
6876 }
6877
6878 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
6879 CC_ESC_WARN(env, (UChar* )"-");
6880 goto range_end_val; /* [0-9-a] is allowed as [0-9\-a] */
6881 }
6882 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
6883 goto err;
6884 }
6885 break;
6886
6887 case TK_CC_OPEN_CC: /* [ */
6888 {
6889 Node *anode;
6890 CClassNode* acc;
6891
6892 if (state == CS_VALUE) {
6893 r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type,
6894 &state, env);
6895 if (r != 0) goto err;
6896 }
6897 state = CS_COMPLETE;
6898
6899 r = prs_cc(&anode, tok, &p, end, env);
6900 if (r != 0) {
6901 onig_node_free(anode);
6902 goto cc_open_err;
6903 }
6904 acc = CCLASS_(anode);
6905 r = or_cclass(cc, acc, env->enc);
6906 onig_node_free(anode);
6907
6908 cc_open_err:
6909 if (r != 0) goto err;
6910 }
6911 break;
6912
6913 case TK_CC_AND: /* && */
6914 {
6915 if (state == CS_VALUE) {
6916 r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type,
6917 &state, env);
6918 if (r != 0) goto err;
6919 }
6920 /* initialize local variables */
6921 and_start = 1;
6922 state = CS_START;
6923
6924 if (IS_NOT_NULL(prev_cc)) {
6925 r = and_cclass(prev_cc, cc, env->enc);
6926 if (r != 0) goto err;
6927 bbuf_free(cc->mbuf);
6928 }
6929 else {
6930 prev_cc = cc;
6931 cc = &work_cc;
6932 }
6933 initialize_cclass(cc);
6934 }
6935 break;
6936
6937 case TK_EOT:
6938 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
6939 goto err;
6940 break;
6941 default:
6942 r = ONIGERR_PARSER_BUG;
6943 goto err;
6944 break;
6945 }
6946
6947 if (fetched)
6948 r = tok->type;
6949 else {
6950 r = fetch_token_cc(tok, &p, end, env, state);
6951 if (r < 0) goto err;
6952 }
6953 }
6954
6955 if (state == CS_VALUE) {
6956 r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type,
6957 &state, env);
6958 if (r != 0) goto err;
6959 }
6960
6961 if (IS_NOT_NULL(prev_cc)) {
6962 r = and_cclass(prev_cc, cc, env->enc);
6963 if (r != 0) goto err;
6964 bbuf_free(cc->mbuf);
6965 cc = prev_cc;
6966 }
6967
6968 if (neg != 0)
6969 NCCLASS_SET_NOT(cc);
6970 else
6971 NCCLASS_CLEAR_NOT(cc);
6972 if (IS_NCCLASS_NOT(cc) &&
6973 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
6974 int is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
6975 if (is_empty != 0)
6976 BITSET_IS_EMPTY(cc->bs, is_empty);
6977
6978 if (is_empty == 0) {
6979 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
6980 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
6981 BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
6982 else
6983 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
6984 }
6985 }
6986 }
6987 *src = p;
6988 DEC_PARSE_DEPTH(env->parse_depth);
6989 return 0;
6990
6991 err:
6992 if (cc != CCLASS_(*np))
6993 bbuf_free(cc->mbuf);
6994 return r;
6995 }
6996
6997 static int prs_alts(Node** top, PToken* tok, int term,
6998 UChar** src, UChar* end, ScanEnv* env, int group_head);
6999
7000 #ifdef USE_CALLOUT
7001
7002 /* (?{...}[tag][+-]) (?{{...}}[tag][+-]) */
7003 static int
prs_callout_of_contents(Node ** np,int cterm,UChar ** src,UChar * end,ScanEnv * env)7004 prs_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env)
7005 {
7006 int r;
7007 int i;
7008 int in;
7009 int num;
7010 OnigCodePoint c;
7011 UChar* code_start;
7012 UChar* code_end;
7013 UChar* contents;
7014 UChar* tag_start;
7015 UChar* tag_end;
7016 int brace_nest;
7017 CalloutListEntry* e;
7018 RegexExt* ext;
7019 OnigEncoding enc = env->enc;
7020 UChar* p = *src;
7021
7022 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7023
7024 brace_nest = 0;
7025 while (PPEEK_IS('{')) {
7026 brace_nest++;
7027 PINC_S;
7028 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7029 }
7030
7031 in = ONIG_CALLOUT_IN_PROGRESS;
7032 code_start = p;
7033 while (1) {
7034 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7035
7036 code_end = p;
7037 PFETCH_S(c);
7038 if (c == '}') {
7039 i = brace_nest;
7040 while (i > 0) {
7041 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7042 PFETCH_S(c);
7043 if (c == '}') i--;
7044 else break;
7045 }
7046 if (i == 0) break;
7047 }
7048 }
7049
7050 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7051
7052 PFETCH_S(c);
7053 if (c == '[') {
7054 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7055 tag_end = tag_start = p;
7056 while (! PEND) {
7057 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7058 tag_end = p;
7059 PFETCH_S(c);
7060 if (c == ']') break;
7061 }
7062 if (! is_allowed_callout_tag_name(enc, tag_start, tag_end))
7063 return ONIGERR_INVALID_CALLOUT_TAG_NAME;
7064
7065 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7066 PFETCH_S(c);
7067 }
7068 else {
7069 tag_start = tag_end = 0;
7070 }
7071
7072 if (c == 'X') {
7073 in |= ONIG_CALLOUT_IN_RETRACTION;
7074 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7075 PFETCH_S(c);
7076 }
7077 else if (c == '<') {
7078 in = ONIG_CALLOUT_IN_RETRACTION;
7079 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7080 PFETCH_S(c);
7081 }
7082 else if (c == '>') { /* no needs (default) */
7083 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7084 PFETCH_S(c);
7085 }
7086
7087 if (c != cterm)
7088 return ONIGERR_INVALID_CALLOUT_PATTERN;
7089
7090 r = reg_callout_list_entry(env, &num);
7091 if (r != 0) return r;
7092
7093 ext = onig_get_regex_ext(env->reg);
7094 CHECK_NULL_RETURN_MEMERR(ext);
7095 if (IS_NULL(ext->pattern)) {
7096 r = onig_ext_set_pattern(env->reg, env->pattern, env->pattern_end);
7097 if (r != ONIG_NORMAL) return r;
7098 }
7099
7100 if (tag_start != tag_end) {
7101 r = callout_tag_entry(env, env->reg, tag_start, tag_end, num);
7102 if (r != ONIG_NORMAL) return r;
7103 }
7104
7105 contents = onigenc_strdup(enc, code_start, code_end);
7106 CHECK_NULL_RETURN_MEMERR(contents);
7107
7108 e = onig_reg_callout_list_at(env->reg, num);
7109 if (IS_NULL(e)) {
7110 xfree(contents);
7111 return ONIGERR_MEMORY;
7112 }
7113
7114 r = node_new_callout(np, ONIG_CALLOUT_OF_CONTENTS, num, ONIG_NON_NAME_ID, env);
7115 if (r != 0) {
7116 xfree(contents);
7117 return r;
7118 }
7119
7120 e->of = ONIG_CALLOUT_OF_CONTENTS;
7121 e->in = in;
7122 e->name_id = ONIG_NON_NAME_ID;
7123 e->u.content.start = contents;
7124 e->u.content.end = contents + (code_end - code_start);
7125
7126 *src = p;
7127 return 0;
7128 }
7129
7130 static long
prs_long(OnigEncoding enc,UChar * s,UChar * end,int sign_on,long max,long * rl)7131 prs_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* rl)
7132 {
7133 long v;
7134 long d;
7135 int flag;
7136 UChar* p;
7137 OnigCodePoint c;
7138
7139 if (s >= end) return ONIGERR_INVALID_CALLOUT_ARG;
7140
7141 flag = 1;
7142 v = 0;
7143 p = s;
7144 while (p < end) {
7145 c = ONIGENC_MBC_TO_CODE(enc, p, end);
7146 p += ONIGENC_MBC_ENC_LEN(enc, p);
7147 if (c >= '0' && c <= '9') {
7148 d = (long )(c - '0');
7149 if (v > (max - d) / 10)
7150 return ONIGERR_INVALID_CALLOUT_ARG;
7151
7152 v = v * 10 + d;
7153 }
7154 else if (sign_on != 0 && (c == '-' || c == '+')) {
7155 if (c == '-') flag = -1;
7156 }
7157 else
7158 return ONIGERR_INVALID_CALLOUT_ARG;
7159
7160 sign_on = 0;
7161 }
7162
7163 *rl = flag * v;
7164 return ONIG_NORMAL;
7165 }
7166
7167 static void
clear_callout_args(int n,unsigned int types[],OnigValue vals[])7168 clear_callout_args(int n, unsigned int types[], OnigValue vals[])
7169 {
7170 int i;
7171
7172 for (i = 0; i < n; i++) {
7173 switch (types[i]) {
7174 case ONIG_TYPE_STRING:
7175 if (IS_NOT_NULL(vals[i].s.start))
7176 xfree(vals[i].s.start);
7177 break;
7178 default:
7179 break;
7180 }
7181 }
7182 }
7183
7184 static int
prs_callout_args(int skip_mode,int cterm,UChar ** src,UChar * end,int max_arg_num,unsigned int types[],OnigValue vals[],ScanEnv * env)7185 prs_callout_args(int skip_mode, int cterm, UChar** src, UChar* end,
7186 int max_arg_num, unsigned int types[], OnigValue vals[],
7187 ScanEnv* env)
7188 {
7189 #define MAX_CALLOUT_ARG_BYTE_LENGTH 128
7190
7191 int r;
7192 int n;
7193 int esc;
7194 int cn;
7195 UChar* s;
7196 UChar* e;
7197 UChar* eesc;
7198 OnigCodePoint c;
7199 UChar* bufend;
7200 UChar buf[MAX_CALLOUT_ARG_BYTE_LENGTH];
7201 OnigEncoding enc = env->enc;
7202 UChar* p = *src;
7203
7204 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7205
7206 c = 0;
7207 n = 0;
7208 while (n < ONIG_CALLOUT_MAX_ARGS_NUM) {
7209 cn = 0;
7210 esc = 0;
7211 eesc = 0;
7212 bufend = buf;
7213 s = e = p;
7214 while (1) {
7215 if (PEND) {
7216 r = ONIGERR_INVALID_CALLOUT_PATTERN;
7217 goto err_clear;
7218 }
7219
7220 e = p;
7221 PFETCH_S(c);
7222 if (esc != 0) {
7223 esc = 0;
7224 if (c == '\\' || c == cterm || c == ',') {
7225 /* */
7226 }
7227 else {
7228 e = eesc;
7229 cn++;
7230 }
7231 goto add_char;
7232 }
7233 else {
7234 if (c == '\\') {
7235 esc = 1;
7236 eesc = e;
7237 }
7238 else if (c == cterm || c == ',')
7239 break;
7240 else {
7241 size_t clen;
7242
7243 add_char:
7244 if (skip_mode == FALSE) {
7245 clen = p - e;
7246 if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH) {
7247 r = ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */
7248 goto err_clear;
7249 }
7250
7251 xmemcpy(bufend, e, clen);
7252 bufend += clen;
7253 }
7254 cn++;
7255 }
7256 }
7257 }
7258
7259 if (cn != 0) {
7260 if (max_arg_num >= 0 && n >= max_arg_num) {
7261 r = ONIGERR_INVALID_CALLOUT_ARG;
7262 goto err_clear;
7263 }
7264
7265 if (skip_mode == FALSE) {
7266 if ((types[n] & ONIG_TYPE_LONG) != 0) {
7267 int fixed = 0;
7268 if (cn > 0) {
7269 long rl;
7270 r = prs_long(enc, buf, bufend, 1, LONG_MAX, &rl);
7271 if (r == ONIG_NORMAL) {
7272 vals[n].l = rl;
7273 fixed = 1;
7274 types[n] = ONIG_TYPE_LONG;
7275 }
7276 }
7277
7278 if (fixed == 0) {
7279 types[n] = (types[n] & ~ONIG_TYPE_LONG);
7280 if (types[n] == ONIG_TYPE_VOID) {
7281 r = ONIGERR_INVALID_CALLOUT_ARG;
7282 goto err_clear;
7283 }
7284 }
7285 }
7286
7287 switch (types[n]) {
7288 case ONIG_TYPE_LONG:
7289 break;
7290
7291 case ONIG_TYPE_CHAR:
7292 if (cn != 1) {
7293 r = ONIGERR_INVALID_CALLOUT_ARG;
7294 goto err_clear;
7295 }
7296 vals[n].c = ONIGENC_MBC_TO_CODE(enc, buf, bufend);
7297 break;
7298
7299 case ONIG_TYPE_STRING:
7300 {
7301 UChar* rs = onigenc_strdup(enc, buf, bufend);
7302 if (IS_NULL(rs)) {
7303 r = ONIGERR_MEMORY; goto err_clear;
7304 }
7305 vals[n].s.start = rs;
7306 vals[n].s.end = rs + (e - s);
7307 }
7308 break;
7309
7310 case ONIG_TYPE_TAG:
7311 if (eesc != 0 || ! is_allowed_callout_tag_name(enc, s, e)) {
7312 r = ONIGERR_INVALID_CALLOUT_TAG_NAME;
7313 goto err_clear;
7314 }
7315
7316 vals[n].s.start = s;
7317 vals[n].s.end = e;
7318 break;
7319
7320 case ONIG_TYPE_VOID:
7321 case ONIG_TYPE_POINTER:
7322 r = ONIGERR_PARSER_BUG;
7323 goto err_clear;
7324 break;
7325 }
7326 }
7327
7328 n++;
7329 }
7330
7331 if (c == cterm) break;
7332 }
7333
7334 if (c != cterm) {
7335 r = ONIGERR_INVALID_CALLOUT_PATTERN;
7336 goto err_clear;
7337 }
7338
7339 *src = p;
7340 return n;
7341
7342 err_clear:
7343 if (skip_mode == FALSE)
7344 clear_callout_args(n, types, vals);
7345 return r;
7346 }
7347
7348 /* (*name[TAG]) (*name[TAG]{a,b,..}) */
7349 static int
prs_callout_of_name(Node ** np,int cterm,UChar ** src,UChar * end,ScanEnv * env)7350 prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env)
7351 {
7352 int r;
7353 int i;
7354 int in;
7355 int num;
7356 int name_id;
7357 int arg_num;
7358 int max_arg_num;
7359 int opt_arg_num;
7360 int is_not_single;
7361 OnigCodePoint c;
7362 UChar* name_start;
7363 UChar* name_end;
7364 UChar* tag_start;
7365 UChar* tag_end;
7366 Node* node;
7367 CalloutListEntry* e;
7368 RegexExt* ext;
7369 unsigned int types[ONIG_CALLOUT_MAX_ARGS_NUM];
7370 OnigValue vals[ONIG_CALLOUT_MAX_ARGS_NUM];
7371 OnigEncoding enc = env->enc;
7372 UChar* p = *src;
7373
7374 /* PFETCH_READY; */
7375 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN;
7376
7377 node = 0;
7378 name_start = p;
7379 while (1) {
7380 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7381 name_end = p;
7382 PFETCH_S(c);
7383 if (c == cterm || c == '[' || c == '{') break;
7384 }
7385
7386 if (! is_allowed_callout_name(enc, name_start, name_end))
7387 return ONIGERR_INVALID_CALLOUT_NAME;
7388
7389 if (c == '[') {
7390 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7391 tag_end = tag_start = p;
7392 while (! PEND) {
7393 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7394 tag_end = p;
7395 PFETCH_S(c);
7396 if (c == ']') break;
7397 }
7398 if (! is_allowed_callout_tag_name(enc, tag_start, tag_end))
7399 return ONIGERR_INVALID_CALLOUT_TAG_NAME;
7400
7401 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7402 PFETCH_S(c);
7403 }
7404 else {
7405 tag_start = tag_end = 0;
7406 }
7407
7408 if (c == '{') {
7409 UChar* save;
7410
7411 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7412
7413 /* read for single check only */
7414 save = p;
7415 arg_num = prs_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env);
7416 if (arg_num < 0) return arg_num;
7417
7418 is_not_single = PPEEK_IS(cterm) ? 0 : 1;
7419 p = save;
7420 r = get_callout_name_id_by_name(enc, is_not_single, name_start, name_end,
7421 &name_id);
7422 if (r != ONIG_NORMAL) return r;
7423
7424 max_arg_num = get_callout_arg_num_by_name_id(name_id);
7425 for (i = 0; i < max_arg_num; i++) {
7426 types[i] = get_callout_arg_type_by_name_id(name_id, i);
7427 }
7428
7429 arg_num = prs_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env);
7430 if (arg_num < 0) return arg_num;
7431
7432 if (PEND) {
7433 r = ONIGERR_END_PATTERN_IN_GROUP;
7434 goto err_clear;
7435 }
7436 PFETCH_S(c);
7437 }
7438 else {
7439 arg_num = 0;
7440
7441 is_not_single = 0;
7442 r = get_callout_name_id_by_name(enc, is_not_single, name_start, name_end,
7443 &name_id);
7444 if (r != ONIG_NORMAL) return r;
7445
7446 max_arg_num = get_callout_arg_num_by_name_id(name_id);
7447 for (i = 0; i < max_arg_num; i++) {
7448 types[i] = get_callout_arg_type_by_name_id(name_id, i);
7449 }
7450 }
7451
7452 in = onig_get_callout_in_by_name_id(name_id);
7453 opt_arg_num = get_callout_opt_arg_num_by_name_id(name_id);
7454 if (arg_num > max_arg_num || arg_num < (max_arg_num - opt_arg_num)) {
7455 r = ONIGERR_INVALID_CALLOUT_ARG;
7456 goto err_clear;
7457 }
7458
7459 if (c != cterm) {
7460 r = ONIGERR_INVALID_CALLOUT_PATTERN;
7461 goto err_clear;
7462 }
7463
7464 r = reg_callout_list_entry(env, &num);
7465 if (r != 0) goto err_clear;
7466
7467 ext = onig_get_regex_ext(env->reg);
7468 if (IS_NULL(ext)) {
7469 r = ONIGERR_MEMORY; goto err_clear;
7470 }
7471 if (IS_NULL(ext->pattern)) {
7472 r = onig_ext_set_pattern(env->reg, env->pattern, env->pattern_end);
7473 if (r != ONIG_NORMAL) goto err_clear;
7474 }
7475
7476 if (tag_start != tag_end) {
7477 r = callout_tag_entry(env, env->reg, tag_start, tag_end, num);
7478 if (r != ONIG_NORMAL) goto err_clear;
7479 }
7480
7481 e = onig_reg_callout_list_at(env->reg, num);
7482 if (IS_NULL(e)) {
7483 r = ONIGERR_MEMORY; goto err_clear;
7484 }
7485
7486 r = node_new_callout(&node, ONIG_CALLOUT_OF_NAME, num, name_id, env);
7487 if (r != ONIG_NORMAL) goto err_clear;
7488
7489 e->of = ONIG_CALLOUT_OF_NAME;
7490 e->in = in;
7491 e->name_id = name_id;
7492 e->type = onig_get_callout_type_by_name_id(name_id);
7493 e->start_func = onig_get_callout_start_func_by_name_id(name_id);
7494 e->end_func = onig_get_callout_end_func_by_name_id(name_id);
7495 e->u.arg.num = max_arg_num;
7496 e->u.arg.passed_num = arg_num;
7497 for (i = 0; i < max_arg_num; i++) {
7498 e->u.arg.types[i] = types[i];
7499 if (i < arg_num)
7500 e->u.arg.vals[i] = vals[i];
7501 else
7502 e->u.arg.vals[i] = get_callout_opt_default_by_name_id(name_id, i);
7503 }
7504
7505 *np = node;
7506 *src = p;
7507 return 0;
7508
7509 err_clear:
7510 clear_callout_args(arg_num, types, vals);
7511 return r;
7512 }
7513 #endif
7514
7515 static int
prs_bag(Node ** np,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)7516 prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
7517 ScanEnv* env)
7518 {
7519 int r, num;
7520 Node *target;
7521 OnigOptionType option;
7522 OnigCodePoint c;
7523 int list_capture;
7524 OnigEncoding enc = env->enc;
7525
7526 UChar* p = *src;
7527 PFETCH_READY;
7528
7529 *np = NULL;
7530 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
7531
7532 option = env->options;
7533 c = PPEEK;
7534 if (c == '?' && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
7535 PINC;
7536 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7537
7538 PFETCH(c);
7539 switch (c) {
7540 case ':': /* (?:...) grouping only */
7541 group:
7542 r = fetch_token(tok, &p, end, env);
7543 if (r < 0) return r;
7544 r = prs_alts(np, tok, term, &p, end, env, FALSE);
7545 if (r < 0) return r;
7546 *src = p;
7547 return 1; /* group */
7548 break;
7549
7550 case '=':
7551 *np = node_new_anchor(ANCR_PREC_READ);
7552 break;
7553 case '!': /* preceding read */
7554 *np = node_new_anchor(ANCR_PREC_READ_NOT);
7555 break;
7556 case '>': /* (?>...) stop backtrack */
7557 *np = node_new_bag(BAG_STOP_BACKTRACK);
7558 break;
7559
7560 case '\'':
7561 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
7562 goto named_group1;
7563 }
7564 else
7565 return ONIGERR_UNDEFINED_GROUP_OPTION;
7566 break;
7567
7568 case '<': /* look behind (?<=...), (?<!...) */
7569 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
7570 PFETCH(c);
7571 if (c == '=')
7572 *np = node_new_anchor(ANCR_LOOK_BEHIND);
7573 else if (c == '!')
7574 *np = node_new_anchor(ANCR_LOOK_BEHIND_NOT);
7575 else {
7576 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
7577 UChar *name;
7578 UChar *name_end;
7579 enum REF_NUM num_type;
7580
7581 PUNFETCH;
7582 c = '<';
7583
7584 named_group1:
7585 list_capture = 0;
7586
7587 #ifdef USE_CAPTURE_HISTORY
7588 named_group2:
7589 #endif
7590 name = p;
7591 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num,
7592 &num_type, FALSE);
7593 if (r < 0) return r;
7594
7595 num = scan_env_add_mem_entry(env);
7596 if (num < 0) return num;
7597 if (list_capture != 0 && num >= (int )MEM_STATUS_BITS_NUM)
7598 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
7599
7600 r = name_add(env->reg, name, name_end, num, env);
7601 if (r != 0) return r;
7602 *np = node_new_memory(1);
7603 CHECK_NULL_RETURN_MEMERR(*np);
7604 BAG_(*np)->m.regnum = num;
7605 if (list_capture != 0)
7606 MEM_STATUS_ON_SIMPLE(env->cap_history, num);
7607 env->num_named++;
7608 }
7609 else {
7610 return ONIGERR_UNDEFINED_GROUP_OPTION;
7611 }
7612 }
7613 break;
7614
7615 case '~':
7616 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP)) {
7617 Node* absent;
7618 Node* expr;
7619 int head_bar;
7620 int is_range_cutter;
7621
7622 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7623
7624 if (PPEEK_IS('|')) { /* (?~|generator|absent) */
7625 PINC;
7626 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7627
7628 head_bar = 1;
7629 if (PPEEK_IS(')')) { /* (?~|) : range clear */
7630 PINC;
7631 r = make_range_clear(np, env);
7632 if (r != 0) return r;
7633 goto end;
7634 }
7635 }
7636 else
7637 head_bar = 0;
7638
7639 r = fetch_token(tok, &p, end, env);
7640 if (r < 0) return r;
7641 r = prs_alts(&absent, tok, term, &p, end, env, TRUE);
7642 if (r < 0) {
7643 onig_node_free(absent);
7644 return r;
7645 }
7646
7647 expr = NULL_NODE;
7648 is_range_cutter = 0;
7649 if (head_bar != 0) {
7650 Node* top = absent;
7651 if (NODE_TYPE(top) != NODE_ALT || IS_NULL(NODE_CDR(top))) {
7652 expr = NULL_NODE;
7653 is_range_cutter = 1;
7654 /* return ONIGERR_INVALID_ABSENT_GROUP_GENERATOR_PATTERN; */
7655 }
7656 else {
7657 absent = NODE_CAR(top);
7658 expr = NODE_CDR(top);
7659 NODE_CAR(top) = NULL_NODE;
7660 NODE_CDR(top) = NULL_NODE;
7661 onig_node_free(top);
7662 if (IS_NULL(NODE_CDR(expr))) {
7663 top = expr;
7664 expr = NODE_CAR(top);
7665 NODE_CAR(top) = NULL_NODE;
7666 onig_node_free(top);
7667 }
7668 }
7669 }
7670
7671 r = make_absent_tree(np, absent, expr, is_range_cutter, env);
7672 if (r != 0) {
7673 return r;
7674 }
7675 goto end;
7676 }
7677 else {
7678 return ONIGERR_UNDEFINED_GROUP_OPTION;
7679 }
7680 break;
7681
7682 #ifdef USE_CALLOUT
7683 case '{':
7684 if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS))
7685 return ONIGERR_UNDEFINED_GROUP_OPTION;
7686
7687 r = prs_callout_of_contents(np, ')', &p, end, env);
7688 if (r != 0) return r;
7689
7690 goto end;
7691 break;
7692 #endif
7693
7694 case '(':
7695 /* (?()...) */
7696 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE)) {
7697 UChar *prev;
7698 Node* condition;
7699 int condition_is_checker;
7700
7701 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7702 PFETCH(c);
7703 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7704
7705 if (IS_CODE_DIGIT_ASCII(enc, c)
7706 || c == '-' || c == '+' || c == '<' || c == '\'') {
7707 #ifdef USE_BACKREF_WITH_LEVEL
7708 int exist_level;
7709 int level;
7710 #endif
7711 UChar* name_end;
7712 int back_num;
7713 enum REF_NUM num_type;
7714 int is_enclosed;
7715
7716 is_enclosed = (c == '<' || c == '\'') ? 1 : 0;
7717 if (! is_enclosed)
7718 PUNFETCH;
7719 prev = p;
7720 #ifdef USE_BACKREF_WITH_LEVEL
7721 exist_level = 0;
7722 name_end = NULL_UCHARP; /* no need. escape gcc warning. */
7723 r = fetch_name_with_level(
7724 (OnigCodePoint )(is_enclosed != 0 ? c : '('),
7725 &p, end, &name_end,
7726 env, &back_num, &level, &num_type);
7727 if (r == 1) exist_level = 1;
7728 #else
7729 r = fetch_name((OnigCodePoint )(is_enclosed != 0 ? c : '('),
7730 &p, end, &name_end, env, &back_num, &num_type, TRUE);
7731 #endif
7732 if (r < 0) {
7733 if (is_enclosed == 0) {
7734 goto any_condition;
7735 }
7736 else
7737 return r;
7738 }
7739
7740 condition_is_checker = 1;
7741 if (num_type != IS_NOT_NUM) {
7742 if (num_type == IS_REL_NUM) {
7743 back_num = backref_rel_to_abs(back_num, env);
7744 }
7745 if (back_num <= 0)
7746 return ONIGERR_INVALID_BACKREF;
7747
7748 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
7749 if (back_num > env->num_mem ||
7750 IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node))
7751 return ONIGERR_INVALID_BACKREF;
7752 }
7753
7754 condition = node_new_backref_checker(1, &back_num, FALSE,
7755 #ifdef USE_BACKREF_WITH_LEVEL
7756 exist_level, level,
7757 #endif
7758 env);
7759 }
7760 else {
7761 int num;
7762 int* backs;
7763
7764 num = name_to_group_numbers(env, prev, name_end, &backs);
7765 if (num <= 0) {
7766 return ONIGERR_UNDEFINED_NAME_REFERENCE;
7767 }
7768 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
7769 int i;
7770 for (i = 0; i < num; i++) {
7771 if (backs[i] > env->num_mem ||
7772 IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node))
7773 return ONIGERR_INVALID_BACKREF;
7774 }
7775 }
7776
7777 condition = node_new_backref_checker(num, backs, TRUE,
7778 #ifdef USE_BACKREF_WITH_LEVEL
7779 exist_level, level,
7780 #endif
7781 env);
7782 }
7783
7784 if (is_enclosed != 0) {
7785 if (PEND) goto err_if_else;
7786 PFETCH(c);
7787 if (c != ')') goto err_if_else;
7788 }
7789 }
7790 #ifdef USE_CALLOUT
7791 else if (c == '?') {
7792 if (IS_SYNTAX_OP2(env->syntax,
7793 ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS)) {
7794 if (! PEND && PPEEK_IS('{')) {
7795 /* condition part is callouts of contents: (?(?{...})THEN|ELSE) */
7796 condition_is_checker = 0;
7797 PFETCH(c);
7798 r = prs_callout_of_contents(&condition, ')', &p, end, env);
7799 if (r != 0) return r;
7800 goto end_condition;
7801 }
7802 }
7803 goto any_condition;
7804 }
7805 else if (c == '*' &&
7806 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) {
7807 condition_is_checker = 0;
7808 r = prs_callout_of_name(&condition, ')', &p, end, env);
7809 if (r != 0) return r;
7810 goto end_condition;
7811 }
7812 #endif
7813 else {
7814 any_condition:
7815 PUNFETCH;
7816 condition_is_checker = 0;
7817 r = fetch_token(tok, &p, end, env);
7818 if (r < 0) return r;
7819 r = prs_alts(&condition, tok, term, &p, end, env, FALSE);
7820 if (r < 0) {
7821 onig_node_free(condition);
7822 return r;
7823 }
7824 }
7825
7826 #ifdef USE_CALLOUT
7827 end_condition:
7828 #endif
7829 CHECK_NULL_RETURN_MEMERR(condition);
7830
7831 if (PEND) {
7832 err_if_else:
7833 onig_node_free(condition);
7834 return ONIGERR_END_PATTERN_IN_GROUP;
7835 }
7836
7837 if (PPEEK_IS(')')) { /* case: empty body: make backref checker */
7838 if (condition_is_checker == 0) {
7839 onig_node_free(condition);
7840 return ONIGERR_INVALID_IF_ELSE_SYNTAX;
7841 }
7842 PFETCH(c);
7843 *np = condition;
7844 }
7845 else { /* if-else */
7846 int then_is_empty;
7847 Node *Then, *Else;
7848
7849 Then = 0;
7850 if (PPEEK_IS('|')) {
7851 PFETCH(c);
7852 then_is_empty = 1;
7853 }
7854 else
7855 then_is_empty = 0;
7856
7857 r = fetch_token(tok, &p, end, env);
7858 if (r < 0) {
7859 onig_node_free(condition);
7860 return r;
7861 }
7862 r = prs_alts(&target, tok, term, &p, end, env, TRUE);
7863 if (r < 0) {
7864 onig_node_free(condition);
7865 onig_node_free(target);
7866 return r;
7867 }
7868
7869 if (then_is_empty != 0) {
7870 Else = target;
7871 }
7872 else {
7873 if (NODE_TYPE(target) == NODE_ALT) {
7874 Then = NODE_CAR(target);
7875 if (NODE_CDR(NODE_CDR(target)) == NULL_NODE) {
7876 Else = NODE_CAR(NODE_CDR(target));
7877 cons_node_free_alone(NODE_CDR(target));
7878 }
7879 else {
7880 Else = NODE_CDR(target);
7881 }
7882 cons_node_free_alone(target);
7883 }
7884 else {
7885 Then = target;
7886 Else = 0;
7887 }
7888 }
7889
7890 *np = node_new_bag_if_else(condition, Then, Else);
7891 if (IS_NULL(*np)) {
7892 onig_node_free(condition);
7893 onig_node_free(Then);
7894 onig_node_free(Else);
7895 return ONIGERR_MEMORY;
7896 }
7897 }
7898 goto end;
7899 }
7900 else {
7901 return ONIGERR_UNDEFINED_GROUP_OPTION;
7902 }
7903 break;
7904
7905 #ifdef USE_CAPTURE_HISTORY
7906 case '@':
7907 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
7908 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
7909 PFETCH(c);
7910 if (c == '<' || c == '\'') {
7911 list_capture = 1;
7912 goto named_group2; /* (?@<name>...) */
7913 }
7914 PUNFETCH;
7915 }
7916
7917 *np = node_new_memory(0);
7918 CHECK_NULL_RETURN_MEMERR(*np);
7919 num = scan_env_add_mem_entry(env);
7920 if (num < 0) {
7921 return num;
7922 }
7923 else if (num >= (int )MEM_STATUS_BITS_NUM) {
7924 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
7925 }
7926 BAG_(*np)->m.regnum = num;
7927 MEM_STATUS_ON_SIMPLE(env->cap_history, num);
7928 }
7929 else {
7930 return ONIGERR_UNDEFINED_GROUP_OPTION;
7931 }
7932 break;
7933 #endif
7934
7935 #ifdef USE_POSIXLINE_OPTION
7936 case 'p':
7937 #endif
7938 case '-': case 'i': case 'm': case 's': case 'x':
7939 case 'W': case 'D': case 'S': case 'P':
7940 case 'y':
7941 {
7942 int neg = 0;
7943
7944 while (1) {
7945 switch (c) {
7946 case ':':
7947 case ')':
7948 break;
7949
7950 case '-': neg = 1; break;
7951 case 'x': OPTION_NEGATE(option, ONIG_OPTION_EXTEND, neg); break;
7952 case 'i': OPTION_NEGATE(option, ONIG_OPTION_IGNORECASE, neg); break;
7953 case 's':
7954 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
7955 OPTION_NEGATE(option, ONIG_OPTION_MULTILINE, neg);
7956 }
7957 else
7958 return ONIGERR_UNDEFINED_GROUP_OPTION;
7959 break;
7960
7961 case 'm':
7962 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
7963 OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? TRUE : FALSE));
7964 }
7965 else if (IS_SYNTAX_OP2(env->syntax,
7966 ONIG_SYN_OP2_OPTION_ONIGURUMA|ONIG_SYN_OP2_OPTION_RUBY)) {
7967 OPTION_NEGATE(option, ONIG_OPTION_MULTILINE, neg);
7968 }
7969 else
7970 return ONIGERR_UNDEFINED_GROUP_OPTION;
7971 break;
7972 #ifdef USE_POSIXLINE_OPTION
7973 case 'p':
7974 OPTION_NEGATE(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
7975 break;
7976 #endif
7977 case 'W': OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); break;
7978 case 'D': OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); break;
7979 case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break;
7980 case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break;
7981
7982 case 'y': /* y{g}, y{w} */
7983 {
7984 if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
7985 return ONIGERR_UNDEFINED_GROUP_OPTION;
7986
7987 if (neg != 0) return ONIGERR_UNDEFINED_GROUP_OPTION;
7988
7989 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7990 if (! PPEEK_IS('{')) return ONIGERR_UNDEFINED_GROUP_OPTION;
7991 PFETCH(c);
7992 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
7993 PFETCH(c);
7994 switch (c) {
7995 case 'g':
7996 if (! ONIGENC_IS_UNICODE_ENCODING(enc))
7997 return ONIGERR_UNDEFINED_GROUP_OPTION;
7998
7999 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, FALSE);
8000 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, TRUE);
8001 break;
8002 #ifdef USE_UNICODE_WORD_BREAK
8003 case 'w':
8004 if (! ONIGENC_IS_UNICODE_ENCODING(enc))
8005 return ONIGERR_UNDEFINED_GROUP_OPTION;
8006
8007 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, FALSE);
8008 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, TRUE);
8009 break;
8010 #endif
8011 default:
8012 return ONIGERR_UNDEFINED_GROUP_OPTION;
8013 break;
8014 }
8015 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
8016 PFETCH(c);
8017 if (c != '}')
8018 return ONIGERR_UNDEFINED_GROUP_OPTION;
8019 break;
8020 } /* case 'y' */
8021
8022 default:
8023 return ONIGERR_UNDEFINED_GROUP_OPTION;
8024 }
8025
8026 if (c == ')') {
8027 *np = node_new_option(option);
8028 CHECK_NULL_RETURN_MEMERR(*np);
8029 *src = p;
8030 return 2; /* option only */
8031 }
8032 else if (c == ':') {
8033 OnigOptionType prev = env->options;
8034
8035 env->options = option;
8036 r = fetch_token(tok, &p, end, env);
8037 if (r < 0) return r;
8038 r = prs_alts(&target, tok, term, &p, end, env, FALSE);
8039 env->options = prev;
8040 if (r < 0) {
8041 onig_node_free(target);
8042 return r;
8043 }
8044 *np = node_new_option(option);
8045 CHECK_NULL_RETURN_MEMERR(*np);
8046 NODE_BODY(*np) = target;
8047 *src = p;
8048 return 0;
8049 }
8050
8051 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
8052 PFETCH(c);
8053 } /* while (1) */
8054 }
8055 break;
8056
8057 default:
8058 return ONIGERR_UNDEFINED_GROUP_OPTION;
8059 }
8060 }
8061 #ifdef USE_CALLOUT
8062 else if (c == '*' &&
8063 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) {
8064 PINC;
8065 r = prs_callout_of_name(np, ')', &p, end, env);
8066 if (r != 0) return r;
8067
8068 goto end;
8069 }
8070 #endif
8071 else {
8072 if (OPTON_DONT_CAPTURE_GROUP(env->options))
8073 goto group;
8074
8075 *np = node_new_memory(0);
8076 CHECK_NULL_RETURN_MEMERR(*np);
8077 num = scan_env_add_mem_entry(env);
8078 if (num < 0) return num;
8079 BAG_(*np)->m.regnum = num;
8080 }
8081
8082 CHECK_NULL_RETURN_MEMERR(*np);
8083 r = fetch_token(tok, &p, end, env);
8084 if (r < 0) return r;
8085 r = prs_alts(&target, tok, term, &p, end, env, FALSE);
8086 if (r < 0) {
8087 onig_node_free(target);
8088 return r;
8089 }
8090
8091 NODE_BODY(*np) = target;
8092
8093 if (NODE_TYPE(*np) == NODE_BAG) {
8094 if (BAG_(*np)->type == BAG_MEMORY) {
8095 /* Don't move this to previous of prs_alts() */
8096 r = scan_env_set_mem_node(env, BAG_(*np)->m.regnum, *np);
8097 if (r != 0) return r;
8098 }
8099 }
8100
8101 end:
8102 *src = p;
8103 return 0;
8104 }
8105
/* Spelling of the quantifiers named in the nested-repeat warning; indexed
   by the value quantifier_type_num() returns for a quantifier. */
static const char* PopularQStr[] = {
  "?",
  "*",
  "+",
  "??",
  "*?",
  "+?"
};

/* Replacement text shown in the nested-repeat warning; indexed by the
   ReduceTypeTable[][] entry of the two nested quantifiers. */
static const char* ReduceQStr[] = {
  "",
  "",
  "*",
  "*?",
  "??",
  "+ and ??",
  "+? and ?"
};
8113
8114 static int
assign_quantifier_body(Node * qnode,Node * target,int group,ScanEnv * env)8115 assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env)
8116 {
8117 QuantNode* qn;
8118
8119 qn = QUANT_(qnode);
8120 if (qn->lower == 1 && qn->upper == 1)
8121 return 1;
8122
8123 switch (NODE_TYPE(target)) {
8124 case NODE_STRING:
8125 if (group == 0) {
8126 if (str_node_can_be_split(target, env->enc)) {
8127 Node* n = str_node_split_last_char(target, env->enc);
8128 if (IS_NOT_NULL(n)) {
8129 NODE_BODY(qnode) = n;
8130 return 2;
8131 }
8132 }
8133 }
8134 break;
8135
8136 case NODE_QUANT:
8137 { /* check redundant double repeat. */
8138 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
8139 QuantNode* qnt = QUANT_(target);
8140 int nestq_num = quantifier_type_num(qn);
8141 int targetq_num = quantifier_type_num(qnt);
8142
8143 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
8144 if (targetq_num >= 0 && nestq_num >= 0 &&
8145 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
8146 UChar buf[WARN_BUFSIZE];
8147
8148 switch(ReduceTypeTable[targetq_num][nestq_num]) {
8149 case RQ_ASIS:
8150 break;
8151
8152 case RQ_DEL:
8153 if (onig_verb_warn != onig_null_warn) {
8154 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
8155 env->pattern, env->pattern_end,
8156 (UChar* )"redundant nested repeat operator");
8157 (*onig_verb_warn)((char* )buf);
8158 }
8159 goto warn_exit;
8160 break;
8161
8162 default:
8163 if (onig_verb_warn != onig_null_warn) {
8164 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
8165 env->pattern, env->pattern_end,
8166 (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
8167 PopularQStr[targetq_num], PopularQStr[nestq_num],
8168 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
8169 (*onig_verb_warn)((char* )buf);
8170 }
8171 goto warn_exit;
8172 break;
8173 }
8174 }
8175
8176 warn_exit:
8177 #endif
8178 if (targetq_num >= 0 && nestq_num < 0) {
8179 if (targetq_num == 1 || targetq_num == 2) { /* * or + */
8180 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
8181 if (! IS_INFINITE_REPEAT(qn->upper) && qn->upper > 1 && qn->greedy) {
8182 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
8183 }
8184 }
8185 }
8186 else {
8187 int r;
8188
8189 NODE_BODY(qnode) = target;
8190 r = onig_reduce_nested_quantifier(qnode);
8191 return r;
8192 }
8193 }
8194 break;
8195
8196 default:
8197 break;
8198 }
8199
8200 NODE_BODY(qnode) = target;
8201 return 0;
8202 }
8203
8204
8205 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
8206 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)8207 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
8208 {
8209 BBuf *tbuf;
8210 int r;
8211
8212 if (IS_NCCLASS_NOT(cc)) {
8213 bitset_invert(cc->bs);
8214
8215 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
8216 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
8217 if (r != 0) return r;
8218
8219 bbuf_free(cc->mbuf);
8220 cc->mbuf = tbuf;
8221 }
8222
8223 NCCLASS_CLEAR_NOT(cc);
8224 }
8225
8226 return 0;
8227 }
8228 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
8229
/*
 * Add one code point to the character class 'cc'.
 * A code that is not encoded as exactly one byte in 'enc' (or any code when
 * the minimum character length of 'enc' is above one) goes into the code
 * range buffer cc->mbuf; every other code sets its bit in cc->bs.
 *
 * 'code' and 'enc' are expanded more than once: pass expressions without
 * side effects.
 * NOTE(review): the result of add_code_range_to_buf() is dropped here, so a
 * failure inside it is not visible to the user of this macro.
 */
#define ADD_CODE_INTO_CC(cc, code, enc) do {                                   \
  if (ONIGENC_MBC_MINLEN(enc) > 1 || ONIGENC_CODE_TO_MBCLEN(enc, code) != 1) { \
    add_code_range_to_buf(&((cc)->mbuf), code, code);                          \
  }                                                                            \
  else {                                                                       \
    BITSET_SET_BIT((cc)->bs, code);                                            \
  }                                                                            \
} while (0)
8238
8239 extern int
onig_new_cclass_with_code_list(Node ** rnode,OnigEncoding enc,int n,OnigCodePoint codes[])8240 onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc,
8241 int n, OnigCodePoint codes[])
8242 {
8243 int i;
8244 Node* node;
8245 CClassNode* cc;
8246
8247 *rnode = NULL_NODE;
8248
8249 node = node_new_cclass();
8250 CHECK_NULL_RETURN_MEMERR(node);
8251
8252 cc = CCLASS_(node);
8253
8254 for (i = 0; i < n; i++) {
8255 ADD_CODE_INTO_CC(cc, codes[i], enc);
8256 }
8257
8258 *rnode = node;
8259 return 0;
8260 }
8261
8262 typedef struct {
8263 ScanEnv* env;
8264 CClassNode* cc;
8265 Node* alt_root;
8266 Node** ptail;
8267 } IApplyCaseFoldArg;
8268
8269 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)8270 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)
8271 {
8272 IApplyCaseFoldArg* iarg;
8273 ScanEnv* env;
8274 CClassNode* cc;
8275
8276 iarg = (IApplyCaseFoldArg* )arg;
8277 env = iarg->env;
8278 cc = iarg->cc;
8279
8280 if (to_len == 1) {
8281 int is_in = onig_is_code_in_cc(env->enc, from, cc);
8282 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
8283 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
8284 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
8285 ADD_CODE_INTO_CC(cc, *to, env->enc);
8286 }
8287 #else
8288 if (is_in != 0) {
8289 if (ONIGENC_MBC_MINLEN(env->enc) > 1 ||
8290 ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) {
8291 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
8292 add_code_range(&(cc->mbuf), env, *to, *to);
8293 }
8294 else {
8295 if (IS_NCCLASS_NOT(cc)) {
8296 BITSET_CLEAR_BIT(cc->bs, *to);
8297 }
8298 else
8299 BITSET_SET_BIT(cc->bs, *to);
8300 }
8301 }
8302 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
8303 }
8304 else {
8305 int r, i, len;
8306 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
8307
8308 if (onig_is_code_in_cc(env->enc, from, cc)
8309 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
8310 && !IS_NCCLASS_NOT(cc)
8311 #endif
8312 ) {
8313 int n, j, m, index;
8314 Node* list_node;
8315 Node* ns[3];
8316
8317 n = 0;
8318 for (i = 0; i < to_len; i++) {
8319 OnigCodePoint code;
8320 Node* csnode;
8321 CClassNode* cs_cc;
8322
8323 index = onigenc_unicode_fold1_key(&to[i]);
8324 if (index >= 0) {
8325 csnode = node_new_cclass();
8326 cs_cc = CCLASS_(csnode);
8327 if (IS_NULL(csnode)) {
8328 err_free_ns:
8329 for (j = 0; j < n; j++) onig_node_free(ns[j]);
8330 return ONIGERR_MEMORY;
8331 }
8332 m = FOLDS1_UNFOLDS_NUM(index);
8333 for (j = 0; j < m; j++) {
8334 code = FOLDS1_UNFOLDS(index)[j];
8335 ADD_CODE_INTO_CC(cs_cc, code, env->enc);
8336 }
8337 ADD_CODE_INTO_CC(cs_cc, to[i], env->enc);
8338 ns[n++] = csnode;
8339 }
8340 else {
8341 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
8342 if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) {
8343 csnode = node_new_str(buf, buf + len);
8344 if (IS_NULL(csnode)) goto err_free_ns;
8345
8346 NODE_STRING_SET_CASE_EXPANDED(csnode);
8347 ns[n++] = csnode;
8348 }
8349 else {
8350 r = onig_node_str_cat(ns[n-1], buf, buf + len);
8351 if (r < 0) goto err_free_ns;
8352 }
8353 }
8354 }
8355
8356 if (n == 1)
8357 list_node = ns[0];
8358 else
8359 list_node = make_list(n, ns);
8360
8361 *(iarg->ptail) = onig_node_new_alt(list_node, NULL_NODE);
8362 if (IS_NULL(*(iarg->ptail))) {
8363 onig_node_free(list_node);
8364 return ONIGERR_MEMORY;
8365 }
8366 iarg->ptail = &(NODE_CDR((*(iarg->ptail))));
8367 }
8368 }
8369
8370 return 0;
8371 }
8372
8373 static int
prs_exp(Node ** np,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env,int group_head)8374 prs_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,
8375 ScanEnv* env, int group_head)
8376 {
8377 int r, len, group;
8378 Node* qn;
8379 Node** tp;
8380 unsigned int parse_depth;
8381
8382 retry:
8383 group = 0;
8384 *np = NULL;
8385 if (tok->type == (enum TokenSyms )term)
8386 goto end_of_token;
8387
8388 parse_depth = env->parse_depth;
8389
8390 switch (tok->type) {
8391 case TK_ALT:
8392 case TK_EOT:
8393 end_of_token:
8394 *np = node_new_empty();
8395 CHECK_NULL_RETURN_MEMERR(*np);
8396 return tok->type;
8397 break;
8398
8399 case TK_SUBEXP_OPEN:
8400 r = prs_bag(np, tok, TK_SUBEXP_CLOSE, src, end, env);
8401 if (r < 0) return r;
8402 if (r == 1) { /* group */
8403 if (group_head == 0)
8404 group = 1;
8405 else {
8406 Node* target = *np;
8407 *np = node_new_group(target);
8408 if (IS_NULL(*np)) {
8409 onig_node_free(target);
8410 return ONIGERR_MEMORY;
8411 }
8412 group = 2;
8413 }
8414 }
8415 else if (r == 2) { /* option only */
8416 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH)) {
8417 env->options = BAG_(*np)->o.options;
8418 r = fetch_token(tok, src, end, env);
8419 if (r < 0) return r;
8420 onig_node_free(*np);
8421 goto retry;
8422 }
8423 else {
8424 Node* target;
8425 OnigOptionType prev = env->options;
8426
8427 env->options = BAG_(*np)->o.options;
8428 r = fetch_token(tok, src, end, env);
8429 if (r < 0) return r;
8430 r = prs_alts(&target, tok, term, src, end, env, FALSE);
8431 env->options = prev;
8432 if (r < 0) {
8433 onig_node_free(target);
8434 return r;
8435 }
8436 NODE_BODY(*np) = target;
8437 }
8438 return tok->type;
8439 }
8440 break;
8441
8442 case TK_SUBEXP_CLOSE:
8443 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
8444 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
8445
8446 if (tok->escaped) goto tk_crude_byte;
8447 else goto tk_byte;
8448 break;
8449
8450 case TK_STRING:
8451 tk_byte:
8452 {
8453 *np = node_new_str_with_options(tok->backp, *src, env->options);
8454 CHECK_NULL_RETURN_MEMERR(*np);
8455
8456 while (1) {
8457 r = fetch_token(tok, src, end, env);
8458 if (r < 0) return r;
8459 if (r != TK_STRING) break;
8460
8461 r = onig_node_str_cat(*np, tok->backp, *src);
8462 if (r < 0) return r;
8463 }
8464
8465 string_end:
8466 tp = np;
8467 goto repeat;
8468 }
8469 break;
8470
8471 case TK_CRUDE_BYTE:
8472 tk_crude_byte:
8473 {
8474 *np = node_new_str_crude_char(tok->u.byte, env->options);
8475 CHECK_NULL_RETURN_MEMERR(*np);
8476 len = 1;
8477 while (1) {
8478 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
8479 if (len == enclen(env->enc, STR_(*np)->s)) {
8480 r = fetch_token(tok, src, end, env);
8481 goto tk_crude_byte_end;
8482 }
8483 }
8484
8485 r = fetch_token(tok, src, end, env);
8486 if (r < 0) return r;
8487 if (r != TK_CRUDE_BYTE)
8488 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
8489
8490 r = node_str_cat_char(*np, tok->u.byte);
8491 if (r < 0) return r;
8492
8493 len++;
8494 }
8495
8496 tk_crude_byte_end:
8497 if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end))
8498 return ONIGERR_INVALID_WIDE_CHAR_VALUE;
8499
8500 NODE_STRING_CLEAR_CRUDE(*np);
8501 goto string_end;
8502 }
8503 break;
8504
8505 case TK_CODE_POINT:
8506 {
8507 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
8508 len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code);
8509 if (len < 0) return len;
8510 len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
8511 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
8512 *np = node_new_str_crude(buf, buf + len, env->options);
8513 #else
8514 *np = node_new_str_with_options(buf, buf + len, env->options);
8515 #endif
8516 CHECK_NULL_RETURN_MEMERR(*np);
8517 }
8518 break;
8519
8520 case TK_QUOTE_OPEN:
8521 {
8522 OnigCodePoint end_op[2];
8523 UChar *qstart, *qend, *nextp;
8524
8525 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
8526 end_op[1] = (OnigCodePoint )'E';
8527 qstart = *src;
8528 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
8529 if (IS_NULL(qend)) {
8530 nextp = qend = end;
8531 }
8532 *np = node_new_str_with_options(qstart, qend, env->options);
8533 CHECK_NULL_RETURN_MEMERR(*np);
8534 *src = nextp;
8535 }
8536 break;
8537
8538 case TK_CHAR_TYPE:
8539 {
8540 switch (tok->u.prop.ctype) {
8541 case ONIGENC_CTYPE_WORD:
8542 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not, env->options);
8543 CHECK_NULL_RETURN_MEMERR(*np);
8544 break;
8545
8546 case ONIGENC_CTYPE_SPACE:
8547 case ONIGENC_CTYPE_DIGIT:
8548 case ONIGENC_CTYPE_XDIGIT:
8549 {
8550 CClassNode* cc;
8551
8552 *np = node_new_cclass();
8553 CHECK_NULL_RETURN_MEMERR(*np);
8554 cc = CCLASS_(*np);
8555 r = add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env);
8556 if (r != 0) {
8557 onig_node_free(*np);
8558 *np = NULL_NODE;
8559 return r;
8560 }
8561 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
8562 }
8563 break;
8564
8565 default:
8566 return ONIGERR_PARSER_BUG;
8567 break;
8568 }
8569 }
8570 break;
8571
8572 case TK_CHAR_PROPERTY:
8573 r = prs_char_property(np, tok, src, end, env);
8574 if (r != 0) return r;
8575 break;
8576
8577 case TK_OPEN_CC:
8578 {
8579 CClassNode* cc;
8580
8581 r = prs_cc(np, tok, src, end, env);
8582 if (r != 0) return r;
8583
8584 cc = CCLASS_(*np);
8585 if (OPTON_IGNORECASE(env->options)) {
8586 IApplyCaseFoldArg iarg;
8587
8588 iarg.env = env;
8589 iarg.cc = cc;
8590 iarg.alt_root = NULL_NODE;
8591 iarg.ptail = &(iarg.alt_root);
8592
8593 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
8594 i_apply_case_fold, &iarg);
8595 if (r != 0) {
8596 onig_node_free(iarg.alt_root);
8597 return r;
8598 }
8599 if (IS_NOT_NULL(iarg.alt_root)) {
8600 Node* work = onig_node_new_alt(*np, iarg.alt_root);
8601 if (IS_NULL(work)) {
8602 onig_node_free(iarg.alt_root);
8603 return ONIGERR_MEMORY;
8604 }
8605 *np = work;
8606 }
8607 }
8608 }
8609 break;
8610
8611 case TK_ANYCHAR:
8612 *np = node_new_anychar(env->options);
8613 CHECK_NULL_RETURN_MEMERR(*np);
8614 break;
8615
8616 case TK_ANYCHAR_ANYTIME:
8617 *np = node_new_anychar(env->options);
8618 CHECK_NULL_RETURN_MEMERR(*np);
8619 qn = node_new_quantifier(0, INFINITE_REPEAT, FALSE);
8620 CHECK_NULL_RETURN_MEMERR(qn);
8621 NODE_BODY(qn) = *np;
8622 *np = qn;
8623 break;
8624
8625 case TK_BACKREF:
8626 len = tok->u.backref.num;
8627 *np = node_new_backref(len,
8628 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
8629 tok->u.backref.by_name,
8630 #ifdef USE_BACKREF_WITH_LEVEL
8631 tok->u.backref.exist_level,
8632 tok->u.backref.level,
8633 #endif
8634 env);
8635 CHECK_NULL_RETURN_MEMERR(*np);
8636 break;
8637
8638 #ifdef USE_CALL
8639 case TK_CALL:
8640 {
8641 int gnum = tok->u.call.gnum;
8642
8643 *np = node_new_call(tok->u.call.name, tok->u.call.name_end,
8644 gnum, tok->u.call.by_number);
8645 CHECK_NULL_RETURN_MEMERR(*np);
8646 env->num_call++;
8647 if (tok->u.call.by_number != 0 && gnum == 0) {
8648 env->has_call_zero = 1;
8649 }
8650 }
8651 break;
8652 #endif
8653
8654 case TK_ANCHOR:
8655 *np = node_new_anchor_with_options(tok->u.anchor, env->options);
8656 CHECK_NULL_RETURN_MEMERR(*np);
8657 break;
8658
8659 case TK_REPEAT:
8660 case TK_INTERVAL:
8661 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
8662 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
8663 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
8664 else {
8665 *np = node_new_empty();
8666 CHECK_NULL_RETURN_MEMERR(*np);
8667 }
8668 }
8669 else {
8670 goto tk_byte;
8671 }
8672 break;
8673
8674 case TK_KEEP:
8675 r = node_new_keep(np, env);
8676 if (r < 0) return r;
8677 break;
8678
8679 case TK_GENERAL_NEWLINE:
8680 r = node_new_general_newline(np, env);
8681 if (r < 0) return r;
8682 break;
8683
8684 case TK_NO_NEWLINE:
8685 r = node_new_no_newline(np, env);
8686 if (r < 0) return r;
8687 break;
8688
8689 case TK_TRUE_ANYCHAR:
8690 r = node_new_true_anychar(np);
8691 if (r < 0) return r;
8692 break;
8693
8694 case TK_TEXT_SEGMENT:
8695 r = make_text_segment(np, env);
8696 if (r < 0) return r;
8697 break;
8698
8699 default:
8700 return ONIGERR_PARSER_BUG;
8701 break;
8702 }
8703
8704 {
8705 tp = np;
8706
8707 re_entry:
8708 r = fetch_token(tok, src, end, env);
8709 if (r < 0) return r;
8710
8711 repeat:
8712 if (r == TK_REPEAT || r == TK_INTERVAL) {
8713 Node* target;
8714
8715 if (is_invalid_quantifier_target(*tp))
8716 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
8717
8718 INC_PARSE_DEPTH(parse_depth);
8719
8720 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
8721 r == TK_INTERVAL);
8722 CHECK_NULL_RETURN_MEMERR(qn);
8723 QUANT_(qn)->greedy = tok->u.repeat.greedy;
8724 if (group == 2) {
8725 target = node_drop_group(*tp);
8726 *tp = NULL_NODE;
8727 }
8728 else {
8729 target = *tp;
8730 }
8731 r = assign_quantifier_body(qn, target, group, env);
8732 if (r < 0) {
8733 onig_node_free(qn);
8734 *tp = NULL_NODE;
8735 return r;
8736 }
8737
8738 if (tok->u.repeat.possessive != 0) {
8739 Node* en;
8740 en = node_new_bag(BAG_STOP_BACKTRACK);
8741 if (IS_NULL(en)) {
8742 onig_node_free(qn);
8743 return ONIGERR_MEMORY;
8744 }
8745 NODE_BODY(en) = qn;
8746 qn = en;
8747 }
8748
8749 if (r == 0) {
8750 *tp = qn;
8751 }
8752 else if (r == 1) { /* x{1,1} ==> x */
8753 onig_node_free(qn);
8754 *tp = target;
8755 }
8756 else if (r == 2) { /* split case: /abc+/ */
8757 Node *tmp;
8758
8759 *tp = node_new_list(*tp, NULL);
8760 if (IS_NULL(*tp)) {
8761 onig_node_free(qn);
8762 return ONIGERR_MEMORY;
8763 }
8764 tmp = NODE_CDR(*tp) = node_new_list(qn, NULL);
8765 if (IS_NULL(tmp)) {
8766 onig_node_free(qn);
8767 return ONIGERR_MEMORY;
8768 }
8769 tp = &(NODE_CAR(tmp));
8770 }
8771 group = 0;
8772 goto re_entry;
8773 }
8774 }
8775
8776 return r;
8777 }
8778
8779 static int
prs_branch(Node ** top,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env,int group_head)8780 prs_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end,
8781 ScanEnv* env, int group_head)
8782 {
8783 int r;
8784 Node *node, **headp;
8785
8786 *top = NULL;
8787 INC_PARSE_DEPTH(env->parse_depth);
8788
8789 r = prs_exp(&node, tok, term, src, end, env, group_head);
8790 if (r < 0) {
8791 onig_node_free(node);
8792 return r;
8793 }
8794
8795 if (r == TK_EOT || r == term || r == TK_ALT) {
8796 *top = node;
8797 }
8798 else {
8799 *top = node_new_list(node, NULL);
8800 if (IS_NULL(*top)) {
8801 onig_node_free(node);
8802 return ONIGERR_MEMORY;
8803 }
8804
8805 headp = &(NODE_CDR(*top));
8806 while (r != TK_EOT && r != term && r != TK_ALT) {
8807 r = prs_exp(&node, tok, term, src, end, env, FALSE);
8808 if (r < 0) {
8809 onig_node_free(node);
8810 return r;
8811 }
8812
8813 if (NODE_TYPE(node) == NODE_LIST) {
8814 *headp = node;
8815 while (IS_NOT_NULL(NODE_CDR(node))) node = NODE_CDR(node);
8816 headp = &(NODE_CDR(node));
8817 }
8818 else {
8819 *headp = node_new_list(node, NULL);
8820 headp = &(NODE_CDR(*headp));
8821 }
8822 }
8823 }
8824
8825 DEC_PARSE_DEPTH(env->parse_depth);
8826 return r;
8827 }
8828
8829 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
8830 static int
prs_alts(Node ** top,PToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env,int group_head)8831 prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end,
8832 ScanEnv* env, int group_head)
8833 {
8834 int r;
8835 Node *node, **headp;
8836 OnigOptionType save_options;
8837
8838 *top = NULL;
8839 INC_PARSE_DEPTH(env->parse_depth);
8840 save_options = env->options;
8841
8842 r = prs_branch(&node, tok, term, src, end, env, group_head);
8843 if (r < 0) {
8844 onig_node_free(node);
8845 return r;
8846 }
8847
8848 if (r == term) {
8849 *top = node;
8850 }
8851 else if (r == TK_ALT) {
8852 *top = onig_node_new_alt(node, NULL);
8853 if (IS_NULL(*top)) {
8854 onig_node_free(node);
8855 return ONIGERR_MEMORY;
8856 }
8857
8858 headp = &(NODE_CDR(*top));
8859 while (r == TK_ALT) {
8860 r = fetch_token(tok, src, end, env);
8861 if (r < 0) return r;
8862 r = prs_branch(&node, tok, term, src, end, env, FALSE);
8863 if (r < 0) {
8864 onig_node_free(node);
8865 return r;
8866 }
8867 *headp = onig_node_new_alt(node, NULL);
8868 if (IS_NULL(*headp)) {
8869 onig_node_free(node);
8870 onig_node_free(*top);
8871 return ONIGERR_MEMORY;
8872 }
8873
8874 headp = &(NODE_CDR(*headp));
8875 }
8876
8877 if (tok->type != (enum TokenSyms )term)
8878 goto err;
8879 }
8880 else {
8881 onig_node_free(node);
8882 err:
8883 if (term == TK_SUBEXP_CLOSE)
8884 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
8885 else
8886 return ONIGERR_PARSER_BUG;
8887 }
8888
8889 env->options = save_options;
8890 DEC_PARSE_DEPTH(env->parse_depth);
8891 return r;
8892 }
8893
8894 static int
prs_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)8895 prs_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
8896 {
8897 int r;
8898 PToken tok;
8899
8900 ptoken_init(&tok);
8901 r = fetch_token(&tok, src, end, env);
8902 if (r < 0) return r;
8903 r = prs_alts(top, &tok, TK_EOT, src, end, env, FALSE);
8904 if (r < 0) return r;
8905
8906 return 0;
8907 }
8908
8909 #ifdef USE_CALL
8910 static int
make_call_zero_body(Node * node,ScanEnv * env,Node ** rnode)8911 make_call_zero_body(Node* node, ScanEnv* env, Node** rnode)
8912 {
8913 int r;
8914
8915 Node* x = node_new_memory(0 /* 0: is not named */);
8916 CHECK_NULL_RETURN_MEMERR(x);
8917
8918 NODE_BODY(x) = node;
8919 BAG_(x)->m.regnum = 0;
8920 r = scan_env_set_mem_node(env, 0, x);
8921 if (r != 0) {
8922 onig_node_free(x);
8923 return r;
8924 }
8925
8926 *rnode = x;
8927 return 0;
8928 }
8929 #endif
8930
8931 extern int
onig_parse_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)8932 onig_parse_tree(Node** root, const UChar* pattern, const UChar* end,
8933 regex_t* reg, ScanEnv* env)
8934 {
8935 int r;
8936 UChar* p;
8937 #ifdef USE_CALLOUT
8938 RegexExt* ext;
8939 #endif
8940
8941 reg->string_pool = 0;
8942 reg->string_pool_end = 0;
8943 reg->num_mem = 0;
8944 reg->num_repeat = 0;
8945 reg->num_empty_check = 0;
8946 reg->repeat_range_alloc = 0;
8947 reg->repeat_range = (RepeatRange* )NULL;
8948 reg->empty_status_mem = 0;
8949
8950 names_clear(reg);
8951
8952 scan_env_clear(env);
8953 env->options = reg->options;
8954 env->case_fold_flag = reg->case_fold_flag;
8955 env->enc = reg->enc;
8956 env->syntax = reg->syntax;
8957 env->pattern = (UChar* )pattern;
8958 env->pattern_end = (UChar* )end;
8959 env->reg = reg;
8960
8961 *root = NULL;
8962
8963 if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, pattern, end))
8964 return ONIGERR_INVALID_WIDE_CHAR_VALUE;
8965
8966 p = (UChar* )pattern;
8967 r = prs_regexp(root, &p, (UChar* )end, env);
8968 if (r != 0) return r;
8969
8970 #ifdef USE_CALL
8971 if (env->has_call_zero != 0) {
8972 Node* zero_node;
8973 r = make_call_zero_body(*root, env, &zero_node);
8974 if (r != 0) return r;
8975
8976 *root = zero_node;
8977 }
8978 #endif
8979
8980 reg->num_mem = env->num_mem;
8981
8982 #ifdef USE_CALLOUT
8983 ext = reg->extp;
8984 if (IS_NOT_NULL(ext) && ext->callout_num > 0) {
8985 r = setup_ext_callout_list_values(reg);
8986 }
8987 #endif
8988
8989 return r;
8990 }
8991
8992 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)8993 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
8994 UChar* arg, UChar* arg_end)
8995 {
8996 env->error = arg;
8997 env->error_end = arg_end;
8998 }
8999