1 /********************************************************************** 2 regparse.c - Oniguruma (regular expression library) 3 **********************************************************************/ 4 /*- 5 * Copyright (c) 2002-2020 K.Kosako 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL standard_initdbnull22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
28 */ 29 30 #ifdef DEBUG_NODE_FREE 31 #ifndef NEED_TO_INCLUDE_STDIO 32 #define NEED_TO_INCLUDE_STDIO 33 #endif 34 #endif 35 36 #include "regparse.h" 37 #include "st.h" 38 39 #define INIT_TAG_NAMES_ALLOC_NUM 5 40 41 #define WARN_BUFSIZE 256 42 43 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 44 45 #define IS_ALLOWED_CODE_IN_CALLOUT_NAME(c) \ 46 ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_' /* || c == '!' */) 47 #define IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c) \ 48 ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_') 49 50 #define OPTON_SINGLELINE(option) ((option) & ONIG_OPTION_SINGLELINE) 51 #define OPTON_MULTILINE(option) ((option) & ONIG_OPTION_MULTILINE) 52 #define OPTON_IGNORECASE(option) ((option) & ONIG_OPTION_IGNORECASE) 53 #define OPTON_EXTEND(option) ((option) & ONIG_OPTION_EXTEND) 54 #define OPTON_WORD_ASCII(option) \ 55 ((option) & (ONIG_OPTION_WORD_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII)) 56 #define OPTON_DIGIT_ASCII(option) \ 57 ((option) & (ONIG_OPTION_DIGIT_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII)) 58 #define OPTON_SPACE_ASCII(option) \ 59 ((option) & (ONIG_OPTION_SPACE_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII)) 60 #define OPTON_POSIX_ASCII(option) ((option) & ONIG_OPTION_POSIX_IS_ASCII) 61 #define OPTON_TEXT_SEGMENT_WORD(option) ((option) & ONIG_OPTION_TEXT_SEGMENT_WORD) 62 63 #define OPTON_IS_ASCII_MODE_CTYPE(ctype, options) \ 64 ((ctype) >= 0 && \ 65 (((ctype) < ONIGENC_CTYPE_ASCII && OPTON_POSIX_ASCII(options)) ||\ 66 ((ctype) == ONIGENC_CTYPE_WORD && OPTON_WORD_ASCII(options)) ||\ 67 ((ctype) == ONIGENC_CTYPE_DIGIT && OPTON_DIGIT_ASCII(options)) ||\ 68 ((ctype) == ONIGENC_CTYPE_SPACE && OPTON_SPACE_ASCII(options)))) 69 70 71 OnigSyntaxType OnigSyntaxOniguruma = { 72 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | 73 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | 74 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL | 75 ONIG_SYN_OP_ESC_CONTROL_CHARS | 
76 ONIG_SYN_OP_ESC_C_CONTROL ) 77 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) 78 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | 79 ONIG_SYN_OP2_OPTION_ONIGURUMA | 80 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF | 81 ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE | 82 ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP | 83 ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS | 84 ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME | 85 ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT | 86 ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE | 87 ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT | 88 ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP | 89 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | 90 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | 91 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | 92 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | 93 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | 94 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | 95 ONIG_SYN_OP2_ESC_H_XDIGIT | ONIG_SYN_OP2_ESC_U_HEX4 ) 96 , ( SYN_GNU_REGEX_BV | 97 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | 98 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | 99 ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND | 100 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | 101 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | 102 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | 103 ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC | 104 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | 105 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) 106 , ONIG_OPTION_NONE 107 , 108 { 109 (OnigCodePoint )'\\' /* esc */ 110 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ 111 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ 112 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ 113 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ 114 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ 115 } 116 }; 117 118 OnigSyntaxType OnigSyntaxRuby = { 119 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | 120 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | 121 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL | 122 ONIG_SYN_OP_ESC_CONTROL_CHARS | 123 ONIG_SYN_OP_ESC_C_CONTROL ) 124 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) 125 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | 126 ONIG_SYN_OP2_OPTION_RUBY | 127 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF | 128 ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE | 129 ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP | 130 ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT | 131 ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE | 132 ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP | 133 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | 134 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | 135 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | 136 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | 137 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | 138 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | 139 ONIG_SYN_OP2_ESC_H_XDIGIT | ONIG_SYN_OP2_ESC_U_HEX4 ) 140 , ( SYN_GNU_REGEX_BV | 141 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | 142 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | 143 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | 144 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | 145 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | 146 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | 147 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) 148 , ONIG_OPTION_NONE 149 , 150 { 151 (OnigCodePoint )'\\' /* esc */ 152 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ 153 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ 154 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ 155 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ 156 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ 157 } 158 }; 159 160 OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_ONIGURUMA; 161 162 extern void onig_null_warn(const char* s ARG_UNUSED) { } 163 164 #ifdef DEFAULT_WARN_FUNCTION 165 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; 166 #else 167 static OnigWarnFunc onig_warn = onig_null_warn; 168 #endif 169 170 #ifdef DEFAULT_VERB_WARN_FUNCTION 171 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION; 172 #else 173 static OnigWarnFunc onig_verb_warn = onig_null_warn; 174 #endif 175 176 extern void onig_set_warn_func(OnigWarnFunc f) 177 { 178 onig_warn = f; 179 } 180 181 extern void onig_set_verb_warn_func(OnigWarnFunc f) 182 { 183 onig_verb_warn = f; 184 } 185 186 extern void 187 onig_warning(const char* s) 188 { 189 if (onig_warn == onig_null_warn) return ; 190 191 (*onig_warn)(s); 192 } 193 194 #define DEFAULT_MAX_CAPTURE_NUM 32767 195 196 static int MaxCaptureNum = DEFAULT_MAX_CAPTURE_NUM; 197 198 extern int 199 onig_set_capture_num_limit(int num) 200 { 201 if (num < 0) return -1; 202 203 MaxCaptureNum = num; 204 return 0; 205 } 206 207 static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT; 208 209 extern unsigned int 210 onig_get_parse_depth_limit(void) 211 { 212 return ParseDepthLimit; 213 } 214 215 extern int 216 onig_set_parse_depth_limit(unsigned int depth) 217 { 218 if (depth == 0) 219 ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT; 220 else 221 ParseDepthLimit = depth; 222 return 0; 223 } 224 225 #ifdef ONIG_DEBUG_PARSE 226 #define INC_PARSE_DEPTH(d) do {\ 227 (d)++;\ 228 if (env->max_parse_depth < (d)) env->max_parse_depth = d;\ 229 if ((d) > ParseDepthLimit) \ 230 return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\ 231 } while (0) 232 #else 233 #define INC_PARSE_DEPTH(d) do {\ 234 (d)++;\ 235 if ((d) > ParseDepthLimit) \ 236 return 
ONIGERR_PARSE_DEPTH_LIMIT_OVER;\ 237 } while (0) 238 #endif 239 240 #define DEC_PARSE_DEPTH(d) (d)-- 241 242 243 static int 244 bbuf_init(BBuf* buf, int size) 245 { 246 if (size <= 0) { 247 size = 0; 248 buf->p = NULL; 249 } 250 else { 251 buf->p = (UChar* )xmalloc(size); 252 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY); 253 } 254 255 buf->alloc = size; 256 buf->used = 0; 257 return 0; 258 } 259 260 static void 261 bbuf_free(BBuf* bbuf) 262 { 263 if (IS_NOT_NULL(bbuf)) { 264 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p); 265 xfree(bbuf); 266 } 267 } 268 269 static int 270 bbuf_clone(BBuf** rto, BBuf* from) 271 { 272 int r; 273 BBuf *to; 274 275 *rto = to = (BBuf* )xmalloc(sizeof(BBuf)); 276 CHECK_NULL_RETURN_MEMERR(to); 277 r = BB_INIT(to, from->alloc); 278 if (r != 0) { 279 xfree(to->p); 280 *rto = 0; 281 return r; 282 } 283 to->used = from->used; 284 xmemcpy(to->p, from->p, from->used); 285 return 0; 286 } 287 288 static int 289 backref_rel_to_abs(int rel_no, ScanEnv* env) 290 { 291 if (rel_no > 0) { 292 return env->num_mem + rel_no; 293 } 294 else { 295 return env->num_mem + 1 + rel_no; 296 } 297 } 298 299 #define OPTION_ON(v,f) ((v) |= (f)) 300 #define OPTION_OFF(v,f) ((v) &= ~(f)) 301 302 #define OPTION_NEGATE(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f)) 303 304 #define MBCODE_START_POS(enc) \ 305 (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80) 306 307 #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \ 308 add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0)) 309 310 #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\ 311 if (! 
ONIGENC_IS_SINGLEBYTE(enc)) {\ 312 r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\ 313 if (r != 0) return r;\ 314 }\ 315 } while (0) 316 317 318 #define BITSET_IS_EMPTY(bs,empty) do {\ 319 int i;\ 320 empty = 1;\ 321 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) {\ 322 if ((bs)[i] != 0) {\ 323 empty = 0; break;\ 324 }\ 325 }\ 326 } while (0) 327 328 static void 329 bitset_set_range(BitSetRef bs, int from, int to) 330 { 331 int i; 332 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) { 333 BITSET_SET_BIT(bs, i); 334 } 335 } 336 337 static void 338 bitset_invert(BitSetRef bs) 339 { 340 int i; 341 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { bs[i] = ~(bs[i]); } 342 } 343 344 static void 345 bitset_invert_to(BitSetRef from, BitSetRef to) 346 { 347 int i; 348 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { to[i] = ~(from[i]); } 349 } 350 351 static void 352 bitset_and(BitSetRef dest, BitSetRef bs) 353 { 354 int i; 355 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] &= bs[i]; } 356 } 357 358 static void 359 bitset_or(BitSetRef dest, BitSetRef bs) 360 { 361 int i; 362 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] |= bs[i]; } 363 } 364 365 static void 366 bitset_copy(BitSetRef dest, BitSetRef bs) 367 { 368 int i; 369 for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] = bs[i]; } 370 } 371 372 extern int 373 onig_strncmp(const UChar* s1, const UChar* s2, int n) 374 { 375 int x; 376 377 while (n-- > 0) { 378 x = *s2++ - *s1++; 379 if (x) return x; 380 } 381 return 0; 382 } 383 384 extern void 385 onig_strcpy(UChar* dest, const UChar* src, const UChar* end) 386 { 387 int len = (int )(end - src); 388 if (len > 0) { 389 xmemcpy(dest, src, len); 390 dest[len] = (UChar )0; 391 } 392 } 393 394 /* scan pattern methods */ 395 #define PEND_VALUE 0 396 397 #define PFETCH_READY UChar* pfetch_prev 398 #define PEND (p < end ? 
0 : 1) 399 #define PUNFETCH p = pfetch_prev 400 #define PINC do { \ 401 pfetch_prev = p; \ 402 p += ONIGENC_MBC_ENC_LEN(enc, p); \ 403 } while (0) 404 #define PFETCH(c) do { \ 405 c = ONIGENC_MBC_TO_CODE(enc, p, end); \ 406 pfetch_prev = p; \ 407 p += ONIGENC_MBC_ENC_LEN(enc, p); \ 408 } while (0) 409 410 #define PINC_S do { \ 411 p += ONIGENC_MBC_ENC_LEN(enc, p); \ 412 } while (0) 413 #define PFETCH_S(c) do { \ 414 c = ONIGENC_MBC_TO_CODE(enc, p, end); \ 415 p += ONIGENC_MBC_ENC_LEN(enc, p); \ 416 } while (0) 417 418 #define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE) 419 #define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c) 420 421 static UChar* 422 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end, 423 int capa) 424 { 425 UChar* r; 426 427 if (dest) 428 r = (UChar* )xrealloc(dest, capa + 1); 429 else 430 r = (UChar* )xmalloc(capa + 1); 431 432 CHECK_NULL_RETURN(r); 433 onig_strcpy(r + (dest_end - dest), src, src_end); 434 return r; 435 } 436 437 /* dest on static area */ 438 static UChar* 439 strcat_capa_from_static(UChar* dest, UChar* dest_end, 440 const UChar* src, const UChar* src_end, int capa) 441 { 442 UChar* r; 443 444 r = (UChar* )xmalloc(capa + 1); 445 CHECK_NULL_RETURN(r); 446 onig_strcpy(r, dest, dest_end); 447 onig_strcpy(r + (dest_end - dest), src, src_end); 448 return r; 449 } 450 451 452 #ifdef USE_ST_LIBRARY 453 454 typedef struct { 455 UChar* s; 456 UChar* end; 457 } st_str_end_key; 458 459 static int 460 str_end_cmp(st_str_end_key* x, st_str_end_key* y) 461 { 462 UChar *p, *q; 463 int c; 464 465 if ((x->end - x->s) != (y->end - y->s)) 466 return 1; 467 468 p = x->s; 469 q = y->s; 470 while (p < x->end) { 471 c = (int )*p - (int )*q; 472 if (c != 0) return c; 473 474 p++; q++; 475 } 476 477 return 0; 478 } 479 480 static int 481 str_end_hash(st_str_end_key* x) 482 { 483 UChar *p; 484 unsigned val = 0; 485 486 p = x->s; 487 while (p < x->end) { 488 val = val * 997 + (unsigned )*p++; 489 } 490 491 
return (int) (val + (val >> 5)); 492 } 493 494 extern hash_table_type 495 onig_st_init_strend_table_with_size(int size) 496 { 497 static struct st_hash_type hashType = { 498 str_end_cmp, 499 str_end_hash, 500 }; 501 502 return (hash_table_type )onig_st_init_table_with_size(&hashType, size); 503 } 504 505 extern int 506 onig_st_lookup_strend(hash_table_type table, const UChar* str_key, 507 const UChar* end_key, hash_data_type *value) 508 { 509 st_str_end_key key; 510 511 key.s = (UChar* )str_key; 512 key.end = (UChar* )end_key; 513 514 return onig_st_lookup(table, (st_data_t )(&key), value); 515 } 516 517 extern int 518 onig_st_insert_strend(hash_table_type table, const UChar* str_key, 519 const UChar* end_key, hash_data_type value) 520 { 521 st_str_end_key* key; 522 int result; 523 524 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key)); 525 CHECK_NULL_RETURN_MEMERR(key); 526 527 key->s = (UChar* )str_key; 528 key->end = (UChar* )end_key; 529 result = onig_st_insert(table, (st_data_t )key, value); 530 if (result) { 531 xfree(key); 532 } 533 return result; 534 } 535 536 537 #ifdef USE_CALLOUT 538 539 typedef struct { 540 OnigEncoding enc; 541 int type; /* callout type: single or not */ 542 UChar* s; 543 UChar* end; 544 } st_callout_name_key; 545 546 static int 547 callout_name_table_cmp(st_callout_name_key* x, st_callout_name_key* y) 548 { 549 UChar *p, *q; 550 int c; 551 552 if (x->enc != y->enc) return 1; 553 if (x->type != y->type) return 1; 554 if ((x->end - x->s) != (y->end - y->s)) 555 return 1; 556 557 p = x->s; 558 q = y->s; 559 while (p < x->end) { 560 c = (int )*p - (int )*q; 561 if (c != 0) return c; 562 563 p++; q++; 564 } 565 566 return 0; 567 } 568 569 static int 570 callout_name_table_hash(st_callout_name_key* x) 571 { 572 UChar *p; 573 unsigned int val = 0; 574 575 p = x->s; 576 while (p < x->end) { 577 val = val * 997 + (unsigned int )*p++; 578 } 579 580 /* use intptr_t for escape warning in Windows */ 581 return (int )(val + (val >> 5) + 
((intptr_t )x->enc & 0xffff) + x->type); 582 } 583 584 extern hash_table_type 585 onig_st_init_callout_name_table_with_size(int size) 586 { 587 static struct st_hash_type hashType = { 588 callout_name_table_cmp, 589 callout_name_table_hash, 590 }; 591 592 return (hash_table_type )onig_st_init_table_with_size(&hashType, size); 593 } 594 595 extern int 596 onig_st_lookup_callout_name_table(hash_table_type table, 597 OnigEncoding enc, 598 int type, 599 const UChar* str_key, 600 const UChar* end_key, 601 hash_data_type *value) 602 { 603 st_callout_name_key key; 604 605 key.enc = enc; 606 key.type = type; 607 key.s = (UChar* )str_key; 608 key.end = (UChar* )end_key; 609 610 return onig_st_lookup(table, (st_data_t )(&key), value); 611 } 612 613 static int 614 st_insert_callout_name_table(hash_table_type table, 615 OnigEncoding enc, int type, 616 UChar* str_key, UChar* end_key, 617 hash_data_type value) 618 { 619 st_callout_name_key* key; 620 int result; 621 622 key = (st_callout_name_key* )xmalloc(sizeof(st_callout_name_key)); 623 CHECK_NULL_RETURN_MEMERR(key); 624 625 /* key->s: don't duplicate, because str_key is duped in callout_name_entry() */ 626 key->enc = enc; 627 key->type = type; 628 key->s = str_key; 629 key->end = end_key; 630 result = onig_st_insert(table, (st_data_t )key, value); 631 if (result) { 632 xfree(key); 633 } 634 return result; 635 } 636 #endif 637 638 #endif /* USE_ST_LIBRARY */ 639 640 641 #define INIT_NAME_BACKREFS_ALLOC_NUM 8 642 643 typedef struct { 644 UChar* name; 645 int name_len; /* byte length */ 646 int back_num; /* number of backrefs */ 647 int back_alloc; 648 int back_ref1; 649 int* back_refs; 650 } NameEntry; 651 652 #ifdef USE_ST_LIBRARY 653 654 #define INIT_NAMES_ALLOC_NUM 5 655 656 typedef st_table NameTable; 657 typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ 658 659 #define NAMEBUF_SIZE 24 660 #define NAMEBUF_SIZE_1 25 661 662 #ifdef ONIG_DEBUG 663 static int 664 i_print_name_entry(UChar* key, 
NameEntry* e, void* arg) 665 { 666 int i; 667 FILE* fp = (FILE* )arg; 668 669 fprintf(fp, "%s: ", e->name); 670 if (e->back_num == 0) 671 fputs("-", fp); 672 else if (e->back_num == 1) 673 fprintf(fp, "%d", e->back_ref1); 674 else { 675 for (i = 0; i < e->back_num; i++) { 676 if (i > 0) fprintf(fp, ", "); 677 fprintf(fp, "%d", e->back_refs[i]); 678 } 679 } 680 fputs("\n", fp); 681 return ST_CONTINUE; 682 } 683 684 extern int 685 onig_print_names(FILE* fp, regex_t* reg) 686 { 687 NameTable* t = (NameTable* )reg->name_table; 688 689 if (IS_NOT_NULL(t)) { 690 fprintf(fp, "name table\n"); 691 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp); 692 fputs("\n", fp); 693 } 694 return 0; 695 } 696 #endif /* ONIG_DEBUG */ 697 698 static int 699 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED) 700 { 701 xfree(e->name); 702 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); 703 xfree(key); 704 xfree(e); 705 return ST_DELETE; 706 } 707 708 static int 709 names_clear(regex_t* reg) 710 { 711 NameTable* t = (NameTable* )reg->name_table; 712 713 if (IS_NOT_NULL(t)) { 714 onig_st_foreach(t, i_free_name_entry, 0); 715 } 716 return 0; 717 } 718 719 extern int 720 onig_names_free(regex_t* reg) 721 { 722 int r; 723 NameTable* t; 724 725 r = names_clear(reg); 726 if (r != 0) return r; 727 728 t = (NameTable* )reg->name_table; 729 if (IS_NOT_NULL(t)) onig_st_free_table(t); 730 reg->name_table = (void* )NULL; 731 return 0; 732 } 733 734 static NameEntry* 735 name_find(regex_t* reg, const UChar* name, const UChar* name_end) 736 { 737 NameEntry* e; 738 NameTable* t = (NameTable* )reg->name_table; 739 740 e = (NameEntry* )NULL; 741 if (IS_NOT_NULL(t)) { 742 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e))); 743 } 744 return e; 745 } 746 747 typedef struct { 748 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*); 749 regex_t* reg; 750 void* arg; 751 int ret; 752 OnigEncoding enc; 753 } INamesArg; 754 755 static int 756 
i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg) 757 { 758 int r = (*(arg->func))(e->name, 759 e->name + e->name_len, 760 e->back_num, 761 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), 762 arg->reg, arg->arg); 763 if (r != 0) { 764 arg->ret = r; 765 return ST_STOP; 766 } 767 return ST_CONTINUE; 768 } 769 770 extern int 771 onig_foreach_name(regex_t* reg, 772 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) 773 { 774 INamesArg narg; 775 NameTable* t = (NameTable* )reg->name_table; 776 777 narg.ret = 0; 778 if (IS_NOT_NULL(t)) { 779 narg.func = func; 780 narg.reg = reg; 781 narg.arg = arg; 782 narg.enc = reg->enc; /* should be pattern encoding. */ 783 onig_st_foreach(t, i_names, (HashDataType )&narg); 784 } 785 return narg.ret; 786 } 787 788 static int 789 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumMap* map) 790 { 791 int i; 792 793 if (e->back_num > 1) { 794 for (i = 0; i < e->back_num; i++) { 795 e->back_refs[i] = map[e->back_refs[i]].new_val; 796 } 797 } 798 else if (e->back_num == 1) { 799 e->back_ref1 = map[e->back_ref1].new_val; 800 } 801 802 return ST_CONTINUE; 803 } 804 805 extern int 806 onig_renumber_name_table(regex_t* reg, GroupNumMap* map) 807 { 808 NameTable* t = (NameTable* )reg->name_table; 809 810 if (IS_NOT_NULL(t)) { 811 onig_st_foreach(t, i_renumber_name, (HashDataType )map); 812 } 813 return 0; 814 } 815 816 817 extern int 818 onig_number_of_names(regex_t* reg) 819 { 820 NameTable* t = (NameTable* )reg->name_table; 821 822 if (IS_NOT_NULL(t)) 823 return t->num_entries; 824 else 825 return 0; 826 } 827 828 #else /* USE_ST_LIBRARY */ 829 830 #define INIT_NAMES_ALLOC_NUM 8 831 832 typedef struct { 833 NameEntry* e; 834 int num; 835 int alloc; 836 } NameTable; 837 838 #ifdef ONIG_DEBUG 839 extern int 840 onig_print_names(FILE* fp, regex_t* reg) 841 { 842 int i, j; 843 NameEntry* e; 844 NameTable* t = (NameTable* )reg->name_table; 845 846 if (IS_NOT_NULL(t) && t->num > 0) { 847 fprintf(fp, 
"name table\n"); 848 for (i = 0; i < t->num; i++) { 849 e = &(t->e[i]); 850 fprintf(fp, "%s: ", e->name); 851 if (e->back_num == 0) { 852 fputs("-", fp); 853 } 854 else if (e->back_num == 1) { 855 fprintf(fp, "%d", e->back_ref1); 856 } 857 else { 858 for (j = 0; j < e->back_num; j++) { 859 if (j > 0) fprintf(fp, ", "); 860 fprintf(fp, "%d", e->back_refs[j]); 861 } 862 } 863 fputs("\n", fp); 864 } 865 fputs("\n", fp); 866 } 867 return 0; 868 } 869 #endif 870 871 static int 872 names_clear(regex_t* reg) 873 { 874 int i; 875 NameEntry* e; 876 NameTable* t = (NameTable* )reg->name_table; 877 878 if (IS_NOT_NULL(t)) { 879 for (i = 0; i < t->num; i++) { 880 e = &(t->e[i]); 881 if (IS_NOT_NULL(e->name)) { 882 xfree(e->name); 883 e->name = NULL; 884 e->name_len = 0; 885 e->back_num = 0; 886 e->back_alloc = 0; 887 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); 888 e->back_refs = (int* )NULL; 889 } 890 } 891 if (IS_NOT_NULL(t->e)) { 892 xfree(t->e); 893 t->e = NULL; 894 } 895 t->num = 0; 896 } 897 return 0; 898 } 899 900 extern int 901 onig_names_free(regex_t* reg) 902 { 903 int r; 904 NameTable* t; 905 906 r = names_clear(reg); 907 if (r != 0) return r; 908 909 t = (NameTable* )reg->name_table; 910 if (IS_NOT_NULL(t)) xfree(t); 911 reg->name_table = NULL; 912 return 0; 913 } 914 915 static NameEntry* 916 name_find(regex_t* reg, UChar* name, UChar* name_end) 917 { 918 int i, len; 919 NameEntry* e; 920 NameTable* t = (NameTable* )reg->name_table; 921 922 if (IS_NOT_NULL(t)) { 923 len = name_end - name; 924 for (i = 0; i < t->num; i++) { 925 e = &(t->e[i]); 926 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0) 927 return e; 928 } 929 } 930 return (NameEntry* )NULL; 931 } 932 933 extern int 934 onig_foreach_name(regex_t* reg, 935 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) 936 { 937 int i, r; 938 NameEntry* e; 939 NameTable* t = (NameTable* )reg->name_table; 940 941 if (IS_NOT_NULL(t)) { 942 for (i = 0; i < t->num; i++) { 943 
e = &(t->e[i]); 944 r = (*func)(e->name, e->name + e->name_len, e->back_num, 945 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), 946 reg, arg); 947 if (r != 0) return r; 948 } 949 } 950 return 0; 951 } 952 953 extern int 954 onig_number_of_names(regex_t* reg) 955 { 956 NameTable* t = (NameTable* )reg->name_table; 957 958 if (IS_NOT_NULL(t)) 959 return t->num; 960 else 961 return 0; 962 } 963 964 #endif /* else USE_ST_LIBRARY */ 965 966 static int 967 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) 968 { 969 int r; 970 int alloc; 971 NameEntry* e; 972 NameTable* t = (NameTable* )reg->name_table; 973 974 if (name_end - name <= 0) 975 return ONIGERR_EMPTY_GROUP_NAME; 976 977 e = name_find(reg, name, name_end); 978 if (IS_NULL(e)) { 979 #ifdef USE_ST_LIBRARY 980 if (IS_NULL(t)) { 981 t = onig_st_init_strend_table_with_size(INIT_NAMES_ALLOC_NUM); 982 CHECK_NULL_RETURN_MEMERR(t); 983 reg->name_table = (void* )t; 984 } 985 e = (NameEntry* )xmalloc(sizeof(NameEntry)); 986 CHECK_NULL_RETURN_MEMERR(e); 987 988 e->name = onigenc_strdup(reg->enc, name, name_end); 989 if (IS_NULL(e->name)) { 990 xfree(e); return ONIGERR_MEMORY; 991 } 992 r = onig_st_insert_strend(t, e->name, (e->name + (name_end - name)), 993 (HashDataType )e); 994 if (r < 0) return r; 995 996 e->name_len = (int )(name_end - name); 997 e->back_num = 0; 998 e->back_alloc = 0; 999 e->back_refs = (int* )NULL; 1000 1001 #else 1002 1003 if (IS_NULL(t)) { 1004 alloc = INIT_NAMES_ALLOC_NUM; 1005 t = (NameTable* )xmalloc(sizeof(NameTable)); 1006 CHECK_NULL_RETURN_MEMERR(t); 1007 t->e = NULL; 1008 t->alloc = 0; 1009 t->num = 0; 1010 1011 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc); 1012 if (IS_NULL(t->e)) { 1013 xfree(t); 1014 return ONIGERR_MEMORY; 1015 } 1016 t->alloc = alloc; 1017 reg->name_table = t; 1018 goto clear; 1019 } 1020 else if (t->num == t->alloc) { 1021 int i; 1022 1023 alloc = t->alloc * 2; 1024 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc); 
1025 CHECK_NULL_RETURN_MEMERR(t->e); 1026 t->alloc = alloc; 1027 1028 clear: 1029 for (i = t->num; i < t->alloc; i++) { 1030 t->e[i].name = NULL; 1031 t->e[i].name_len = 0; 1032 t->e[i].back_num = 0; 1033 t->e[i].back_alloc = 0; 1034 t->e[i].back_refs = (int* )NULL; 1035 } 1036 } 1037 e = &(t->e[t->num]); 1038 t->num++; 1039 e->name = onigenc_strdup(reg->enc, name, name_end); 1040 if (IS_NULL(e->name)) return ONIGERR_MEMORY; 1041 e->name_len = name_end - name; 1042 #endif 1043 } 1044 1045 if (e->back_num >= 1 && 1046 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) { 1047 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, 1048 name, name_end); 1049 return ONIGERR_MULTIPLEX_DEFINED_NAME; 1050 } 1051 1052 e->back_num++; 1053 if (e->back_num == 1) { 1054 e->back_ref1 = backref; 1055 } 1056 else { 1057 if (e->back_num == 2) { 1058 alloc = INIT_NAME_BACKREFS_ALLOC_NUM; 1059 e->back_refs = (int* )xmalloc(sizeof(int) * alloc); 1060 CHECK_NULL_RETURN_MEMERR(e->back_refs); 1061 e->back_alloc = alloc; 1062 e->back_refs[0] = e->back_ref1; 1063 e->back_refs[1] = backref; 1064 } 1065 else { 1066 if (e->back_num > e->back_alloc) { 1067 alloc = e->back_alloc * 2; 1068 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc); 1069 CHECK_NULL_RETURN_MEMERR(e->back_refs); 1070 e->back_alloc = alloc; 1071 } 1072 e->back_refs[e->back_num - 1] = backref; 1073 } 1074 } 1075 1076 return 0; 1077 } 1078 1079 extern int 1080 onig_name_to_group_numbers(regex_t* reg, const UChar* name, 1081 const UChar* name_end, int** nums) 1082 { 1083 NameEntry* e = name_find(reg, name, name_end); 1084 1085 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE; 1086 1087 switch (e->back_num) { 1088 case 0: 1089 break; 1090 case 1: 1091 *nums = &(e->back_ref1); 1092 break; 1093 default: 1094 *nums = e->back_refs; 1095 break; 1096 } 1097 return e->back_num; 1098 } 1099 1100 static int 1101 name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* 
name_end, 1102 int** nums) 1103 { 1104 regex_t* reg; 1105 NameEntry* e; 1106 1107 reg = env->reg; 1108 e = name_find(reg, name, name_end); 1109 1110 if (IS_NULL(e)) { 1111 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, 1112 (UChar* )name, (UChar* )name_end); 1113 return ONIGERR_UNDEFINED_NAME_REFERENCE; 1114 } 1115 1116 switch (e->back_num) { 1117 case 0: 1118 break; 1119 case 1: 1120 *nums = &(e->back_ref1); 1121 break; 1122 default: 1123 *nums = e->back_refs; 1124 break; 1125 } 1126 return e->back_num; 1127 } 1128 1129 extern int 1130 onig_name_to_backref_number(regex_t* reg, const UChar* name, 1131 const UChar* name_end, OnigRegion *region) 1132 { 1133 int i, n, *nums; 1134 1135 n = onig_name_to_group_numbers(reg, name, name_end, &nums); 1136 if (n < 0) 1137 return n; 1138 else if (n == 0) 1139 return ONIGERR_PARSER_BUG; 1140 else if (n == 1) 1141 return nums[0]; 1142 else { 1143 if (IS_NOT_NULL(region)) { 1144 for (i = n - 1; i >= 0; i--) { 1145 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS) 1146 return nums[i]; 1147 } 1148 } 1149 return nums[n - 1]; 1150 } 1151 } 1152 1153 extern int 1154 onig_noname_group_capture_is_active(regex_t* reg) 1155 { 1156 if (OPTON_DONT_CAPTURE_GROUP(reg->options)) 1157 return 0; 1158 1159 if (onig_number_of_names(reg) > 0 && 1160 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && 1161 ! 
OPTON_CAPTURE_GROUP(reg->options)) { 1162 return 0; 1163 } 1164 1165 return 1; 1166 } 1167 1168 #ifdef USE_CALLOUT 1169 1170 typedef struct { 1171 OnigCalloutType type; 1172 int in; 1173 OnigCalloutFunc start_func; 1174 OnigCalloutFunc end_func; 1175 int arg_num; 1176 int opt_arg_num; 1177 unsigned int arg_types[ONIG_CALLOUT_MAX_ARGS_NUM]; 1178 OnigValue opt_defaults[ONIG_CALLOUT_MAX_ARGS_NUM]; 1179 UChar* name; /* reference to GlobalCalloutNameTable entry: e->name */ 1180 } CalloutNameListEntry; 1181 1182 typedef struct { 1183 int n; 1184 int alloc; 1185 CalloutNameListEntry* v; 1186 } CalloutNameListType; 1187 1188 static CalloutNameListType* GlobalCalloutNameList; 1189 1190 static int 1191 make_callout_func_list(CalloutNameListType** rs, int init_size) 1192 { 1193 CalloutNameListType* s; 1194 CalloutNameListEntry* v; 1195 1196 *rs = 0; 1197 1198 s = xmalloc(sizeof(*s)); 1199 if (IS_NULL(s)) return ONIGERR_MEMORY; 1200 1201 v = (CalloutNameListEntry* )xmalloc(sizeof(CalloutNameListEntry) * init_size); 1202 if (IS_NULL(v)) { 1203 xfree(s); 1204 return ONIGERR_MEMORY; 1205 } 1206 1207 s->n = 0; 1208 s->alloc = init_size; 1209 s->v = v; 1210 1211 *rs = s; 1212 return ONIG_NORMAL; 1213 } 1214 1215 static void 1216 free_callout_func_list(CalloutNameListType* s) 1217 { 1218 if (IS_NOT_NULL(s)) { 1219 if (IS_NOT_NULL(s->v)) { 1220 int i, j; 1221 1222 for (i = 0; i < s->n; i++) { 1223 CalloutNameListEntry* e = s->v + i; 1224 for (j = e->arg_num - e->opt_arg_num; j < e->arg_num; j++) { 1225 if (e->arg_types[j] == ONIG_TYPE_STRING) { 1226 UChar* p = e->opt_defaults[j].s.start; 1227 if (IS_NOT_NULL(p)) xfree(p); 1228 } 1229 } 1230 } 1231 xfree(s->v); 1232 } 1233 xfree(s); 1234 } 1235 } 1236 1237 static int 1238 callout_func_list_add(CalloutNameListType* s, int* rid) 1239 { 1240 if (s->n >= s->alloc) { 1241 int new_size = s->alloc * 2; 1242 CalloutNameListEntry* nv = (CalloutNameListEntry* ) 1243 xrealloc(s->v, sizeof(CalloutNameListEntry) * new_size); 1244 if (IS_NULL(nv)) 
return ONIGERR_MEMORY; 1245 1246 s->alloc = new_size; 1247 s->v = nv; 1248 } 1249 1250 *rid = s->n; 1251 1252 xmemset(&(s->v[s->n]), 0, sizeof(*(s->v))); 1253 s->n++; 1254 return ONIG_NORMAL; 1255 } 1256 1257 1258 typedef struct { 1259 UChar* name; 1260 int name_len; /* byte length */ 1261 int id; 1262 } CalloutNameEntry; 1263 1264 #ifdef USE_ST_LIBRARY 1265 typedef st_table CalloutNameTable; 1266 #else 1267 typedef struct { 1268 CalloutNameEntry* e; 1269 int num; 1270 int alloc; 1271 } CalloutNameTable; 1272 #endif 1273 1274 static CalloutNameTable* GlobalCalloutNameTable; 1275 static int CalloutNameIDCounter; 1276 1277 #ifdef USE_ST_LIBRARY 1278 1279 static int 1280 i_free_callout_name_entry(st_callout_name_key* key, CalloutNameEntry* e, 1281 void* arg ARG_UNUSED) 1282 { 1283 xfree(e->name); 1284 /*xfree(key->s); */ /* is same as e->name */ 1285 xfree(key); 1286 xfree(e); 1287 return ST_DELETE; 1288 } 1289 1290 static int 1291 callout_name_table_clear(CalloutNameTable* t) 1292 { 1293 if (IS_NOT_NULL(t)) { 1294 onig_st_foreach(t, i_free_callout_name_entry, 0); 1295 } 1296 return 0; 1297 } 1298 1299 static int 1300 global_callout_name_table_free(void) 1301 { 1302 if (IS_NOT_NULL(GlobalCalloutNameTable)) { 1303 int r = callout_name_table_clear(GlobalCalloutNameTable); 1304 if (r != 0) return r; 1305 1306 onig_st_free_table(GlobalCalloutNameTable); 1307 GlobalCalloutNameTable = 0; 1308 CalloutNameIDCounter = 0; 1309 } 1310 1311 return 0; 1312 } 1313 1314 static CalloutNameEntry* 1315 callout_name_find(OnigEncoding enc, int is_not_single, 1316 const UChar* name, const UChar* name_end) 1317 { 1318 int r; 1319 CalloutNameEntry* e; 1320 CalloutNameTable* t = GlobalCalloutNameTable; 1321 1322 e = (CalloutNameEntry* )NULL; 1323 if (IS_NOT_NULL(t)) { 1324 r = onig_st_lookup_callout_name_table(t, enc, is_not_single, name, name_end, 1325 (HashDataType* )((void* )(&e))); 1326 if (r == 0) { /* not found */ 1327 if (enc != ONIG_ENCODING_ASCII && 1328 
ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) { 1329 enc = ONIG_ENCODING_ASCII; 1330 onig_st_lookup_callout_name_table(t, enc, is_not_single, name, name_end, 1331 (HashDataType* )((void* )(&e))); 1332 } 1333 } 1334 } 1335 return e; 1336 } 1337 1338 #else 1339 1340 static int 1341 callout_name_table_clear(CalloutNameTable* t) 1342 { 1343 int i; 1344 CalloutNameEntry* e; 1345 1346 if (IS_NOT_NULL(t)) { 1347 for (i = 0; i < t->num; i++) { 1348 e = &(t->e[i]); 1349 if (IS_NOT_NULL(e->name)) { 1350 xfree(e->name); 1351 e->name = NULL; 1352 e->name_len = 0; 1353 e->id = 0; 1354 e->func = 0; 1355 } 1356 } 1357 if (IS_NOT_NULL(t->e)) { 1358 xfree(t->e); 1359 t->e = NULL; 1360 } 1361 t->num = 0; 1362 } 1363 return 0; 1364 } 1365 1366 static int 1367 global_callout_name_table_free(void) 1368 { 1369 if (IS_NOT_NULL(GlobalCalloutNameTable)) { 1370 int r = callout_name_table_clear(GlobalCalloutNameTable); 1371 if (r != 0) return r; 1372 1373 xfree(GlobalCalloutNameTable); 1374 GlobalCalloutNameTable = 0; 1375 CalloutNameIDCounter = 0; 1376 } 1377 return 0; 1378 } 1379 1380 static CalloutNameEntry* 1381 callout_name_find(UChar* name, UChar* name_end) 1382 { 1383 int i, len; 1384 CalloutNameEntry* e; 1385 CalloutNameTable* t = Calloutnames; 1386 1387 if (IS_NOT_NULL(t)) { 1388 len = name_end - name; 1389 for (i = 0; i < t->num; i++) { 1390 e = &(t->e[i]); 1391 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0) 1392 return e; 1393 } 1394 } 1395 return (CalloutNameEntry* )NULL; 1396 } 1397 1398 #endif 1399 1400 /* name string must be single byte char string. 
*/ 1401 static int 1402 callout_name_entry(CalloutNameEntry** rentry, OnigEncoding enc, 1403 int is_not_single, UChar* name, UChar* name_end) 1404 { 1405 int r; 1406 CalloutNameEntry* e; 1407 CalloutNameTable* t = GlobalCalloutNameTable; 1408 1409 *rentry = 0; 1410 if (name_end - name <= 0) 1411 return ONIGERR_INVALID_CALLOUT_NAME; 1412 1413 e = callout_name_find(enc, is_not_single, name, name_end); 1414 if (IS_NULL(e)) { 1415 #ifdef USE_ST_LIBRARY 1416 if (IS_NULL(t)) { 1417 t = onig_st_init_callout_name_table_with_size(INIT_NAMES_ALLOC_NUM); 1418 CHECK_NULL_RETURN_MEMERR(t); 1419 GlobalCalloutNameTable = t; 1420 } 1421 e = (CalloutNameEntry* )xmalloc(sizeof(CalloutNameEntry)); 1422 CHECK_NULL_RETURN_MEMERR(e); 1423 1424 e->name = onigenc_strdup(enc, name, name_end); 1425 if (IS_NULL(e->name)) { 1426 xfree(e); return ONIGERR_MEMORY; 1427 } 1428 1429 r = st_insert_callout_name_table(t, enc, is_not_single, 1430 e->name, (e->name + (name_end - name)), 1431 (HashDataType )e); 1432 if (r < 0) return r; 1433 1434 #else 1435 1436 int alloc; 1437 1438 if (IS_NULL(t)) { 1439 alloc = INIT_NAMES_ALLOC_NUM; 1440 t = (CalloutNameTable* )xmalloc(sizeof(CalloutNameTable)); 1441 CHECK_NULL_RETURN_MEMERR(t); 1442 t->e = NULL; 1443 t->alloc = 0; 1444 t->num = 0; 1445 1446 t->e = (CalloutNameEntry* )xmalloc(sizeof(CalloutNameEntry) * alloc); 1447 if (IS_NULL(t->e)) { 1448 xfree(t); 1449 return ONIGERR_MEMORY; 1450 } 1451 t->alloc = alloc; 1452 GlobalCalloutNameTable = t; 1453 goto clear; 1454 } 1455 else if (t->num == t->alloc) { 1456 int i; 1457 1458 alloc = t->alloc * 2; 1459 t->e = (CalloutNameEntry* )xrealloc(t->e, sizeof(CalloutNameEntry) * alloc); 1460 CHECK_NULL_RETURN_MEMERR(t->e); 1461 t->alloc = alloc; 1462 1463 clear: 1464 for (i = t->num; i < t->alloc; i++) { 1465 t->e[i].name = NULL; 1466 t->e[i].name_len = 0; 1467 t->e[i].id = 0; 1468 } 1469 } 1470 e = &(t->e[t->num]); 1471 t->num++; 1472 e->name = onigenc_strdup(enc, name, name_end); 1473 if (IS_NULL(e->name)) return 
ONIGERR_MEMORY; 1474 #endif 1475 1476 CalloutNameIDCounter++; 1477 e->id = CalloutNameIDCounter; 1478 e->name_len = (int )(name_end - name); 1479 } 1480 1481 *rentry = e; 1482 return e->id; 1483 } 1484 1485 static int 1486 is_allowed_callout_name(OnigEncoding enc, UChar* name, UChar* name_end) 1487 { 1488 UChar* p; 1489 OnigCodePoint c; 1490 1491 if (name >= name_end) return 0; 1492 1493 p = name; 1494 while (p < name_end) { 1495 c = ONIGENC_MBC_TO_CODE(enc, p, name_end); 1496 if (! IS_ALLOWED_CODE_IN_CALLOUT_NAME(c)) 1497 return 0; 1498 1499 if (p == name) { 1500 if (c >= '0' && c <= '9') return 0; 1501 } 1502 1503 p += ONIGENC_MBC_ENC_LEN(enc, p); 1504 } 1505 1506 return 1; 1507 } 1508 1509 static int 1510 is_allowed_callout_tag_name(OnigEncoding enc, UChar* name, UChar* name_end) 1511 { 1512 UChar* p; 1513 OnigCodePoint c; 1514 1515 if (name >= name_end) return 0; 1516 1517 p = name; 1518 while (p < name_end) { 1519 c = ONIGENC_MBC_TO_CODE(enc, p, name_end); 1520 if (! IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c)) 1521 return 0; 1522 1523 if (p == name) { 1524 if (c >= '0' && c <= '9') return 0; 1525 } 1526 1527 p += ONIGENC_MBC_ENC_LEN(enc, p); 1528 } 1529 1530 return 1; 1531 } 1532 1533 extern int 1534 onig_set_callout_of_name(OnigEncoding enc, OnigCalloutType callout_type, 1535 UChar* name, UChar* name_end, int in, 1536 OnigCalloutFunc start_func, 1537 OnigCalloutFunc end_func, 1538 int arg_num, unsigned int arg_types[], 1539 int opt_arg_num, OnigValue opt_defaults[]) 1540 { 1541 int r; 1542 int i; 1543 int j; 1544 int id; 1545 int is_not_single; 1546 CalloutNameEntry* e; 1547 CalloutNameListEntry* fe; 1548 1549 if (callout_type != ONIG_CALLOUT_TYPE_SINGLE) 1550 return ONIGERR_INVALID_ARGUMENT; 1551 1552 if (arg_num < 0 || arg_num > ONIG_CALLOUT_MAX_ARGS_NUM) 1553 return ONIGERR_INVALID_CALLOUT_ARG; 1554 1555 if (opt_arg_num < 0 || opt_arg_num > arg_num) 1556 return ONIGERR_INVALID_CALLOUT_ARG; 1557 1558 if (start_func == 0 && end_func == 0) 1559 return 
ONIGERR_INVALID_CALLOUT_ARG; 1560 1561 if ((in & ONIG_CALLOUT_IN_PROGRESS) == 0 && (in & ONIG_CALLOUT_IN_RETRACTION) == 0) 1562 return ONIGERR_INVALID_CALLOUT_ARG; 1563 1564 for (i = 0; i < arg_num; i++) { 1565 unsigned int t = arg_types[i]; 1566 if (t == ONIG_TYPE_VOID) 1567 return ONIGERR_INVALID_CALLOUT_ARG; 1568 else { 1569 if (i >= arg_num - opt_arg_num) { 1570 if (t != ONIG_TYPE_LONG && t != ONIG_TYPE_CHAR && t != ONIG_TYPE_STRING && 1571 t != ONIG_TYPE_TAG) 1572 return ONIGERR_INVALID_CALLOUT_ARG; 1573 } 1574 else { 1575 if (t != ONIG_TYPE_LONG) { 1576 t = t & ~ONIG_TYPE_LONG; 1577 if (t != ONIG_TYPE_CHAR && t != ONIG_TYPE_STRING && t != ONIG_TYPE_TAG) 1578 return ONIGERR_INVALID_CALLOUT_ARG; 1579 } 1580 } 1581 } 1582 } 1583 1584 if (! is_allowed_callout_name(enc, name, name_end)) { 1585 return ONIGERR_INVALID_CALLOUT_NAME; 1586 } 1587 1588 is_not_single = (callout_type != ONIG_CALLOUT_TYPE_SINGLE); 1589 id = callout_name_entry(&e, enc, is_not_single, name, name_end); 1590 if (id < 0) return id; 1591 1592 r = ONIG_NORMAL; 1593 if (IS_NULL(GlobalCalloutNameList)) { 1594 r = make_callout_func_list(&GlobalCalloutNameList, 10); 1595 if (r != ONIG_NORMAL) return r; 1596 } 1597 1598 while (id >= GlobalCalloutNameList->n) { 1599 int rid; 1600 r = callout_func_list_add(GlobalCalloutNameList, &rid); 1601 if (r != ONIG_NORMAL) return r; 1602 } 1603 1604 fe = GlobalCalloutNameList->v + id; 1605 fe->type = callout_type; 1606 fe->in = in; 1607 fe->start_func = start_func; 1608 fe->end_func = end_func; 1609 fe->arg_num = arg_num; 1610 fe->opt_arg_num = opt_arg_num; 1611 fe->name = e->name; 1612 1613 for (i = 0; i < arg_num; i++) { 1614 fe->arg_types[i] = arg_types[i]; 1615 } 1616 for (i = arg_num - opt_arg_num, j = 0; i < arg_num; i++, j++) { 1617 if (IS_NULL(opt_defaults)) return ONIGERR_INVALID_ARGUMENT; 1618 if (fe->arg_types[i] == ONIG_TYPE_STRING) { 1619 OnigValue* val; 1620 UChar* ds; 1621 1622 val = opt_defaults + j; 1623 ds = onigenc_strdup(enc, val->s.start, 
val->s.end); 1624 CHECK_NULL_RETURN_MEMERR(ds); 1625 1626 fe->opt_defaults[i].s.start = ds; 1627 fe->opt_defaults[i].s.end = ds + (val->s.end - val->s.start); 1628 } 1629 else { 1630 fe->opt_defaults[i] = opt_defaults[j]; 1631 } 1632 } 1633 1634 r = id; 1635 return r; 1636 } 1637 1638 static int 1639 get_callout_name_id_by_name(OnigEncoding enc, int is_not_single, 1640 UChar* name, UChar* name_end, int* rid) 1641 { 1642 int r; 1643 CalloutNameEntry* e; 1644 1645 if (! is_allowed_callout_name(enc, name, name_end)) { 1646 return ONIGERR_INVALID_CALLOUT_NAME; 1647 } 1648 1649 e = callout_name_find(enc, is_not_single, name, name_end); 1650 if (IS_NULL(e)) { 1651 return ONIGERR_UNDEFINED_CALLOUT_NAME; 1652 } 1653 1654 r = ONIG_NORMAL; 1655 *rid = e->id; 1656 1657 return r; 1658 } 1659 1660 extern OnigCalloutFunc 1661 onig_get_callout_start_func(regex_t* reg, int callout_num) 1662 { 1663 /* If used for callouts of contents, return 0. */ 1664 CalloutListEntry* e; 1665 1666 e = onig_reg_callout_list_at(reg, callout_num); 1667 CHECK_NULL_RETURN(e); 1668 return e->start_func; 1669 } 1670 1671 extern const UChar* 1672 onig_get_callout_tag_start(regex_t* reg, int callout_num) 1673 { 1674 CalloutListEntry* e = onig_reg_callout_list_at(reg, callout_num); 1675 CHECK_NULL_RETURN(e); 1676 return e->tag_start; 1677 } 1678 1679 extern const UChar* 1680 onig_get_callout_tag_end(regex_t* reg, int callout_num) 1681 { 1682 CalloutListEntry* e = onig_reg_callout_list_at(reg, callout_num); 1683 CHECK_NULL_RETURN(e); 1684 return e->tag_end; 1685 } 1686 1687 1688 extern OnigCalloutType 1689 onig_get_callout_type_by_name_id(int name_id) 1690 { 1691 if (name_id < 0 || name_id >= GlobalCalloutNameList->n) 1692 return 0; 1693 1694 return GlobalCalloutNameList->v[name_id].type; 1695 } 1696 1697 extern OnigCalloutFunc 1698 onig_get_callout_start_func_by_name_id(int name_id) 1699 { 1700 if (name_id < 0 || name_id >= GlobalCalloutNameList->n) 1701 return 0; 1702 1703 return 
GlobalCalloutNameList->v[name_id].start_func; 1704 } 1705 1706 extern OnigCalloutFunc 1707 onig_get_callout_end_func_by_name_id(int name_id) 1708 { 1709 if (name_id < 0 || name_id >= GlobalCalloutNameList->n) 1710 return 0; 1711 1712 return GlobalCalloutNameList->v[name_id].end_func; 1713 } 1714 1715 extern int 1716 onig_get_callout_in_by_name_id(int name_id) 1717 { 1718 if (name_id < 0 || name_id >= GlobalCalloutNameList->n) 1719 return 0; 1720 1721 return GlobalCalloutNameList->v[name_id].in; 1722 } 1723 1724 static int 1725 get_callout_arg_num_by_name_id(int name_id) 1726 { 1727 return GlobalCalloutNameList->v[name_id].arg_num; 1728 } 1729 1730 static int 1731 get_callout_opt_arg_num_by_name_id(int name_id) 1732 { 1733 return GlobalCalloutNameList->v[name_id].opt_arg_num; 1734 } 1735 1736 static unsigned int 1737 get_callout_arg_type_by_name_id(int name_id, int index) 1738 { 1739 return GlobalCalloutNameList->v[name_id].arg_types[index]; 1740 } 1741 1742 static OnigValue 1743 get_callout_opt_default_by_name_id(int name_id, int index) 1744 { 1745 return GlobalCalloutNameList->v[name_id].opt_defaults[index]; 1746 } 1747 1748 extern UChar* 1749 onig_get_callout_name_by_name_id(int name_id) 1750 { 1751 if (name_id < 0 || name_id >= GlobalCalloutNameList->n) 1752 return 0; 1753 1754 return GlobalCalloutNameList->v[name_id].name; 1755 } 1756 1757 extern int 1758 onig_global_callout_names_free(void) 1759 { 1760 free_callout_func_list(GlobalCalloutNameList); 1761 GlobalCalloutNameList = 0; 1762 1763 global_callout_name_table_free(); 1764 return ONIG_NORMAL; 1765 } 1766 1767 1768 typedef st_table CalloutTagTable; 1769 typedef intptr_t CalloutTagVal; 1770 1771 #define CALLOUT_TAG_LIST_FLAG_TAG_EXIST (1<<0) 1772 1773 static int 1774 i_callout_callout_list_set(UChar* key, CalloutTagVal e, void* arg) 1775 { 1776 int num; 1777 RegexExt* ext = (RegexExt* )arg; 1778 1779 num = (int )e - 1; 1780 ext->callout_list[num].flag |= CALLOUT_TAG_LIST_FLAG_TAG_EXIST; 1781 return 
ST_CONTINUE; 1782 } 1783 1784 static int 1785 setup_ext_callout_list_values(regex_t* reg) 1786 { 1787 int i, j; 1788 RegexExt* ext; 1789 1790 ext = reg->extp; 1791 if (IS_NOT_NULL(ext->tag_table)) { 1792 onig_st_foreach((CalloutTagTable *)ext->tag_table, i_callout_callout_list_set, 1793 (st_data_t )ext); 1794 } 1795 1796 for (i = 0; i < ext->callout_num; i++) { 1797 CalloutListEntry* e = ext->callout_list + i; 1798 if (e->of == ONIG_CALLOUT_OF_NAME) { 1799 for (j = 0; j < e->u.arg.num; j++) { 1800 if (e->u.arg.types[j] == ONIG_TYPE_TAG) { 1801 UChar* start; 1802 UChar* end; 1803 int num; 1804 start = e->u.arg.vals[j].s.start; 1805 end = e->u.arg.vals[j].s.end; 1806 num = onig_get_callout_num_by_tag(reg, start, end); 1807 if (num < 0) return num; 1808 e->u.arg.vals[j].tag = num; 1809 } 1810 } 1811 } 1812 } 1813 1814 return ONIG_NORMAL; 1815 } 1816 1817 extern int 1818 onig_callout_tag_is_exist_at_callout_num(regex_t* reg, int callout_num) 1819 { 1820 RegexExt* ext = reg->extp; 1821 1822 if (IS_NULL(ext) || IS_NULL(ext->callout_list)) return 0; 1823 if (callout_num > ext->callout_num) return 0; 1824 1825 return (ext->callout_list[callout_num].flag & 1826 CALLOUT_TAG_LIST_FLAG_TAG_EXIST) != 0; 1827 } 1828 1829 static int 1830 i_free_callout_tag_entry(UChar* key, CalloutTagVal e, void* arg ARG_UNUSED) 1831 { 1832 xfree(key); 1833 return ST_DELETE; 1834 } 1835 1836 static int 1837 callout_tag_table_clear(CalloutTagTable* t) 1838 { 1839 if (IS_NOT_NULL(t)) { 1840 onig_st_foreach(t, i_free_callout_tag_entry, 0); 1841 } 1842 return 0; 1843 } 1844 1845 extern int 1846 onig_callout_tag_table_free(void* table) 1847 { 1848 CalloutTagTable* t = (CalloutTagTable* )table; 1849 1850 if (IS_NOT_NULL(t)) { 1851 int r = callout_tag_table_clear(t); 1852 if (r != 0) return r; 1853 1854 onig_st_free_table(t); 1855 } 1856 1857 return 0; 1858 } 1859 1860 extern int 1861 onig_get_callout_num_by_tag(regex_t* reg, 1862 const UChar* tag, const UChar* tag_end) 1863 { 1864 int r; 1865 RegexExt* 
ext; 1866 CalloutTagVal e; 1867 1868 ext = reg->extp; 1869 if (IS_NULL(ext) || IS_NULL(ext->tag_table)) 1870 return ONIGERR_INVALID_CALLOUT_TAG_NAME; 1871 1872 r = onig_st_lookup_strend(ext->tag_table, tag, tag_end, 1873 (HashDataType* )((void* )(&e))); 1874 if (r == 0) return ONIGERR_INVALID_CALLOUT_TAG_NAME; 1875 return (int )e; 1876 } 1877 1878 static CalloutTagVal 1879 callout_tag_find(CalloutTagTable* t, const UChar* name, const UChar* name_end) 1880 { 1881 CalloutTagVal e; 1882 1883 e = -1; 1884 if (IS_NOT_NULL(t)) { 1885 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e))); 1886 } 1887 return e; 1888 } 1889 1890 static int 1891 callout_tag_table_new(CalloutTagTable** rt) 1892 { 1893 CalloutTagTable* t; 1894 1895 *rt = 0; 1896 t = onig_st_init_strend_table_with_size(INIT_TAG_NAMES_ALLOC_NUM); 1897 CHECK_NULL_RETURN_MEMERR(t); 1898 1899 *rt = t; 1900 return ONIG_NORMAL; 1901 } 1902 1903 static int 1904 callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name, 1905 UChar* name_end, CalloutTagVal entry_val) 1906 { 1907 int r; 1908 CalloutTagVal val; 1909 1910 if (name_end - name <= 0) 1911 return ONIGERR_INVALID_CALLOUT_TAG_NAME; 1912 1913 val = callout_tag_find(t, name, name_end); 1914 if (val >= 0) { 1915 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, 1916 name, name_end); 1917 return ONIGERR_MULTIPLEX_DEFINED_NAME; 1918 } 1919 1920 r = onig_st_insert_strend(t, name, name_end, (HashDataType )entry_val); 1921 if (r < 0) return r; 1922 1923 return ONIG_NORMAL; 1924 } 1925 1926 static int 1927 ext_ensure_tag_table(regex_t* reg) 1928 { 1929 int r; 1930 RegexExt* ext; 1931 CalloutTagTable* t; 1932 1933 ext = onig_get_regex_ext(reg); 1934 CHECK_NULL_RETURN_MEMERR(ext); 1935 1936 if (IS_NULL(ext->tag_table)) { 1937 r = callout_tag_table_new(&t); 1938 if (r != ONIG_NORMAL) return r; 1939 1940 ext->tag_table = t; 1941 } 1942 1943 return ONIG_NORMAL; 1944 } 1945 1946 static int 1947 callout_tag_entry(ScanEnv* env, 
regex_t* reg, UChar* name, UChar* name_end, 1948 CalloutTagVal entry_val) 1949 { 1950 int r; 1951 RegexExt* ext; 1952 CalloutListEntry* e; 1953 1954 r = ext_ensure_tag_table(reg); 1955 if (r != ONIG_NORMAL) return r; 1956 1957 ext = onig_get_regex_ext(reg); 1958 CHECK_NULL_RETURN_MEMERR(ext); 1959 r = callout_tag_entry_raw(env, ext->tag_table, name, name_end, entry_val); 1960 1961 e = onig_reg_callout_list_at(reg, (int )entry_val); 1962 CHECK_NULL_RETURN_MEMERR(e); 1963 e->tag_start = name; 1964 e->tag_end = name_end; 1965 1966 return r; 1967 } 1968 1969 #endif /* USE_CALLOUT */ 1970 1971 1972 #define INIT_SCANENV_MEMENV_ALLOC_SIZE 16 1973 1974 static void 1975 scan_env_clear(ScanEnv* env) 1976 { 1977 MEM_STATUS_CLEAR(env->cap_history); 1978 MEM_STATUS_CLEAR(env->backtrack_mem); 1979 MEM_STATUS_CLEAR(env->backrefed_mem); 1980 env->error = (UChar* )NULL; 1981 env->error_end = (UChar* )NULL; 1982 env->num_call = 0; 1983 1984 #ifdef USE_CALL 1985 env->unset_addr_list = NULL; 1986 env->has_call_zero = 0; 1987 #endif 1988 1989 env->num_mem = 0; 1990 env->num_named = 0; 1991 env->mem_alloc = 0; 1992 env->mem_env_dynamic = (MemEnv* )NULL; 1993 1994 xmemset(env->mem_env_static, 0, sizeof(env->mem_env_static)); 1995 1996 env->parse_depth = 0; 1997 #ifdef ONIG_DEBUG_PARSE 1998 env->max_parse_depth = 0; 1999 #endif 2000 env->backref_num = 0; 2001 env->keep_num = 0; 2002 env->id_num = 0; 2003 env->save_alloc_num = 0; 2004 env->saves = 0; 2005 } 2006 2007 static int 2008 scan_env_add_mem_entry(ScanEnv* env) 2009 { 2010 int i, need, alloc; 2011 MemEnv* p; 2012 2013 need = env->num_mem + 1; 2014 if (need > MaxCaptureNum && MaxCaptureNum != 0) 2015 return ONIGERR_TOO_MANY_CAPTURES; 2016 2017 if (need >= SCANENV_MEMENV_SIZE) { 2018 if (env->mem_alloc <= need) { 2019 if (IS_NULL(env->mem_env_dynamic)) { 2020 alloc = INIT_SCANENV_MEMENV_ALLOC_SIZE; 2021 p = (MemEnv* )xmalloc(sizeof(MemEnv) * alloc); 2022 CHECK_NULL_RETURN_MEMERR(p); 2023 xmemcpy(p, env->mem_env_static, 
sizeof(env->mem_env_static)); 2024 } 2025 else { 2026 alloc = env->mem_alloc * 2; 2027 p = (MemEnv* )xrealloc(env->mem_env_dynamic, sizeof(MemEnv) * alloc); 2028 CHECK_NULL_RETURN_MEMERR(p); 2029 } 2030 2031 for (i = env->num_mem + 1; i < alloc; i++) { 2032 p[i].mem_node = NULL_NODE; 2033 p[i].empty_repeat_node = NULL_NODE; 2034 } 2035 2036 env->mem_env_dynamic = p; 2037 env->mem_alloc = alloc; 2038 } 2039 } 2040 2041 env->num_mem++; 2042 return env->num_mem; 2043 } 2044 2045 static int 2046 scan_env_set_mem_node(ScanEnv* env, int num, Node* node) 2047 { 2048 if (env->num_mem >= num) 2049 SCANENV_MEMENV(env)[num].mem_node = node; 2050 else 2051 return ONIGERR_PARSER_BUG; 2052 return 0; 2053 } 2054 2055 static void 2056 node_free_body(Node* node) 2057 { 2058 if (IS_NULL(node)) return ; 2059 2060 switch (NODE_TYPE(node)) { 2061 case NODE_STRING: 2062 if (STR_(node)->capacity != 0 && 2063 IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != STR_(node)->buf) { 2064 xfree(STR_(node)->s); 2065 } 2066 break; 2067 2068 case NODE_LIST: 2069 case NODE_ALT: 2070 onig_node_free(NODE_CAR(node)); 2071 node = NODE_CDR(node); 2072 while (IS_NOT_NULL(node)) { 2073 Node* next = NODE_CDR(node); 2074 onig_node_free(NODE_CAR(node)); 2075 xfree(node); 2076 node = next; 2077 } 2078 break; 2079 2080 case NODE_CCLASS: 2081 { 2082 CClassNode* cc = CCLASS_(node); 2083 2084 if (cc->mbuf) 2085 bbuf_free(cc->mbuf); 2086 } 2087 break; 2088 2089 case NODE_BACKREF: 2090 if (IS_NOT_NULL(BACKREF_(node)->back_dynamic)) 2091 xfree(BACKREF_(node)->back_dynamic); 2092 break; 2093 2094 case NODE_BAG: 2095 if (NODE_BODY(node)) 2096 onig_node_free(NODE_BODY(node)); 2097 2098 { 2099 BagNode* en = BAG_(node); 2100 if (en->type == BAG_IF_ELSE) { 2101 onig_node_free(en->te.Then); 2102 onig_node_free(en->te.Else); 2103 } 2104 } 2105 break; 2106 2107 case NODE_QUANT: 2108 if (NODE_BODY(node)) 2109 onig_node_free(NODE_BODY(node)); 2110 break; 2111 2112 case NODE_ANCHOR: 2113 if (NODE_BODY(node)) 2114 
onig_node_free(NODE_BODY(node)); 2115 if (IS_NOT_NULL(ANCHOR_(node)->lead_node)) 2116 onig_node_free(ANCHOR_(node)->lead_node); 2117 break; 2118 2119 case NODE_CTYPE: 2120 case NODE_CALL: 2121 case NODE_GIMMICK: 2122 break; 2123 } 2124 } 2125 2126 extern void 2127 onig_node_free(Node* node) 2128 { 2129 if (IS_NULL(node)) return ; 2130 2131 #ifdef DEBUG_NODE_FREE 2132 fprintf(stderr, "onig_node_free: %p\n", node); 2133 #endif 2134 2135 node_free_body(node); 2136 xfree(node); 2137 } 2138 2139 static void 2140 cons_node_free_alone(Node* node) 2141 { 2142 NODE_CAR(node) = 0; 2143 NODE_CDR(node) = 0; 2144 onig_node_free(node); 2145 } 2146 2147 static Node* 2148 node_new(void) 2149 { 2150 Node* node; 2151 2152 node = (Node* )xmalloc(sizeof(Node)); 2153 CHECK_NULL_RETURN(node); 2154 xmemset(node, 0, sizeof(*node)); 2155 2156 #ifdef DEBUG_NODE_FREE 2157 fprintf(stderr, "node_new: %p\n", node); 2158 #endif 2159 return node; 2160 } 2161 2162 extern int 2163 onig_node_copy(Node** rcopy, Node* from) 2164 { 2165 int r; 2166 Node* copy; 2167 2168 *rcopy = NULL_NODE; 2169 2170 switch (NODE_TYPE(from)) { 2171 case NODE_LIST: 2172 case NODE_ALT: 2173 case NODE_ANCHOR: 2174 /* These node's link to other nodes are processed by caller. */ 2175 break; 2176 case NODE_STRING: 2177 case NODE_CCLASS: 2178 case NODE_CTYPE: 2179 /* Fixed contents after copy. */ 2180 break; 2181 default: 2182 /* Not supported yet. 
*/ 2183 return ONIGERR_TYPE_BUG; 2184 break; 2185 } 2186 2187 copy = node_new(); 2188 CHECK_NULL_RETURN_MEMERR(copy); 2189 xmemcpy(copy, from, sizeof(*copy)); 2190 2191 switch (NODE_TYPE(copy)) { 2192 case NODE_STRING: 2193 r = onig_node_str_set(copy, STR_(from)->s, STR_(from)->end, FALSE); 2194 if (r != 0) { 2195 err: 2196 onig_node_free(copy); 2197 return r; 2198 } 2199 break; 2200 2201 case NODE_CCLASS: 2202 { 2203 CClassNode *fcc, *tcc; 2204 2205 fcc = CCLASS_(from); 2206 tcc = CCLASS_(copy); 2207 if (IS_NOT_NULL(fcc->mbuf)) { 2208 r = bbuf_clone(&(tcc->mbuf), fcc->mbuf); 2209 if (r != 0) goto err; 2210 } 2211 } 2212 break; 2213 2214 default: 2215 break; 2216 } 2217 2218 *rcopy = copy; 2219 return ONIG_NORMAL; 2220 } 2221 2222 2223 static void 2224 initialize_cclass(CClassNode* cc) 2225 { 2226 BITSET_CLEAR(cc->bs); 2227 cc->flags = 0; 2228 cc->mbuf = NULL; 2229 } 2230 2231 static Node* 2232 node_new_cclass(void) 2233 { 2234 Node* node = node_new(); 2235 CHECK_NULL_RETURN(node); 2236 2237 NODE_SET_TYPE(node, NODE_CCLASS); 2238 initialize_cclass(CCLASS_(node)); 2239 return node; 2240 } 2241 2242 static Node* 2243 node_new_ctype(int type, int not, OnigOptionType options) 2244 { 2245 Node* node = node_new(); 2246 CHECK_NULL_RETURN(node); 2247 2248 NODE_SET_TYPE(node, NODE_CTYPE); 2249 CTYPE_(node)->ctype = type; 2250 CTYPE_(node)->not = not; 2251 CTYPE_(node)->ascii_mode = OPTON_IS_ASCII_MODE_CTYPE(type, options); 2252 return node; 2253 } 2254 2255 static Node* 2256 node_new_anychar(OnigOptionType options) 2257 { 2258 Node* node; 2259 2260 node = node_new_ctype(CTYPE_ANYCHAR, FALSE, options); 2261 CHECK_NULL_RETURN(node); 2262 2263 if (OPTON_MULTILINE(options)) 2264 NODE_STATUS_ADD(node, MULTILINE); 2265 return node; 2266 } 2267 2268 static int 2269 node_new_no_newline(Node** node, ScanEnv* env) 2270 { 2271 Node* n; 2272 2273 n = node_new_anychar(ONIG_OPTION_NONE); 2274 CHECK_NULL_RETURN_MEMERR(n); 2275 *node = n; 2276 return 0; 2277 } 2278 2279 static int 2280 
node_new_true_anychar(Node** node) 2281 { 2282 Node* n; 2283 2284 n = node_new_anychar(ONIG_OPTION_MULTILINE); 2285 CHECK_NULL_RETURN_MEMERR(n); 2286 *node = n; 2287 return 0; 2288 } 2289 2290 static Node* 2291 node_new_list(Node* left, Node* right) 2292 { 2293 Node* node = node_new(); 2294 CHECK_NULL_RETURN(node); 2295 2296 NODE_SET_TYPE(node, NODE_LIST); 2297 NODE_CAR(node) = left; 2298 NODE_CDR(node) = right; 2299 return node; 2300 } 2301 2302 extern Node* 2303 onig_node_new_list(Node* left, Node* right) 2304 { 2305 return node_new_list(left, right); 2306 } 2307 2308 extern Node* 2309 onig_node_new_alt(Node* left, Node* right) 2310 { 2311 Node* node = node_new(); 2312 CHECK_NULL_RETURN(node); 2313 2314 NODE_SET_TYPE(node, NODE_ALT); 2315 NODE_CAR(node) = left; 2316 NODE_CDR(node) = right; 2317 return node; 2318 } 2319 2320 static Node* 2321 make_list_or_alt(NodeType type, int n, Node* ns[]) 2322 { 2323 Node* r; 2324 2325 if (n <= 0) return NULL_NODE; 2326 2327 if (n == 1) { 2328 r = node_new(); 2329 CHECK_NULL_RETURN(r); 2330 NODE_SET_TYPE(r, type); 2331 NODE_CAR(r) = ns[0]; 2332 NODE_CDR(r) = NULL_NODE; 2333 } 2334 else { 2335 Node* right; 2336 2337 r = node_new(); 2338 CHECK_NULL_RETURN(r); 2339 2340 right = make_list_or_alt(type, n - 1, ns + 1); 2341 if (IS_NULL(right)) { 2342 onig_node_free(r); 2343 return NULL_NODE; 2344 } 2345 2346 NODE_SET_TYPE(r, type); 2347 NODE_CAR(r) = ns[0]; 2348 NODE_CDR(r) = right; 2349 } 2350 2351 return r; 2352 } 2353 2354 static Node* 2355 make_list(int n, Node* ns[]) 2356 { 2357 return make_list_or_alt(NODE_LIST, n, ns); 2358 } 2359 2360 static Node* 2361 make_alt(int n, Node* ns[]) 2362 { 2363 return make_list_or_alt(NODE_ALT, n, ns); 2364 } 2365 2366 static Node* 2367 node_new_anchor(int type) 2368 { 2369 Node* node; 2370 2371 node = node_new(); 2372 CHECK_NULL_RETURN(node); 2373 2374 NODE_SET_TYPE(node, NODE_ANCHOR); 2375 ANCHOR_(node)->type = type; 2376 ANCHOR_(node)->char_min_len = 0; 2377 ANCHOR_(node)->char_max_len = 
INFINITE_LEN; 2378 ANCHOR_(node)->ascii_mode = 0; 2379 ANCHOR_(node)->lead_node = NULL_NODE; 2380 return node; 2381 } 2382 2383 static Node* 2384 node_new_anchor_with_options(int type, OnigOptionType options) 2385 { 2386 int ascii_mode; 2387 Node* node; 2388 2389 node = node_new_anchor(type); 2390 CHECK_NULL_RETURN(node); 2391 2392 ascii_mode = OPTON_WORD_ASCII(options) && IS_WORD_ANCHOR_TYPE(type) ? 1 : 0; 2393 ANCHOR_(node)->ascii_mode = ascii_mode; 2394 2395 if (type == ANCR_TEXT_SEGMENT_BOUNDARY || 2396 type == ANCR_NO_TEXT_SEGMENT_BOUNDARY) { 2397 if (OPTON_TEXT_SEGMENT_WORD(options)) 2398 NODE_STATUS_ADD(node, TEXT_SEGMENT_WORD); 2399 } 2400 2401 return node; 2402 } 2403 2404 static Node* 2405 node_new_backref(int back_num, int* backrefs, int by_name, 2406 #ifdef USE_BACKREF_WITH_LEVEL 2407 int exist_level, int nest_level, 2408 #endif 2409 ScanEnv* env) 2410 { 2411 int i; 2412 Node* node; 2413 2414 node = node_new(); 2415 CHECK_NULL_RETURN(node); 2416 2417 NODE_SET_TYPE(node, NODE_BACKREF); 2418 BACKREF_(node)->back_num = back_num; 2419 BACKREF_(node)->back_dynamic = (int* )NULL; 2420 if (by_name != 0) 2421 NODE_STATUS_ADD(node, BY_NAME); 2422 2423 if (OPTON_IGNORECASE(env->options)) 2424 NODE_STATUS_ADD(node, IGNORECASE); 2425 2426 #ifdef USE_BACKREF_WITH_LEVEL 2427 if (exist_level != 0) { 2428 NODE_STATUS_ADD(node, NEST_LEVEL); 2429 BACKREF_(node)->nest_level = nest_level; 2430 } 2431 #endif 2432 2433 for (i = 0; i < back_num; i++) { 2434 if (backrefs[i] <= env->num_mem && 2435 IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) { 2436 NODE_STATUS_ADD(node, RECURSION); /* /...(\1).../ */ 2437 break; 2438 } 2439 } 2440 2441 if (back_num <= NODE_BACKREFS_SIZE) { 2442 for (i = 0; i < back_num; i++) 2443 BACKREF_(node)->back_static[i] = backrefs[i]; 2444 } 2445 else { 2446 int* p = (int* )xmalloc(sizeof(int) * back_num); 2447 if (IS_NULL(p)) { 2448 onig_node_free(node); 2449 return NULL; 2450 } 2451 BACKREF_(node)->back_dynamic = p; 2452 for (i = 0; i < 
back_num; i++) 2453 p[i] = backrefs[i]; 2454 } 2455 2456 env->backref_num++; 2457 return node; 2458 } 2459 2460 static Node* 2461 node_new_backref_checker(int back_num, int* backrefs, int by_name, 2462 #ifdef USE_BACKREF_WITH_LEVEL 2463 int exist_level, int nest_level, 2464 #endif 2465 ScanEnv* env) 2466 { 2467 Node* node; 2468 2469 node = node_new_backref(back_num, backrefs, by_name, 2470 #ifdef USE_BACKREF_WITH_LEVEL 2471 exist_level, nest_level, 2472 #endif 2473 env); 2474 CHECK_NULL_RETURN(node); 2475 2476 NODE_STATUS_ADD(node, CHECKER); 2477 return node; 2478 } 2479 2480 #ifdef USE_CALL 2481 static Node* 2482 node_new_call(UChar* name, UChar* name_end, int gnum, int by_number) 2483 { 2484 Node* node = node_new(); 2485 CHECK_NULL_RETURN(node); 2486 2487 NODE_SET_TYPE(node, NODE_CALL); 2488 CALL_(node)->by_number = by_number; 2489 CALL_(node)->name = name; 2490 CALL_(node)->name_end = name_end; 2491 CALL_(node)->group_num = gnum; 2492 CALL_(node)->entry_count = 1; 2493 return node; 2494 } 2495 #endif 2496 2497 static Node* 2498 node_new_quantifier(int lower, int upper, int by_number) 2499 { 2500 Node* node = node_new(); 2501 CHECK_NULL_RETURN(node); 2502 2503 NODE_SET_TYPE(node, NODE_QUANT); 2504 QUANT_(node)->lower = lower; 2505 QUANT_(node)->upper = upper; 2506 QUANT_(node)->greedy = 1; 2507 QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY; 2508 QUANT_(node)->head_exact = NULL_NODE; 2509 QUANT_(node)->next_head_exact = NULL_NODE; 2510 QUANT_(node)->include_referred = 0; 2511 if (by_number != 0) 2512 NODE_STATUS_ADD(node, BY_NUMBER); 2513 2514 return node; 2515 } 2516 2517 static Node* 2518 node_new_bag(enum BagType type) 2519 { 2520 Node* node = node_new(); 2521 CHECK_NULL_RETURN(node); 2522 2523 NODE_SET_TYPE(node, NODE_BAG); 2524 BAG_(node)->type = type; 2525 2526 switch (type) { 2527 case BAG_MEMORY: 2528 BAG_(node)->m.regnum = 0; 2529 BAG_(node)->m.called_addr = -1; 2530 BAG_(node)->m.entry_count = 1; 2531 BAG_(node)->m.called_state = 0; 2532 break; 2533 2534 
case BAG_OPTION: 2535 BAG_(node)->o.options = 0; 2536 break; 2537 2538 case BAG_STOP_BACKTRACK: 2539 break; 2540 2541 case BAG_IF_ELSE: 2542 BAG_(node)->te.Then = 0; 2543 BAG_(node)->te.Else = 0; 2544 break; 2545 } 2546 2547 BAG_(node)->opt_count = 0; 2548 return node; 2549 } 2550 2551 extern Node* 2552 onig_node_new_bag(enum BagType type) 2553 { 2554 return node_new_bag(type); 2555 } 2556 2557 static Node* 2558 node_new_bag_if_else(Node* cond, Node* Then, Node* Else) 2559 { 2560 Node* n; 2561 n = node_new_bag(BAG_IF_ELSE); 2562 CHECK_NULL_RETURN(n); 2563 2564 NODE_BODY(n) = cond; 2565 BAG_(n)->te.Then = Then; 2566 BAG_(n)->te.Else = Else; 2567 return n; 2568 } 2569 2570 static Node* 2571 node_new_memory(int is_named) 2572 { 2573 Node* node = node_new_bag(BAG_MEMORY); 2574 CHECK_NULL_RETURN(node); 2575 if (is_named != 0) 2576 NODE_STATUS_ADD(node, NAMED_GROUP); 2577 2578 return node; 2579 } 2580 2581 static Node* 2582 node_new_option(OnigOptionType option) 2583 { 2584 Node* node = node_new_bag(BAG_OPTION); 2585 CHECK_NULL_RETURN(node); 2586 BAG_(node)->o.options = option; 2587 return node; 2588 } 2589 2590 static Node* 2591 node_new_group(Node* content) 2592 { 2593 Node* node; 2594 2595 node = node_new(); 2596 CHECK_NULL_RETURN(node); 2597 NODE_SET_TYPE(node, NODE_LIST); 2598 NODE_CAR(node) = content; 2599 NODE_CDR(node) = NULL_NODE; 2600 2601 return node; 2602 } 2603 2604 static Node* 2605 node_drop_group(Node* group) 2606 { 2607 Node* content; 2608 2609 content = NODE_CAR(group); 2610 NODE_CAR(group) = NULL_NODE; 2611 onig_node_free(group); 2612 return content; 2613 } 2614 2615 static int 2616 node_set_fail(Node* node) 2617 { 2618 NODE_SET_TYPE(node, NODE_GIMMICK); 2619 GIMMICK_(node)->type = GIMMICK_FAIL; 2620 return ONIG_NORMAL; 2621 } 2622 2623 static int 2624 node_new_fail(Node** node, ScanEnv* env) 2625 { 2626 *node = node_new(); 2627 CHECK_NULL_RETURN_MEMERR(*node); 2628 2629 return node_set_fail(*node); 2630 } 2631 2632 extern int 2633 
onig_node_reset_fail(Node* node) 2634 { 2635 node_free_body(node); 2636 return node_set_fail(node); 2637 } 2638 2639 static int 2640 node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env) 2641 { 2642 int id; 2643 2644 ID_ENTRY(env, id); 2645 2646 *node = node_new(); 2647 CHECK_NULL_RETURN_MEMERR(*node); 2648 2649 NODE_SET_TYPE(*node, NODE_GIMMICK); 2650 GIMMICK_(*node)->id = id; 2651 GIMMICK_(*node)->type = GIMMICK_SAVE; 2652 GIMMICK_(*node)->detail_type = (int )save_type; 2653 2654 return ONIG_NORMAL; 2655 } 2656 2657 static int 2658 node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type, 2659 int id, ScanEnv* env) 2660 { 2661 *node = node_new(); 2662 CHECK_NULL_RETURN_MEMERR(*node); 2663 2664 NODE_SET_TYPE(*node, NODE_GIMMICK); 2665 GIMMICK_(*node)->id = id; 2666 GIMMICK_(*node)->type = GIMMICK_UPDATE_VAR; 2667 GIMMICK_(*node)->detail_type = (int )update_var_type; 2668 2669 return ONIG_NORMAL; 2670 } 2671 2672 static int 2673 node_new_keep(Node** node, ScanEnv* env) 2674 { 2675 int r; 2676 2677 r = node_new_save_gimmick(node, SAVE_KEEP, env); 2678 if (r != 0) return r; 2679 2680 env->keep_num++; 2681 return ONIG_NORMAL; 2682 } 2683 2684 #ifdef USE_CALLOUT 2685 2686 extern void 2687 onig_free_reg_callout_list(int n, CalloutListEntry* list) 2688 { 2689 int i; 2690 int j; 2691 2692 if (IS_NULL(list)) return ; 2693 2694 for (i = 0; i < n; i++) { 2695 if (list[i].of == ONIG_CALLOUT_OF_NAME) { 2696 for (j = 0; j < list[i].u.arg.passed_num; j++) { 2697 if (list[i].u.arg.types[j] == ONIG_TYPE_STRING) { 2698 if (IS_NOT_NULL(list[i].u.arg.vals[j].s.start)) 2699 xfree(list[i].u.arg.vals[j].s.start); 2700 } 2701 } 2702 } 2703 else { /* ONIG_CALLOUT_OF_CONTENTS */ 2704 if (IS_NOT_NULL(list[i].u.content.start)) { 2705 xfree((void* )list[i].u.content.start); 2706 } 2707 } 2708 } 2709 2710 xfree(list); 2711 } 2712 2713 extern CalloutListEntry* 2714 onig_reg_callout_list_at(regex_t* reg, int num) 2715 { 2716 RegexExt* ext = reg->extp; 2717 
CHECK_NULL_RETURN(ext); 2718 2719 if (num <= 0 || num > ext->callout_num) 2720 return 0; 2721 2722 num--; 2723 return ext->callout_list + num; 2724 } 2725 2726 static int 2727 reg_callout_list_entry(ScanEnv* env, int* rnum) 2728 { 2729 #define INIT_CALLOUT_LIST_NUM 3 2730 2731 int num; 2732 CalloutListEntry* list; 2733 CalloutListEntry* e; 2734 RegexExt* ext; 2735 2736 ext = onig_get_regex_ext(env->reg); 2737 CHECK_NULL_RETURN_MEMERR(ext); 2738 2739 if (IS_NULL(ext->callout_list)) { 2740 list = (CalloutListEntry* )xmalloc(sizeof(*list) * INIT_CALLOUT_LIST_NUM); 2741 CHECK_NULL_RETURN_MEMERR(list); 2742 2743 ext->callout_list = list; 2744 ext->callout_list_alloc = INIT_CALLOUT_LIST_NUM; 2745 ext->callout_num = 0; 2746 } 2747 2748 num = ext->callout_num + 1; 2749 if (num > ext->callout_list_alloc) { 2750 int alloc = ext->callout_list_alloc * 2; 2751 list = (CalloutListEntry* )xrealloc(ext->callout_list, 2752 sizeof(CalloutListEntry) * alloc); 2753 CHECK_NULL_RETURN_MEMERR(list); 2754 2755 ext->callout_list = list; 2756 ext->callout_list_alloc = alloc; 2757 } 2758 2759 e = ext->callout_list + (num - 1); 2760 2761 e->flag = 0; 2762 e->of = 0; 2763 e->in = ONIG_CALLOUT_OF_CONTENTS; 2764 e->type = 0; 2765 e->tag_start = 0; 2766 e->tag_end = 0; 2767 e->start_func = 0; 2768 e->end_func = 0; 2769 e->u.arg.num = 0; 2770 e->u.arg.passed_num = 0; 2771 2772 ext->callout_num = num; 2773 *rnum = num; 2774 return ONIG_NORMAL; 2775 } 2776 2777 static int 2778 node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id, 2779 ScanEnv* env) 2780 { 2781 *node = node_new(); 2782 CHECK_NULL_RETURN_MEMERR(*node); 2783 2784 NODE_SET_TYPE(*node, NODE_GIMMICK); 2785 GIMMICK_(*node)->id = id; 2786 GIMMICK_(*node)->num = num; 2787 GIMMICK_(*node)->type = GIMMICK_CALLOUT; 2788 GIMMICK_(*node)->detail_type = (int )callout_of; 2789 2790 return ONIG_NORMAL; 2791 } 2792 #endif 2793 2794 static int 2795 make_text_segment(Node** node, ScanEnv* env) 2796 { 2797 int r; 2798 int i; 2799 
Node* x; 2800 Node* ns[2]; 2801 2802 /* \X == (?>\O(?:\Y\O)*) */ 2803 2804 ns[1] = NULL_NODE; 2805 2806 r = ONIGERR_MEMORY; 2807 ns[0] = node_new_anchor_with_options(ANCR_NO_TEXT_SEGMENT_BOUNDARY, env->options); 2808 if (IS_NULL(ns[0])) goto err; 2809 2810 r = node_new_true_anychar(&ns[1]); 2811 if (r != 0) goto err1; 2812 2813 x = make_list(2, ns); 2814 if (IS_NULL(x)) goto err; 2815 ns[0] = x; 2816 ns[1] = NULL_NODE; 2817 2818 x = node_new_quantifier(0, INFINITE_REPEAT, TRUE); 2819 if (IS_NULL(x)) goto err; 2820 2821 NODE_BODY(x) = ns[0]; 2822 ns[0] = NULL_NODE; 2823 ns[1] = x; 2824 2825 r = node_new_true_anychar(&ns[0]); 2826 if (r != 0) goto err1; 2827 2828 x = make_list(2, ns); 2829 if (IS_NULL(x)) goto err; 2830 2831 ns[0] = x; 2832 ns[1] = NULL_NODE; 2833 2834 x = node_new_bag(BAG_STOP_BACKTRACK); 2835 if (IS_NULL(x)) goto err; 2836 2837 NODE_BODY(x) = ns[0]; 2838 2839 *node = x; 2840 return ONIG_NORMAL; 2841 2842 err: 2843 r = ONIGERR_MEMORY; 2844 err1: 2845 for (i = 0; i < 2; i++) onig_node_free(ns[i]); 2846 return r; 2847 } 2848 2849 static int 2850 make_absent_engine(Node** node, int pre_save_right_id, Node* absent, 2851 Node* step_one, int lower, int upper, int possessive, 2852 int is_range_cutter, ScanEnv* env) 2853 { 2854 int r; 2855 int i; 2856 int id; 2857 Node* x; 2858 Node* ns[4]; 2859 2860 for (i = 0; i < 4; i++) ns[i] = NULL_NODE; 2861 2862 ns[1] = absent; 2863 ns[3] = step_one; /* for err */ 2864 r = node_new_save_gimmick(&ns[0], SAVE_S, env); 2865 if (r != 0) goto err; 2866 2867 id = GIMMICK_(ns[0])->id; 2868 r = node_new_update_var_gimmick(&ns[2], UPDATE_VAR_RIGHT_RANGE_FROM_S_STACK, 2869 id, env); 2870 if (r != 0) goto err; 2871 2872 if (is_range_cutter != 0) 2873 NODE_STATUS_ADD(ns[2], ABSENT_WITH_SIDE_EFFECTS); 2874 2875 r = node_new_fail(&ns[3], env); 2876 if (r != 0) goto err; 2877 2878 x = make_list(4, ns); 2879 if (IS_NULL(x)) goto err0; 2880 2881 ns[0] = x; 2882 ns[1] = step_one; 2883 ns[2] = ns[3] = NULL_NODE; 2884 2885 x = 
make_alt(2, ns); 2886 if (IS_NULL(x)) goto err0; 2887 2888 ns[0] = x; 2889 2890 x = node_new_quantifier(lower, upper, FALSE); 2891 if (IS_NULL(x)) goto err0; 2892 2893 NODE_BODY(x) = ns[0]; 2894 ns[0] = x; 2895 2896 if (possessive != 0) { 2897 x = node_new_bag(BAG_STOP_BACKTRACK); 2898 if (IS_NULL(x)) goto err0; 2899 2900 NODE_BODY(x) = ns[0]; 2901 ns[0] = x; 2902 } 2903 2904 r = node_new_update_var_gimmick(&ns[1], UPDATE_VAR_RIGHT_RANGE_FROM_STACK, 2905 pre_save_right_id, env); 2906 if (r != 0) goto err; 2907 2908 r = node_new_fail(&ns[2], env); 2909 if (r != 0) goto err; 2910 2911 x = make_list(2, ns + 1); 2912 if (IS_NULL(x)) goto err0; 2913 2914 ns[1] = x; ns[2] = NULL_NODE; 2915 2916 x = make_alt(2, ns); 2917 if (IS_NULL(x)) goto err0; 2918 2919 if (is_range_cutter != FALSE) 2920 NODE_STATUS_ADD(x, SUPER); 2921 2922 *node = x; 2923 return ONIG_NORMAL; 2924 2925 err0: 2926 r = ONIGERR_MEMORY; 2927 err: 2928 for (i = 0; i < 4; i++) onig_node_free(ns[i]); 2929 return r; 2930 } 2931 2932 static int 2933 make_absent_tail(Node** node1, Node** node2, int pre_save_right_id, 2934 ScanEnv* env) 2935 { 2936 int r; 2937 int id; 2938 Node* save; 2939 Node* x; 2940 Node* ns[2]; 2941 2942 *node1 = *node2 = NULL_NODE; 2943 save = ns[0] = ns[1] = NULL_NODE; 2944 2945 r = node_new_save_gimmick(&save, SAVE_RIGHT_RANGE, env); 2946 if (r != 0) goto err; 2947 2948 id = GIMMICK_(save)->id; 2949 r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK, 2950 id, env); 2951 if (r != 0) goto err; 2952 2953 r = node_new_fail(&ns[1], env); 2954 if (r != 0) goto err; 2955 2956 x = make_list(2, ns); 2957 if (IS_NULL(x)) goto err0; 2958 2959 ns[0] = NULL_NODE; ns[1] = x; 2960 2961 r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK, 2962 pre_save_right_id, env); 2963 if (r != 0) goto err; 2964 2965 x = make_alt(2, ns); 2966 if (IS_NULL(x)) goto err0; 2967 2968 *node1 = save; 2969 *node2 = x; 2970 return ONIG_NORMAL; 2971 2972 err0: 2973 r = 
ONIGERR_MEMORY; 2974 err: 2975 onig_node_free(save); 2976 onig_node_free(ns[0]); 2977 onig_node_free(ns[1]); 2978 return r; 2979 } 2980 2981 static int 2982 make_range_clear(Node** node, ScanEnv* env) 2983 { 2984 int r; 2985 int id; 2986 Node* save; 2987 Node* x; 2988 Node* ns[2]; 2989 2990 *node = NULL_NODE; 2991 save = ns[0] = ns[1] = NULL_NODE; 2992 2993 r = node_new_save_gimmick(&save, SAVE_RIGHT_RANGE, env); 2994 if (r != 0) goto err; 2995 2996 id = GIMMICK_(save)->id; 2997 r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK, 2998 id, env); 2999 if (r != 0) goto err; 3000 3001 r = node_new_fail(&ns[1], env); 3002 if (r != 0) goto err; 3003 3004 x = make_list(2, ns); 3005 if (IS_NULL(x)) goto err0; 3006 3007 ns[0] = NULL_NODE; ns[1] = x; 3008 3009 #define ID_NOT_USED_DONT_CARE_ME 0 3010 3011 r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT, 3012 ID_NOT_USED_DONT_CARE_ME, env); 3013 if (r != 0) goto err; 3014 NODE_STATUS_ADD(ns[0], ABSENT_WITH_SIDE_EFFECTS); 3015 3016 x = make_alt(2, ns); 3017 if (IS_NULL(x)) goto err0; 3018 3019 NODE_STATUS_ADD(x, SUPER); 3020 3021 ns[0] = save; 3022 ns[1] = x; 3023 save = NULL_NODE; 3024 x = make_list(2, ns); 3025 if (IS_NULL(x)) goto err0; 3026 3027 *node = x; 3028 return ONIG_NORMAL; 3029 3030 err0: 3031 r = ONIGERR_MEMORY; 3032 err: 3033 onig_node_free(save); 3034 onig_node_free(ns[0]); 3035 onig_node_free(ns[1]); 3036 return r; 3037 } 3038 3039 static int 3040 is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody, 3041 int* is_possessive, ScanEnv* env) 3042 { 3043 Node* quant; 3044 Node* body; 3045 3046 *rquant = *rbody = 0; 3047 *is_possessive = 0; 3048 3049 if (NODE_TYPE(node) == NODE_QUANT) { 3050 quant = node; 3051 } 3052 else { 3053 if (NODE_TYPE(node) == NODE_BAG) { 3054 BagNode* en = BAG_(node); 3055 if (en->type == BAG_STOP_BACKTRACK) { 3056 *is_possessive = 1; 3057 quant = NODE_BAG_BODY(en); 3058 if (NODE_TYPE(quant) != NODE_QUANT) 3059 return 0; 3060 } 3061 
else 3062 return 0; 3063 } 3064 else 3065 return 0; 3066 } 3067 3068 if (QUANT_(quant)->greedy == 0) 3069 return 0; 3070 3071 body = NODE_BODY(quant); 3072 switch (NODE_TYPE(body)) { 3073 case NODE_STRING: 3074 { 3075 int len; 3076 StrNode* sn = STR_(body); 3077 UChar *s = sn->s; 3078 3079 len = 0; 3080 while (s < sn->end) { 3081 s += enclen(env->enc, s); 3082 len++; 3083 } 3084 if (len != 1) 3085 return 0; 3086 } 3087 3088 case NODE_CCLASS: 3089 break; 3090 3091 default: 3092 return 0; 3093 break; 3094 } 3095 3096 if (node != quant) { 3097 NODE_BODY(node) = 0; 3098 onig_node_free(node); 3099 } 3100 NODE_BODY(quant) = NULL_NODE; 3101 *rquant = quant; 3102 *rbody = body; 3103 return 1; 3104 } 3105 3106 static int 3107 make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* quant, 3108 Node* body, int possessive, ScanEnv* env) 3109 { 3110 int r; 3111 int i; 3112 int id1; 3113 int lower, upper; 3114 Node* x; 3115 Node* ns[4]; 3116 3117 *node = NULL_NODE; 3118 r = ONIGERR_MEMORY; 3119 ns[0] = ns[1] = NULL_NODE; 3120 ns[2] = body, ns[3] = absent; 3121 3122 lower = QUANT_(quant)->lower; 3123 upper = QUANT_(quant)->upper; 3124 onig_node_free(quant); 3125 3126 r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env); 3127 if (r != 0) goto err; 3128 3129 id1 = GIMMICK_(ns[0])->id; 3130 3131 r = make_absent_engine(&ns[1], id1, absent, body, lower, upper, possessive, 3132 FALSE, env); 3133 if (r != 0) goto err; 3134 3135 ns[2] = ns[3] = NULL_NODE; 3136 3137 r = node_new_update_var_gimmick(&ns[2], UPDATE_VAR_RIGHT_RANGE_FROM_STACK, 3138 id1, env); 3139 if (r != 0) goto err; 3140 3141 x = make_list(3, ns); 3142 if (IS_NULL(x)) goto err0; 3143 3144 *node = x; 3145 return ONIG_NORMAL; 3146 3147 err0: 3148 r = ONIGERR_MEMORY; 3149 err: 3150 for (i = 0; i < 4; i++) onig_node_free(ns[i]); 3151 return r; 3152 } 3153 3154 static int 3155 make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, 3156 ScanEnv* env) 3157 { 3158 int r; 3159 int i; 
3160 int id1, id2; 3161 int possessive; 3162 Node* x; 3163 Node* ns[7]; 3164 3165 r = ONIGERR_MEMORY; 3166 for (i = 0; i < 7; i++) ns[i] = NULL_NODE; 3167 ns[4] = expr; ns[5] = absent; 3168 3169 if (is_range_cutter == 0) { 3170 Node* quant; 3171 Node* body; 3172 3173 if (expr == NULL_NODE) { 3174 /* default expr \O* */ 3175 quant = node_new_quantifier(0, INFINITE_REPEAT, FALSE); 3176 if (IS_NULL(quant)) goto err0; 3177 3178 r = node_new_true_anychar(&body); 3179 if (r != 0) { 3180 onig_node_free(quant); 3181 goto err; 3182 } 3183 possessive = 0; 3184 goto simple; 3185 } 3186 else { 3187 if (is_simple_one_char_repeat(expr, &quant, &body, &possessive, env)) { 3188 simple: 3189 r = make_absent_tree_for_simple_one_char_repeat(node, absent, quant, 3190 body, possessive, env); 3191 if (r != 0) { 3192 ns[4] = NULL_NODE; 3193 onig_node_free(quant); 3194 onig_node_free(body); 3195 goto err; 3196 } 3197 3198 return ONIG_NORMAL; 3199 } 3200 } 3201 } 3202 3203 r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env); 3204 if (r != 0) goto err; 3205 3206 id1 = GIMMICK_(ns[0])->id; 3207 3208 r = node_new_save_gimmick(&ns[1], SAVE_S, env); 3209 if (r != 0) goto err; 3210 3211 id2 = GIMMICK_(ns[1])->id; 3212 3213 r = node_new_true_anychar(&ns[3]); 3214 if (r != 0) goto err; 3215 3216 possessive = 1; 3217 r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, INFINITE_REPEAT, 3218 possessive, is_range_cutter, env); 3219 if (r != 0) goto err; 3220 3221 ns[3] = NULL_NODE; 3222 ns[5] = NULL_NODE; 3223 3224 r = node_new_update_var_gimmick(&ns[3], UPDATE_VAR_S_FROM_STACK, id2, env); 3225 if (r != 0) goto err; 3226 3227 if (is_range_cutter != 0) { 3228 x = make_list(4, ns); 3229 if (IS_NULL(x)) goto err0; 3230 } 3231 else { 3232 r = make_absent_tail(&ns[5], &ns[6], id1, env); 3233 if (r != 0) goto err; 3234 3235 x = make_list(7, ns); 3236 if (IS_NULL(x)) goto err0; 3237 } 3238 3239 *node = x; 3240 return ONIG_NORMAL; 3241 3242 err0: 3243 r = ONIGERR_MEMORY; 3244 err: 3245 for (i = 0; i < 
7; i++) onig_node_free(ns[i]); 3246 return r; 3247 } 3248 3249 extern int 3250 onig_node_str_cat(Node* node, const UChar* s, const UChar* end) 3251 { 3252 int addlen = (int )(end - s); 3253 3254 if (addlen > 0) { 3255 int len = (int )(STR_(node)->end - STR_(node)->s); 3256 3257 if (STR_(node)->capacity > 0 || (len + addlen > NODE_STRING_BUF_SIZE - 1)) { 3258 UChar* p; 3259 int capa = len + addlen + NODE_STRING_MARGIN; 3260 3261 if (capa <= STR_(node)->capacity) { 3262 onig_strcpy(STR_(node)->s + len, s, end); 3263 } 3264 else { 3265 if (STR_(node)->s == STR_(node)->buf) 3266 p = strcat_capa_from_static(STR_(node)->s, STR_(node)->end, 3267 s, end, capa); 3268 else 3269 p = strcat_capa(STR_(node)->s, STR_(node)->end, s, end, capa); 3270 3271 CHECK_NULL_RETURN_MEMERR(p); 3272 STR_(node)->s = p; 3273 STR_(node)->capacity = capa; 3274 } 3275 } 3276 else { 3277 onig_strcpy(STR_(node)->s + len, s, end); 3278 } 3279 STR_(node)->end = STR_(node)->s + len + addlen; 3280 } 3281 3282 return 0; 3283 } 3284 3285 extern int 3286 onig_node_str_set(Node* node, const UChar* s, const UChar* end, int need_free) 3287 { 3288 onig_node_str_clear(node, need_free); 3289 return onig_node_str_cat(node, s, end); 3290 } 3291 3292 static int 3293 node_str_cat_char(Node* node, UChar c) 3294 { 3295 UChar s[1]; 3296 3297 s[0] = c; 3298 return onig_node_str_cat(node, s, s + 1); 3299 } 3300 3301 extern void 3302 onig_node_str_clear(Node* node, int need_free) 3303 { 3304 if (need_free != 0 && 3305 STR_(node)->capacity != 0 && 3306 IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != STR_(node)->buf) { 3307 xfree(STR_(node)->s); 3308 } 3309 3310 STR_(node)->flag = 0; 3311 STR_(node)->s = STR_(node)->buf; 3312 STR_(node)->end = STR_(node)->buf; 3313 STR_(node)->capacity = 0; 3314 } 3315 3316 static int 3317 node_set_str(Node* node, const UChar* s, const UChar* end) 3318 { 3319 int r; 3320 3321 NODE_SET_TYPE(node, NODE_STRING); 3322 STR_(node)->flag = 0; 3323 STR_(node)->s = STR_(node)->buf; 3324 
STR_(node)->end = STR_(node)->buf; 3325 STR_(node)->capacity = 0; 3326 3327 r = onig_node_str_cat(node, s, end); 3328 return r; 3329 } 3330 3331 static Node* 3332 node_new_str(const UChar* s, const UChar* end) 3333 { 3334 int r; 3335 Node* node = node_new(); 3336 CHECK_NULL_RETURN(node); 3337 3338 r = node_set_str(node, s, end); 3339 if (r != 0) { 3340 onig_node_free(node); 3341 return NULL; 3342 } 3343 3344 return node; 3345 } 3346 3347 static int 3348 node_reset_str(Node* node, const UChar* s, const UChar* end) 3349 { 3350 node_free_body(node); 3351 return node_set_str(node, s, end); 3352 } 3353 3354 extern int 3355 onig_node_reset_empty(Node* node) 3356 { 3357 return node_reset_str(node, NULL, NULL); 3358 } 3359 3360 extern Node* 3361 onig_node_new_str(const UChar* s, const UChar* end) 3362 { 3363 return node_new_str(s, end); 3364 } 3365 3366 static Node* 3367 node_new_str_with_options(const UChar* s, const UChar* end, 3368 OnigOptionType options) 3369 { 3370 Node* node; 3371 node = node_new_str(s, end); 3372 3373 if (OPTON_IGNORECASE(options)) 3374 NODE_STATUS_ADD(node, IGNORECASE); 3375 3376 return node; 3377 } 3378 3379 static Node* 3380 node_new_str_crude(UChar* s, UChar* end, OnigOptionType options) 3381 { 3382 Node* node = node_new_str_with_options(s, end, options); 3383 CHECK_NULL_RETURN(node); 3384 NODE_STRING_SET_CRUDE(node); 3385 return node; 3386 } 3387 3388 static Node* 3389 node_new_empty(void) 3390 { 3391 return node_new_str(NULL, NULL); 3392 } 3393 3394 static Node* 3395 node_new_str_crude_char(UChar c, OnigOptionType options) 3396 { 3397 int i; 3398 UChar p[1]; 3399 Node* node; 3400 3401 p[0] = c; 3402 node = node_new_str_crude(p, p + 1, options); 3403 3404 /* clear buf tail */ 3405 for (i = 1; i < NODE_STRING_BUF_SIZE; i++) 3406 STR_(node)->buf[i] = '\0'; 3407 3408 return node; 3409 } 3410 3411 static Node* 3412 str_node_split_last_char(Node* node, OnigEncoding enc) 3413 { 3414 const UChar *p; 3415 Node* rn; 3416 StrNode* sn; 3417 3418 sn = 
STR_(node); 3419 rn = NULL_NODE; 3420 if (sn->end > sn->s) { 3421 p = onigenc_get_prev_char_head(enc, sn->s, sn->end); 3422 if (p && p > sn->s) { /* can be split. */ 3423 rn = node_new_str(p, sn->end); 3424 CHECK_NULL_RETURN(rn); 3425 3426 sn->end = (UChar* )p; 3427 STR_(rn)->flag = sn->flag; 3428 NODE_STATUS(rn) = NODE_STATUS(node); 3429 } 3430 } 3431 3432 return rn; 3433 } 3434 3435 static int 3436 str_node_can_be_split(Node* node, OnigEncoding enc) 3437 { 3438 StrNode* sn = STR_(node); 3439 if (sn->end > sn->s) { 3440 return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0); 3441 } 3442 return 0; 3443 } 3444 3445 static int 3446 scan_number(UChar** src, const UChar* end, OnigEncoding enc) 3447 { 3448 int num, val; 3449 OnigCodePoint c; 3450 UChar* p = *src; 3451 PFETCH_READY; 3452 3453 num = 0; 3454 while (! PEND) { 3455 PFETCH(c); 3456 if (IS_CODE_DIGIT_ASCII(enc, c)) { 3457 val = (int )DIGITVAL(c); 3458 if ((ONIG_INT_MAX - val) / 10 < num) 3459 return -1; /* overflow */ 3460 3461 num = num * 10 + val; 3462 } 3463 else { 3464 PUNFETCH; 3465 break; 3466 } 3467 } 3468 *src = p; 3469 return num; 3470 } 3471 3472 static int 3473 scan_hexadecimal_number(UChar** src, UChar* end, int minlen, int maxlen, 3474 OnigEncoding enc, OnigCodePoint* rcode) 3475 { 3476 OnigCodePoint code; 3477 OnigCodePoint c; 3478 unsigned int val; 3479 int n; 3480 UChar* p = *src; 3481 PFETCH_READY; 3482 3483 code = 0; 3484 n = 0; 3485 while (! 
PEND && n < maxlen) { 3486 PFETCH(c); 3487 if (IS_CODE_XDIGIT_ASCII(enc, c)) { 3488 n++; 3489 val = (unsigned int )XDIGITVAL(enc, c); 3490 if ((UINT_MAX - val) / 16UL < code) 3491 return ONIGERR_TOO_BIG_NUMBER; /* overflow */ 3492 3493 code = (code << 4) + val; 3494 } 3495 else { 3496 PUNFETCH; 3497 break; 3498 } 3499 } 3500 3501 if (n < minlen) 3502 return ONIGERR_INVALID_CODE_POINT_VALUE; 3503 3504 *rcode = code; 3505 *src = p; 3506 return ONIG_NORMAL; 3507 } 3508 3509 static int 3510 scan_octal_number(UChar** src, UChar* end, int minlen, int maxlen, 3511 OnigEncoding enc, OnigCodePoint* rcode) 3512 { 3513 OnigCodePoint code; 3514 OnigCodePoint c; 3515 unsigned int val; 3516 int n; 3517 UChar* p = *src; 3518 PFETCH_READY; 3519 3520 code = 0; 3521 n = 0; 3522 while (! PEND && n < maxlen) { 3523 PFETCH(c); 3524 if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8') { 3525 n++; 3526 val = (unsigned int )ODIGITVAL(c); 3527 if ((UINT_MAX - val) / 8UL < code) 3528 return ONIGERR_TOO_BIG_NUMBER; /* overflow */ 3529 3530 code = (code << 3) + val; 3531 } 3532 else { 3533 PUNFETCH; 3534 break; 3535 } 3536 } 3537 3538 if (n < minlen) 3539 return ONIGERR_INVALID_CODE_POINT_VALUE; 3540 3541 *rcode = code; 3542 *src = p; 3543 return ONIG_NORMAL; 3544 } 3545 3546 3547 #define BB_WRITE_CODE_POINT(bbuf,pos,code) \ 3548 BB_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT) 3549 3550 /* data format: 3551 [n][from-1][to-1][from-2][to-2] ... 
[from-n][to-n] 3552 (all data size is OnigCodePoint) 3553 */ 3554 static int 3555 new_code_range(BBuf** pbuf) 3556 { 3557 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5) 3558 int r; 3559 OnigCodePoint n; 3560 BBuf* bbuf; 3561 3562 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf)); 3563 CHECK_NULL_RETURN_MEMERR(bbuf); 3564 r = BB_INIT(bbuf, INIT_MULTI_BYTE_RANGE_SIZE); 3565 if (r != 0) { 3566 xfree(bbuf); 3567 *pbuf = 0; 3568 return r; 3569 } 3570 3571 n = 0; 3572 BB_WRITE_CODE_POINT(bbuf, 0, n); 3573 return 0; 3574 } 3575 3576 static int 3577 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to) 3578 { 3579 int r, inc_n, pos; 3580 int low, high, bound, x; 3581 OnigCodePoint n, *data; 3582 BBuf* bbuf; 3583 3584 if (from > to) { 3585 n = from; from = to; to = n; 3586 } 3587 3588 if (IS_NULL(*pbuf)) { 3589 r = new_code_range(pbuf); 3590 if (r != 0) return r; 3591 bbuf = *pbuf; 3592 n = 0; 3593 } 3594 else { 3595 bbuf = *pbuf; 3596 GET_CODE_POINT(n, bbuf->p); 3597 } 3598 data = (OnigCodePoint* )(bbuf->p); 3599 data++; 3600 3601 for (low = 0, bound = n; low < bound; ) { 3602 x = (low + bound) >> 1; 3603 if (from > data[x*2 + 1]) 3604 low = x + 1; 3605 else 3606 bound = x; 3607 } 3608 3609 high = (to == ~((OnigCodePoint )0)) ? 
n : low; 3610 for (bound = n; high < bound; ) { 3611 x = (high + bound) >> 1; 3612 if (to + 1 >= data[x*2]) 3613 high = x + 1; 3614 else 3615 bound = x; 3616 } 3617 3618 inc_n = low + 1 - high; 3619 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM) 3620 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES; 3621 3622 if (inc_n != 1) { 3623 if (from > data[low*2]) 3624 from = data[low*2]; 3625 if (to < data[(high - 1)*2 + 1]) 3626 to = data[(high - 1)*2 + 1]; 3627 } 3628 3629 if (inc_n != 0 && (OnigCodePoint )high < n) { 3630 int from_pos = SIZE_CODE_POINT * (1 + high * 2); 3631 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2); 3632 int size = (n - high) * 2 * SIZE_CODE_POINT; 3633 3634 if (inc_n > 0) { 3635 BB_MOVE_RIGHT(bbuf, from_pos, to_pos, size); 3636 } 3637 else { 3638 BB_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos); 3639 } 3640 } 3641 3642 pos = SIZE_CODE_POINT * (1 + low * 2); 3643 BB_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2); 3644 BB_WRITE_CODE_POINT(bbuf, pos, from); 3645 BB_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to); 3646 n += inc_n; 3647 BB_WRITE_CODE_POINT(bbuf, 0, n); 3648 3649 return 0; 3650 } 3651 3652 static int 3653 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) 3654 { 3655 if (from > to) { 3656 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) 3657 return 0; 3658 else 3659 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; 3660 } 3661 3662 return add_code_range_to_buf(pbuf, from, to); 3663 } 3664 3665 static int 3666 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf) 3667 { 3668 int r, i, n; 3669 OnigCodePoint pre, from, *data, to = 0; 3670 3671 *pbuf = (BBuf* )NULL; 3672 if (IS_NULL(bbuf)) { 3673 set_all: 3674 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); 3675 } 3676 3677 data = (OnigCodePoint* )(bbuf->p); 3678 GET_CODE_POINT(n, data); 3679 data++; 3680 if (n <= 0) goto set_all; 3681 3682 r = 0; 3683 pre = MBCODE_START_POS(enc); 3684 for (i = 0; i < n; i++) { 3685 from = data[i*2]; 3686 to = data[i*2+1]; 
3687 if (pre <= from - 1) { 3688 r = add_code_range_to_buf(pbuf, pre, from - 1); 3689 if (r != 0) return r; 3690 } 3691 if (to == ~((OnigCodePoint )0)) break; 3692 pre = to + 1; 3693 } 3694 if (to < ~((OnigCodePoint )0)) { 3695 r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0)); 3696 } 3697 return r; 3698 } 3699 3700 #define SWAP_BB_NOT(bbuf1, not1, bbuf2, not2) do {\ 3701 BBuf *tbuf; \ 3702 int tnot; \ 3703 tnot = not1; not1 = not2; not2 = tnot; \ 3704 tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \ 3705 } while (0) 3706 3707 static int 3708 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1, 3709 BBuf* bbuf2, int not2, BBuf** pbuf) 3710 { 3711 int r; 3712 OnigCodePoint i, n1, *data1; 3713 OnigCodePoint from, to; 3714 3715 *pbuf = (BBuf* )NULL; 3716 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) { 3717 if (not1 != 0 || not2 != 0) 3718 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); 3719 return 0; 3720 } 3721 3722 r = 0; 3723 if (IS_NULL(bbuf2)) 3724 SWAP_BB_NOT(bbuf1, not1, bbuf2, not2); 3725 3726 if (IS_NULL(bbuf1)) { 3727 if (not1 != 0) { 3728 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); 3729 } 3730 else { 3731 if (not2 == 0) { 3732 return bbuf_clone(pbuf, bbuf2); 3733 } 3734 else { 3735 return not_code_range_buf(enc, bbuf2, pbuf); 3736 } 3737 } 3738 } 3739 3740 if (not1 != 0) 3741 SWAP_BB_NOT(bbuf1, not1, bbuf2, not2); 3742 3743 data1 = (OnigCodePoint* )(bbuf1->p); 3744 GET_CODE_POINT(n1, data1); 3745 data1++; 3746 3747 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */ 3748 r = bbuf_clone(pbuf, bbuf2); 3749 } 3750 else if (not1 == 0) { /* 1 OR (not 2) */ 3751 r = not_code_range_buf(enc, bbuf2, pbuf); 3752 } 3753 if (r != 0) return r; 3754 3755 for (i = 0; i < n1; i++) { 3756 from = data1[i*2]; 3757 to = data1[i*2+1]; 3758 r = add_code_range_to_buf(pbuf, from, to); 3759 if (r != 0) return r; 3760 } 3761 return 0; 3762 } 3763 3764 static int 3765 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1, 3766 OnigCodePoint* data, int n) 3767 { 3768 int i, r; 
3769 OnigCodePoint from2, to2; 3770 3771 for (i = 0; i < n; i++) { 3772 from2 = data[i*2]; 3773 to2 = data[i*2+1]; 3774 if (from2 < from1) { 3775 if (to2 < from1) continue; 3776 else { 3777 from1 = to2 + 1; 3778 } 3779 } 3780 else if (from2 <= to1) { 3781 if (to2 < to1) { 3782 if (from1 <= from2 - 1) { 3783 r = add_code_range_to_buf(pbuf, from1, from2-1); 3784 if (r != 0) return r; 3785 } 3786 from1 = to2 + 1; 3787 } 3788 else { 3789 to1 = from2 - 1; 3790 } 3791 } 3792 else { 3793 from1 = from2; 3794 } 3795 if (from1 > to1) break; 3796 } 3797 if (from1 <= to1) { 3798 r = add_code_range_to_buf(pbuf, from1, to1); 3799 if (r != 0) return r; 3800 } 3801 return 0; 3802 } 3803 3804 static int 3805 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) 3806 { 3807 int r; 3808 OnigCodePoint i, j, n1, n2, *data1, *data2; 3809 OnigCodePoint from, to, from1, to1, from2, to2; 3810 3811 *pbuf = (BBuf* )NULL; 3812 if (IS_NULL(bbuf1)) { 3813 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */ 3814 return bbuf_clone(pbuf, bbuf2); 3815 return 0; 3816 } 3817 else if (IS_NULL(bbuf2)) { 3818 if (not2 != 0) 3819 return bbuf_clone(pbuf, bbuf1); 3820 return 0; 3821 } 3822 3823 if (not1 != 0) 3824 SWAP_BB_NOT(bbuf1, not1, bbuf2, not2); 3825 3826 data1 = (OnigCodePoint* )(bbuf1->p); 3827 data2 = (OnigCodePoint* )(bbuf2->p); 3828 GET_CODE_POINT(n1, data1); 3829 GET_CODE_POINT(n2, data2); 3830 data1++; 3831 data2++; 3832 3833 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */ 3834 for (i = 0; i < n1; i++) { 3835 from1 = data1[i*2]; 3836 to1 = data1[i*2+1]; 3837 for (j = 0; j < n2; j++) { 3838 from2 = data2[j*2]; 3839 to2 = data2[j*2+1]; 3840 if (from2 > to1) break; 3841 if (to2 < from1) continue; 3842 from = MAX(from1, from2); 3843 to = MIN(to1, to2); 3844 r = add_code_range_to_buf(pbuf, from, to); 3845 if (r != 0) return r; 3846 } 3847 } 3848 } 3849 else if (not1 == 0) { /* 1 AND (not 2) */ 3850 for (i = 0; i < n1; i++) { 3851 from1 = data1[i*2]; 3852 to1 = 
data1[i*2+1]; 3853 r = and_code_range1(pbuf, from1, to1, data2, n2); 3854 if (r != 0) return r; 3855 } 3856 } 3857 3858 return 0; 3859 } 3860 3861 static int 3862 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) 3863 { 3864 int r, not1, not2; 3865 BBuf *buf1, *buf2, *pbuf; 3866 BitSetRef bsr1, bsr2; 3867 BitSet bs1, bs2; 3868 3869 not1 = IS_NCCLASS_NOT(dest); 3870 bsr1 = dest->bs; 3871 buf1 = dest->mbuf; 3872 not2 = IS_NCCLASS_NOT(cc); 3873 bsr2 = cc->bs; 3874 buf2 = cc->mbuf; 3875 3876 if (not1 != 0) { 3877 bitset_invert_to(bsr1, bs1); 3878 bsr1 = bs1; 3879 } 3880 if (not2 != 0) { 3881 bitset_invert_to(bsr2, bs2); 3882 bsr2 = bs2; 3883 } 3884 bitset_and(bsr1, bsr2); 3885 if (bsr1 != dest->bs) { 3886 bitset_copy(dest->bs, bsr1); 3887 } 3888 if (not1 != 0) { 3889 bitset_invert(dest->bs); 3890 } 3891 3892 if (! ONIGENC_IS_SINGLEBYTE(enc)) { 3893 if (not1 != 0 && not2 != 0) { 3894 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf); 3895 } 3896 else { 3897 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf); 3898 if (r == 0 && not1 != 0) { 3899 BBuf *tbuf; 3900 r = not_code_range_buf(enc, pbuf, &tbuf); 3901 if (r != 0) { 3902 bbuf_free(pbuf); 3903 return r; 3904 } 3905 bbuf_free(pbuf); 3906 pbuf = tbuf; 3907 } 3908 } 3909 if (r != 0) return r; 3910 3911 dest->mbuf = pbuf; 3912 bbuf_free(buf1); 3913 return r; 3914 } 3915 return 0; 3916 } 3917 3918 static int 3919 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) 3920 { 3921 int r, not1, not2; 3922 BBuf *buf1, *buf2, *pbuf; 3923 BitSetRef bsr1, bsr2; 3924 BitSet bs1, bs2; 3925 3926 not1 = IS_NCCLASS_NOT(dest); 3927 bsr1 = dest->bs; 3928 buf1 = dest->mbuf; 3929 not2 = IS_NCCLASS_NOT(cc); 3930 bsr2 = cc->bs; 3931 buf2 = cc->mbuf; 3932 3933 if (not1 != 0) { 3934 bitset_invert_to(bsr1, bs1); 3935 bsr1 = bs1; 3936 } 3937 if (not2 != 0) { 3938 bitset_invert_to(bsr2, bs2); 3939 bsr2 = bs2; 3940 } 3941 bitset_or(bsr1, bsr2); 3942 if (bsr1 != dest->bs) { 3943 bitset_copy(dest->bs, bsr1); 3944 } 3945 
if (not1 != 0) { 3946 bitset_invert(dest->bs); 3947 } 3948 3949 if (! ONIGENC_IS_SINGLEBYTE(enc)) { 3950 if (not1 != 0 && not2 != 0) { 3951 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf); 3952 } 3953 else { 3954 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf); 3955 if (r == 0 && not1 != 0) { 3956 BBuf *tbuf; 3957 r = not_code_range_buf(enc, pbuf, &tbuf); 3958 if (r != 0) { 3959 bbuf_free(pbuf); 3960 return r; 3961 } 3962 bbuf_free(pbuf); 3963 pbuf = tbuf; 3964 } 3965 } 3966 if (r != 0) return r; 3967 3968 dest->mbuf = pbuf; 3969 bbuf_free(buf1); 3970 return r; 3971 } 3972 else 3973 return 0; 3974 } 3975 3976 static OnigCodePoint 3977 conv_backslash_value(OnigCodePoint c, ScanEnv* env) 3978 { 3979 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { 3980 switch (c) { 3981 case 'n': return '\n'; 3982 case 't': return '\t'; 3983 case 'r': return '\r'; 3984 case 'f': return '\f'; 3985 case 'a': return '\007'; 3986 case 'b': return '\010'; 3987 case 'e': return '\033'; 3988 case 'v': 3989 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB)) 3990 return '\v'; 3991 break; 3992 3993 default: 3994 break; 3995 } 3996 } 3997 return c; 3998 } 3999 4000 static int 4001 is_invalid_quantifier_target(Node* node) 4002 { 4003 switch (NODE_TYPE(node)) { 4004 case NODE_ANCHOR: 4005 case NODE_GIMMICK: 4006 return 1; 4007 break; 4008 4009 case NODE_BAG: 4010 /* allow enclosed elements */ 4011 /* return is_invalid_quantifier_target(NODE_BODY(node)); */ 4012 break; 4013 4014 case NODE_LIST: 4015 do { 4016 if (! 
is_invalid_quantifier_target(NODE_CAR(node))) return 0; 4017 } while (IS_NOT_NULL(node = NODE_CDR(node))); 4018 return 0; 4019 break; 4020 4021 case NODE_ALT: 4022 do { 4023 if (is_invalid_quantifier_target(NODE_CAR(node))) return 1; 4024 } while (IS_NOT_NULL(node = NODE_CDR(node))); 4025 break; 4026 4027 default: 4028 break; 4029 } 4030 return 0; 4031 } 4032 4033 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */ 4034 static int 4035 quantifier_type_num(QuantNode* q) 4036 { 4037 if (q->greedy) { 4038 if (q->lower == 0) { 4039 if (q->upper == 1) return 0; 4040 else if (IS_INFINITE_REPEAT(q->upper)) return 1; 4041 } 4042 else if (q->lower == 1) { 4043 if (IS_INFINITE_REPEAT(q->upper)) return 2; 4044 } 4045 } 4046 else { 4047 if (q->lower == 0) { 4048 if (q->upper == 1) return 3; 4049 else if (IS_INFINITE_REPEAT(q->upper)) return 4; 4050 } 4051 else if (q->lower == 1) { 4052 if (IS_INFINITE_REPEAT(q->upper)) return 5; 4053 } 4054 } 4055 return -1; 4056 } 4057 4058 4059 enum ReduceType { 4060 RQ_ASIS = 0, /* as is */ 4061 RQ_DEL = 1, /* delete parent */ 4062 RQ_A, /* to '*' */ 4063 RQ_AQ, /* to '*?' */ 4064 RQ_QQ, /* to '??' */ 4065 RQ_P_QQ, /* to '+)??' */ 4066 RQ_PQ_Q /* to '+?)?' */ 4067 }; 4068 4069 static enum ReduceType ReduceTypeTable[6][6] = { 4070 {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */ 4071 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */ 4072 {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */ 4073 {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */ 4074 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */ 4075 {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' 
*/ 4076 }; 4077 4078 extern int 4079 onig_reduce_nested_quantifier(Node* pnode) 4080 { 4081 int pnum, cnum; 4082 QuantNode *p, *c; 4083 Node* cnode; 4084 4085 cnode = NODE_BODY(pnode); 4086 4087 p = QUANT_(pnode); 4088 c = QUANT_(cnode); 4089 pnum = quantifier_type_num(p); 4090 cnum = quantifier_type_num(c); 4091 if (pnum < 0 || cnum < 0) { 4092 if (p->lower == p->upper && c->lower == c->upper) { 4093 int n = onig_positive_int_multiply(p->lower, c->lower); 4094 if (n < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 4095 4096 p->lower = p->upper = n; 4097 NODE_BODY(pnode) = NODE_BODY(cnode); 4098 goto remove_cnode; 4099 } 4100 4101 return 0; 4102 } 4103 4104 switch(ReduceTypeTable[cnum][pnum]) { 4105 case RQ_DEL: 4106 *pnode = *cnode; 4107 goto remove_cnode; 4108 break; 4109 case RQ_A: 4110 NODE_BODY(pnode) = NODE_BODY(cnode); 4111 p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1; 4112 goto remove_cnode; 4113 break; 4114 case RQ_AQ: 4115 NODE_BODY(pnode) = NODE_BODY(cnode); 4116 p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0; 4117 goto remove_cnode; 4118 break; 4119 case RQ_QQ: 4120 NODE_BODY(pnode) = NODE_BODY(cnode); 4121 p->lower = 0; p->upper = 1; p->greedy = 0; 4122 goto remove_cnode; 4123 break; 4124 case RQ_P_QQ: 4125 p->lower = 0; p->upper = 1; p->greedy = 0; 4126 c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1; 4127 break; 4128 case RQ_PQ_Q: 4129 p->lower = 0; p->upper = 1; p->greedy = 1; 4130 c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0; 4131 break; 4132 case RQ_ASIS: 4133 break; 4134 } 4135 4136 return 0; 4137 4138 remove_cnode: 4139 NODE_BODY(cnode) = NULL_NODE; 4140 onig_node_free(cnode); 4141 return 0; 4142 } 4143 4144 static int 4145 node_new_general_newline(Node** node, ScanEnv* env) 4146 { 4147 int r; 4148 int dlen, alen; 4149 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2]; 4150 Node* crnl; 4151 Node* ncc; 4152 Node* x; 4153 CClassNode* cc; 4154 4155 dlen = ONIGENC_CODE_TO_MBC(env->enc, 0x0d, buf); 4156 if (dlen < 
/*
 * Token kinds stored in PToken.type by the tokenizers
 * (fetch_token for the pattern body, fetch_token_in_cc inside [...]).
 */
enum TokenSyms {
  TK_EOT      = 0,   /* end of token */
  TK_CRUDE_BYTE = 1, /* single byte from \xHH or an octal escape (u.byte) */
  TK_CHAR,           /* initial type of every token in fetch_token_in_cc */
  TK_STRING,         /* initial type of every token in fetch_token */
  TK_CODE_POINT,     /* code point from \x{...}, \o{...}, \uHHHH, or an
                        escape whose value differs from the literal char */
  TK_ANYCHAR,
  TK_CHAR_TYPE,      /* \w \W \d \D \s \S \h \H (u.prop) */
  TK_BACKREF,        /* \1 ... or \k<name> (u.backref) */
  TK_CALL,           /* \g<name> (u.call) */
  TK_ANCHOR,         /* \b \B \y \Y \A \Z \z \G ... */
  TK_REPEAT,         /* e.g. escaped '*', '+', '?' (u.repeat) */
  TK_INTERVAL,       /* {n,m}; filled in by fetch_interval (u.repeat) */
  TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_OPEN_CC,
  TK_QUOTE_OPEN,     /* \Q */
  TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */
  TK_KEEP,             /* \K */
  TK_GENERAL_NEWLINE,  /* \R */
  TK_NO_NEWLINE,       /* \N */
  TK_TRUE_ANYCHAR,     /* \O */
  TK_TEXT_SEGMENT,     /* \X */

  /* in cc */
  TK_CC_CLOSE,               /* ] */
  TK_CC_RANGE,               /* - */
  TK_CC_POSIX_BRACKET_OPEN,  /* [: with a matching :] ahead */
  TK_CC_AND,           /* && */
  TK_CC_OPEN_CC        /* [ */
};
/*
 * Parse the inside of an interval quantifier: the text that follows '{'.
 *
 * src: in/out cursor; on entry *src points just after '{'.  It is
 *      advanced past the closing '}' only when a valid interval is read.
 * end: end of the pattern.
 * tok: on success receives type TK_INTERVAL and u.repeat.lower / upper /
 *      possessive (u.repeat.greedy is not written here).
 * env: supplies the encoding and the syntax flags.
 *
 * Returns:
 *    0  normal interval {n,m} / {n,} / {,n}
 *    2  fixed interval {n}
 *    1  not an interval but tolerated (ONIG_SYN_ALLOW_INVALID_INTERVAL);
 *       *src is left unchanged
 *   <0  an ONIGERR_* code
 */
static int
fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env)
{
  int low, up, syn_allow, non_low = 0;
  int r = 0;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;
  PFETCH_READY;

  syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);

  if (PEND) {
    if (syn_allow)
      return 1;  /* "....{" : OK! */
    else
      return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
  }

  if (! syn_allow) {
    /* in strict mode, '{' directly followed by ')', '(' or '|' is rejected
       with the same error as a pattern ending at '{' */
    c = PPEEK;
    if (c == ')' || c == '(' || c == '|') {
      return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
    }
  }

  low = scan_number(&p, end, env->enc);
  if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
  if (low > ONIG_MAX_REPEAT_NUM)
    return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

  if (p == *src) { /* can't read low */
    if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
      /* allow {,n} as {0,n} */
      low = 0;
      non_low = 1;   /* remember that the lower bound was omitted */
    }
    else
      goto invalid;
  }

  if (PEND) goto invalid;
  PFETCH(c);
  if (c == ',') {
    UChar* prev = p;
    up = scan_number(&p, end, env->enc);
    if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
    if (up > ONIG_MAX_REPEAT_NUM)
      return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

    if (p == prev) {
      /* no upper bound was read; "{,}" (both omitted) is not an interval */
      if (non_low != 0)
        goto invalid;
      up = INFINITE_REPEAT;  /* {n,} : {n,infinite} */
    }
  }
  else {
    /* no comma: a lower bound must have been read */
    if (non_low != 0)
      goto invalid;

    PUNFETCH;  /* put back the char that was not ',' */
    up = low;  /* {n} : exact n times */
    r = 2;     /* fixed */
  }

  if (PEND) goto invalid;
  PFETCH(c);
  if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
    /* with escaped-brace syntax the closing brace must be escaped too */
    if (c != MC_ESC(env->syntax) || PEND) goto invalid;
    PFETCH(c);
  }
  if (c != '}') goto invalid;

  if (!IS_INFINITE_REPEAT(up) && low > up) {
    /* {n,m}+ supported case */
    /* A reversed range is an error when the syntax has the '+' possessive
       interval; otherwise it is accepted as a possessive interval with
       the two bounds swapped. */
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL))
      return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;

    tok->u.repeat.possessive = 1;
    {
      int tmp;
      tmp = low; low = up; up = tmp;
    }
  }
  else
    tok->u.repeat.possessive = 0;

  tok->type = TK_INTERVAL;
  tok->u.repeat.lower = low;
  tok->u.repeat.upper = up;
  *src = p;
  return r; /* 0: normal {n,m}, 2: fixed {n} */

 invalid:
  if (syn_allow) {
    /* *src = p; */ /* !!! Don't do this line !!! */
    /* *src is intentionally left untouched on this path, so none of the
       text scanned above is consumed */
    return 1;  /* OK */
  }
  else
    return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
}
/*
 * Decode the character that follows an escape character:
 * \M-x, \C-x, \cx, or any other single escaped character.
 *
 * src: in/out cursor; on entry *src points just after the escape
 *      character.  It is advanced only on success.
 * end: end of the pattern.
 * env: supplies the encoding and the syntax flags.
 * val: receives the resulting code point on success.
 *
 * Returns 0 on success or a negative ONIGERR_* code.
 * The function recurses when the operand of \M- / \C- / \c is itself an
 * escape sequence (e.g. \M-\C-x).
 */
static int
fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
{
  int v;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;

  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

  PFETCH_S(c);
  switch (c) {
  case 'M':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
      /* \M-x : requires the '-' and one operand character */
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_META_CODE_SYNTAX;
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c == MC_ESC(env->syntax)) {
        /* operand is itself escaped: decode it first */
        v = fetch_escaped_value(&p, end, env, &c);
        if (v < 0) return v;
      }
      /* meta: keep the low byte and set bit 7 */
      c = ((c & 0xff) | 0x80);
    }
    else
      goto backslash;
    break;

  case 'C':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
      /* \C-x : after the '-' it is handled exactly like \cx */
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
      goto control;
    }
    else
      goto backslash;

  case 'c':
    if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
    control:
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c == '?') {
        c = 0177;   /* \c? yields octal 177 */
      }
      else {
        if (c == MC_ESC(env->syntax)) {
          /* operand is itself escaped: decode it first */
          v = fetch_escaped_value(&p, end, env, &c);
          if (v < 0) return v;
        }
        /* control: mask with 0x9f (clears bits 5 and 6) */
        c &= 0x9f;
      }
      break;
    }
    /* fall through */
    /* \c without ONIG_SYN_OP_ESC_C_CONTROL is treated as an ordinary
       escaped character */

  default:
    {
    backslash:
      /* ordinary escaped character: translation is delegated to
         conv_backslash_value() */
      c = conv_backslash_value(c, env);
    }
    break;
  }

  *src = p;
  *val = c;
  return 0;
}
/*
 * Classification of the text found between name delimiters, as reported
 * through the num_type out-parameter of fetch_name() and
 * fetch_name_with_level().
 */
enum REF_NUM {
  IS_NOT_NUM = 0,   /* a name, not a number */
  IS_ABS_NUM = 1,   /* begins with a digit: absolute group number */
  IS_REL_NUM = 2    /* begins with '+' or '-': relative group number */
};
(!ONIGENC_IS_CODE_WORD(enc, c)) { 4545 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 4546 } 4547 } 4548 4549 if (r == 0 && c != end_code) { 4550 if (c == '+' || c == '-') { 4551 int level; 4552 int flag = (c == '-' ? -1 : 1); 4553 4554 if (PEND) { 4555 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 4556 goto end; 4557 } 4558 PFETCH(c); 4559 if (! IS_CODE_DIGIT_ASCII(enc, c)) goto err; 4560 PUNFETCH; 4561 level = scan_number(&p, end, enc); 4562 if (level < 0) return ONIGERR_TOO_BIG_NUMBER; 4563 *rlevel = (level * flag); 4564 exist_level = 1; 4565 4566 if (!PEND) { 4567 PFETCH(c); 4568 if (c == end_code) 4569 goto end; 4570 } 4571 } 4572 4573 err: 4574 name_end = end; 4575 err2: 4576 r = ONIGERR_INVALID_GROUP_NAME; 4577 } 4578 4579 end: 4580 if (r == 0) { 4581 if (*num_type != IS_NOT_NUM) { 4582 *rback_num = scan_number(&pnum_head, name_end, enc); 4583 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; 4584 else if (*rback_num == 0) { 4585 if (*num_type == IS_REL_NUM) 4586 goto err2; 4587 } 4588 4589 *rback_num *= sign; 4590 } 4591 4592 *rname_end = name_end; 4593 *src = p; 4594 return (exist_level ? 
1 : 0); 4595 } 4596 else { 4597 onig_scan_env_set_error_string(env, r, *src, name_end); 4598 return r; 4599 } 4600 } 4601 #endif /* USE_BACKREF_WITH_LEVEL */ 4602 4603 /* 4604 ref: 0 -> define name (don't allow number name) 4605 1 -> reference name (allow number name) 4606 */ 4607 static int 4608 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, 4609 UChar** rname_end, ScanEnv* env, int* rback_num, 4610 enum REF_NUM* num_type, int is_ref) 4611 { 4612 int r, sign; 4613 int digit_count; 4614 OnigCodePoint end_code; 4615 OnigCodePoint c = 0; 4616 OnigEncoding enc = env->enc; 4617 UChar *name_end; 4618 UChar *pnum_head; 4619 UChar *p = *src; 4620 4621 *rback_num = 0; 4622 4623 end_code = get_name_end_code_point(start_code); 4624 4625 digit_count = 0; 4626 name_end = end; 4627 pnum_head = *src; 4628 r = 0; 4629 *num_type = IS_NOT_NUM; 4630 sign = 1; 4631 if (PEND) { 4632 return ONIGERR_EMPTY_GROUP_NAME; 4633 } 4634 else { 4635 PFETCH_S(c); 4636 if (c == end_code) 4637 return ONIGERR_EMPTY_GROUP_NAME; 4638 4639 if (IS_CODE_DIGIT_ASCII(enc, c)) { 4640 if (is_ref == TRUE) 4641 *num_type = IS_ABS_NUM; 4642 else { 4643 r = ONIGERR_INVALID_GROUP_NAME; 4644 } 4645 digit_count++; 4646 } 4647 else if (c == '-') { 4648 if (is_ref == TRUE) { 4649 *num_type = IS_REL_NUM; 4650 sign = -1; 4651 pnum_head = p; 4652 } 4653 else { 4654 r = ONIGERR_INVALID_GROUP_NAME; 4655 } 4656 } 4657 else if (c == '+') { 4658 if (is_ref == TRUE) { 4659 *num_type = IS_REL_NUM; 4660 sign = 1; 4661 pnum_head = p; 4662 } 4663 else { 4664 r = ONIGERR_INVALID_GROUP_NAME; 4665 } 4666 } 4667 else if (!ONIGENC_IS_CODE_WORD(enc, c)) { 4668 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 4669 } 4670 } 4671 4672 if (r == 0) { 4673 while (!PEND) { 4674 name_end = p; 4675 PFETCH_S(c); 4676 if (c == end_code || c == ')') { 4677 if (*num_type != IS_NOT_NUM && digit_count == 0) 4678 r = ONIGERR_INVALID_GROUP_NAME; 4679 break; 4680 } 4681 4682 if (*num_type != IS_NOT_NUM) { 4683 if (IS_CODE_DIGIT_ASCII(enc, c)) { 
4684 digit_count++; 4685 } 4686 else { 4687 if (!ONIGENC_IS_CODE_WORD(enc, c)) 4688 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 4689 else 4690 r = ONIGERR_INVALID_GROUP_NAME; 4691 4692 *num_type = IS_NOT_NUM; 4693 } 4694 } 4695 else { 4696 if (!ONIGENC_IS_CODE_WORD(enc, c)) { 4697 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 4698 } 4699 } 4700 } 4701 4702 if (c != end_code) { 4703 r = ONIGERR_INVALID_GROUP_NAME; 4704 goto err; 4705 } 4706 4707 if (*num_type != IS_NOT_NUM) { 4708 *rback_num = scan_number(&pnum_head, name_end, enc); 4709 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; 4710 else if (*rback_num == 0) { 4711 if (*num_type == IS_REL_NUM) { 4712 r = ONIGERR_INVALID_GROUP_NAME; 4713 goto err; 4714 } 4715 } 4716 4717 *rback_num *= sign; 4718 } 4719 4720 *rname_end = name_end; 4721 *src = p; 4722 return 0; 4723 } 4724 else { 4725 while (!PEND) { 4726 name_end = p; 4727 PFETCH_S(c); 4728 if (c == end_code || c == ')') 4729 break; 4730 } 4731 if (PEND) 4732 name_end = end; 4733 4734 err: 4735 onig_scan_env_set_error_string(env, r, *src, name_end); 4736 return r; 4737 } 4738 } 4739 4740 static void 4741 CC_ESC_WARN(ScanEnv* env, UChar *c) 4742 { 4743 if (onig_warn == onig_null_warn) return ; 4744 4745 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) && 4746 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) { 4747 UChar buf[WARN_BUFSIZE]; 4748 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, 4749 env->pattern, env->pattern_end, 4750 (UChar* )"character class has '%s' without escape", 4751 c); 4752 (*onig_warn)((char* )buf); 4753 } 4754 } 4755 4756 static void 4757 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c) 4758 { 4759 if (onig_warn == onig_null_warn) return ; 4760 4761 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) { 4762 UChar buf[WARN_BUFSIZE]; 4763 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc, 4764 (env)->pattern, (env)->pattern_end, 4765 (UChar* )"regular expression has '%s' without escape", 
c); 4766 (*onig_warn)((char* )buf); 4767 } 4768 } 4769 4770 static UChar* 4771 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, 4772 UChar **next, OnigEncoding enc) 4773 { 4774 int i; 4775 OnigCodePoint x; 4776 UChar *q; 4777 UChar *p = from; 4778 4779 while (p < to) { 4780 x = ONIGENC_MBC_TO_CODE(enc, p, to); 4781 q = p + enclen(enc, p); 4782 if (x == s[0]) { 4783 for (i = 1; i < n && q < to; i++) { 4784 x = ONIGENC_MBC_TO_CODE(enc, q, to); 4785 if (x != s[i]) break; 4786 q += enclen(enc, q); 4787 } 4788 if (i >= n) { 4789 if (IS_NOT_NULL(next)) 4790 *next = q; 4791 return p; 4792 } 4793 } 4794 p = q; 4795 } 4796 return NULL_UCHARP; 4797 } 4798 4799 static int 4800 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, 4801 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn) 4802 { 4803 int i, in_esc; 4804 OnigCodePoint x; 4805 UChar *q; 4806 UChar *p = from; 4807 4808 in_esc = 0; 4809 while (p < to) { 4810 if (in_esc) { 4811 in_esc = 0; 4812 p += enclen(enc, p); 4813 } 4814 else { 4815 x = ONIGENC_MBC_TO_CODE(enc, p, to); 4816 q = p + enclen(enc, p); 4817 if (x == s[0]) { 4818 for (i = 1; i < n && q < to; i++) { 4819 x = ONIGENC_MBC_TO_CODE(enc, q, to); 4820 if (x != s[i]) break; 4821 q += enclen(enc, q); 4822 } 4823 if (i >= n) return 1; 4824 p += enclen(enc, p); 4825 } 4826 else { 4827 x = ONIGENC_MBC_TO_CODE(enc, p, to); 4828 if (x == bad) return 0; 4829 else if (x == MC_ESC(syn)) in_esc = 1; 4830 p = q; 4831 } 4832 } 4833 } 4834 return 0; 4835 } 4836 4837 static int 4838 fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) 4839 { 4840 int r; 4841 OnigCodePoint code; 4842 OnigCodePoint c, c2; 4843 OnigSyntaxType* syn = env->syntax; 4844 OnigEncoding enc = env->enc; 4845 UChar* prev; 4846 UChar* p = *src; 4847 PFETCH_READY; 4848 4849 if (PEND) { 4850 tok->type = TK_EOT; 4851 return tok->type; 4852 } 4853 4854 PFETCH(c); 4855 tok->type = TK_CHAR; 4856 tok->base = 0; 4857 tok->u.code = c; 4858 
tok->escaped = 0; 4859 4860 if (c == ']') { 4861 tok->type = TK_CC_CLOSE; 4862 } 4863 else if (c == '-') { 4864 tok->type = TK_CC_RANGE; 4865 } 4866 else if (c == MC_ESC(syn)) { 4867 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) 4868 goto end; 4869 4870 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; 4871 4872 PFETCH(c); 4873 tok->escaped = 1; 4874 tok->u.code = c; 4875 switch (c) { 4876 case 'w': 4877 tok->type = TK_CHAR_TYPE; 4878 tok->u.prop.ctype = ONIGENC_CTYPE_WORD; 4879 tok->u.prop.not = 0; 4880 break; 4881 case 'W': 4882 tok->type = TK_CHAR_TYPE; 4883 tok->u.prop.ctype = ONIGENC_CTYPE_WORD; 4884 tok->u.prop.not = 1; 4885 break; 4886 case 'd': 4887 tok->type = TK_CHAR_TYPE; 4888 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; 4889 tok->u.prop.not = 0; 4890 break; 4891 case 'D': 4892 tok->type = TK_CHAR_TYPE; 4893 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; 4894 tok->u.prop.not = 1; 4895 break; 4896 case 's': 4897 tok->type = TK_CHAR_TYPE; 4898 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; 4899 tok->u.prop.not = 0; 4900 break; 4901 case 'S': 4902 tok->type = TK_CHAR_TYPE; 4903 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; 4904 tok->u.prop.not = 1; 4905 break; 4906 case 'h': 4907 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 4908 tok->type = TK_CHAR_TYPE; 4909 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 4910 tok->u.prop.not = 0; 4911 break; 4912 case 'H': 4913 if (! 
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 4914 tok->type = TK_CHAR_TYPE; 4915 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 4916 tok->u.prop.not = 1; 4917 break; 4918 4919 case 'p': 4920 case 'P': 4921 if (PEND) break; 4922 4923 c2 = PPEEK; 4924 if (c2 == '{' && 4925 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { 4926 PINC; 4927 tok->type = TK_CHAR_PROPERTY; 4928 tok->u.prop.not = c == 'P'; 4929 4930 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { 4931 PFETCH(c2); 4932 if (c2 == '^') { 4933 tok->u.prop.not = tok->u.prop.not == 0; 4934 } 4935 else 4936 PUNFETCH; 4937 } 4938 } 4939 break; 4940 4941 case 'o': 4942 if (PEND) break; 4943 4944 prev = p; 4945 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { 4946 PINC; 4947 r = scan_octal_number(&p, end, 0, 11, enc, &code); 4948 if (r < 0) return r; 4949 if (!PEND) { 4950 c2 = PPEEK; 4951 if (IS_CODE_DIGIT_ASCII(enc, c2)) 4952 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; 4953 } 4954 4955 if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) { 4956 PINC; 4957 tok->type = TK_CODE_POINT; 4958 tok->base = 8; 4959 tok->u.code = code; 4960 } 4961 else { 4962 /* can't read nothing or invalid format */ 4963 p = prev; 4964 } 4965 } 4966 break; 4967 4968 case 'x': 4969 if (PEND) break; 4970 4971 prev = p; 4972 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { 4973 PINC; 4974 r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code); 4975 if (r < 0) return r; 4976 if (!PEND) { 4977 c2 = PPEEK; 4978 if (IS_CODE_XDIGIT_ASCII(enc, c2)) 4979 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; 4980 } 4981 4982 if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) { 4983 PINC; 4984 tok->type = TK_CODE_POINT; 4985 tok->base = 16; 4986 tok->u.code = code; 4987 } 4988 else { 4989 /* can't read nothing or invalid format */ 4990 p = prev; 4991 } 4992 } 4993 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { 4994 r = scan_hexadecimal_number(&p, end, 
0, 2, enc, &code); 4995 if (r < 0) return r; 4996 if (p == prev) { /* can't read nothing. */ 4997 code = 0; /* but, it's not error */ 4998 } 4999 tok->type = TK_CRUDE_BYTE; 5000 tok->base = 16; 5001 tok->u.byte = (UChar )code; 5002 } 5003 break; 5004 5005 case 'u': 5006 if (PEND) break; 5007 5008 prev = p; 5009 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { 5010 r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); 5011 if (r < 0) return r; 5012 if (p == prev) { /* can't read nothing. */ 5013 code = 0; /* but, it's not error */ 5014 } 5015 tok->type = TK_CODE_POINT; 5016 tok->base = 16; 5017 tok->u.code = code; 5018 } 5019 break; 5020 5021 case '0': 5022 case '1': case '2': case '3': case '4': case '5': case '6': case '7': 5023 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { 5024 PUNFETCH; 5025 prev = p; 5026 r = scan_octal_number(&p, end, 0, 3, enc, &code); 5027 if (r < 0) return r; 5028 if (code >= 256) return ONIGERR_TOO_BIG_NUMBER; 5029 if (p == prev) { /* can't read nothing. 
*/ 5030 code = 0; /* but, it's not error */ 5031 } 5032 tok->type = TK_CRUDE_BYTE; 5033 tok->base = 8; 5034 tok->u.byte = (UChar )code; 5035 } 5036 break; 5037 5038 default: 5039 PUNFETCH; 5040 r = fetch_escaped_value(&p, end, env, &c2); 5041 if (r < 0) return r; 5042 if (tok->u.code != c2) { 5043 tok->u.code = c2; 5044 tok->type = TK_CODE_POINT; 5045 } 5046 break; 5047 } 5048 } 5049 else if (c == '[') { 5050 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) { 5051 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' }; 5052 tok->backp = p; /* point at '[' is read */ 5053 PINC; 5054 if (str_exist_check_with_esc(send, 2, p, end, 5055 (OnigCodePoint )']', enc, syn)) { 5056 tok->type = TK_CC_POSIX_BRACKET_OPEN; 5057 } 5058 else { 5059 PUNFETCH; 5060 goto cc_in_cc; 5061 } 5062 } 5063 else { 5064 cc_in_cc: 5065 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) { 5066 tok->type = TK_CC_OPEN_CC; 5067 } 5068 else { 5069 CC_ESC_WARN(env, (UChar* )"["); 5070 } 5071 } 5072 } 5073 else if (c == '&') { 5074 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) && 5075 !PEND && (PPEEK_IS('&'))) { 5076 PINC; 5077 tok->type = TK_CC_AND; 5078 } 5079 } 5080 5081 end: 5082 *src = p; 5083 return tok->type; 5084 } 5085 5086 static int 5087 fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) 5088 { 5089 int r; 5090 OnigCodePoint code; 5091 OnigCodePoint c; 5092 OnigEncoding enc = env->enc; 5093 OnigSyntaxType* syn = env->syntax; 5094 UChar* prev; 5095 UChar* p = *src; 5096 PFETCH_READY; 5097 5098 start: 5099 if (PEND) { 5100 tok->type = TK_EOT; 5101 return tok->type; 5102 } 5103 5104 tok->type = TK_STRING; 5105 tok->base = 0; 5106 tok->backp = p; 5107 5108 PFETCH(c); 5109 if (IS_MC_ESC_CODE(c, syn)) { 5110 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; 5111 5112 tok->backp = p; 5113 PFETCH(c); 5114 5115 tok->u.code = c; 5116 tok->escaped = 1; 5117 switch (c) { 5118 case '*': 5119 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break; 5120 tok->type = TK_REPEAT; 5121 tok->u.repeat.lower = 0; 5122 tok->u.repeat.upper = INFINITE_REPEAT; 5123 goto greedy_check; 5124 break; 5125 5126 case '+': 5127 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break; 5128 tok->type = TK_REPEAT; 5129 tok->u.repeat.lower = 1; 5130 tok->u.repeat.upper = INFINITE_REPEAT; 5131 goto greedy_check; 5132 break; 5133 5134 case '?': 5135 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break; 5136 tok->type = TK_REPEAT; 5137 tok->u.repeat.lower = 0; 5138 tok->u.repeat.upper = 1; 5139 greedy_check: 5140 tok->u.repeat.possessive = 0; 5141 greedy_check2: 5142 if (!PEND && PPEEK_IS('?') && 5143 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY) && 5144 tok->u.repeat.possessive == 0) { 5145 PFETCH(c); 5146 tok->u.repeat.greedy = 0; 5147 tok->u.repeat.possessive = 0; 5148 } 5149 else { 5150 possessive_check: 5151 tok->u.repeat.greedy = 1; 5152 if (!PEND && PPEEK_IS('+') && 5153 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && 5154 tok->type != TK_INTERVAL) || 5155 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && 5156 tok->type == TK_INTERVAL)) && 5157 tok->u.repeat.possessive == 0) { 5158 PFETCH(c); 5159 tok->u.repeat.possessive = 1; 5160 } 5161 } 5162 break; 5163 5164 case '{': 5165 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; 5166 r = fetch_interval(&p, end, tok, env); 5167 if (r < 0) return r; /* error */ 5168 if (r == 0) goto greedy_check2; 5169 else if (r == 2) { /* {n} */ 5170 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) 5171 goto possessive_check; 5172 5173 goto greedy_check2; 5174 } 5175 /* r == 1 : normal char */ 5176 break; 5177 5178 case '|': 5179 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break; 5180 tok->type = TK_ALT; 5181 break; 5182 5183 case '(': 5184 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; 5185 tok->type = TK_SUBEXP_OPEN; 5186 break; 5187 5188 case ')': 5189 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; 5190 tok->type = TK_SUBEXP_CLOSE; 5191 break; 5192 5193 case 'w': 5194 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; 5195 tok->type = TK_CHAR_TYPE; 5196 tok->u.prop.ctype = ONIGENC_CTYPE_WORD; 5197 tok->u.prop.not = 0; 5198 break; 5199 5200 case 'W': 5201 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; 5202 tok->type = TK_CHAR_TYPE; 5203 tok->u.prop.ctype = ONIGENC_CTYPE_WORD; 5204 tok->u.prop.not = 1; 5205 break; 5206 5207 case 'b': 5208 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; 5209 tok->type = TK_ANCHOR; 5210 tok->u.anchor = ANCR_WORD_BOUNDARY; 5211 break; 5212 5213 case 'B': 5214 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; 5215 tok->type = TK_ANCHOR; 5216 tok->u.anchor = ANCR_NO_WORD_BOUNDARY; 5217 break; 5218 5219 case 'y': 5220 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break; 5221 tok->type = TK_ANCHOR; 5222 tok->u.anchor = ANCR_TEXT_SEGMENT_BOUNDARY; 5223 break; 5224 5225 case 'Y': 5226 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break; 5227 tok->type = TK_ANCHOR; 5228 tok->u.anchor = ANCR_NO_TEXT_SEGMENT_BOUNDARY; 5229 break; 5230 5231 #ifdef USE_WORD_BEGIN_END 5232 case '<': 5233 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; 5234 tok->type = TK_ANCHOR; 5235 tok->u.anchor = ANCR_WORD_BEGIN; 5236 break; 5237 5238 case '>': 5239 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; 5240 tok->type = TK_ANCHOR; 5241 tok->u.anchor = ANCR_WORD_END; 5242 break; 5243 #endif 5244 5245 case 's': 5246 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; 5247 tok->type = TK_CHAR_TYPE; 5248 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; 5249 tok->u.prop.not = 0; 5250 break; 5251 5252 case 'S': 5253 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; 5254 tok->type = TK_CHAR_TYPE; 5255 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; 5256 tok->u.prop.not = 1; 5257 break; 5258 5259 case 'd': 5260 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; 5261 tok->type = TK_CHAR_TYPE; 5262 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; 5263 tok->u.prop.not = 0; 5264 break; 5265 5266 case 'D': 5267 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; 5268 tok->type = TK_CHAR_TYPE; 5269 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; 5270 tok->u.prop.not = 1; 5271 break; 5272 5273 case 'h': 5274 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 5275 tok->type = TK_CHAR_TYPE; 5276 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 5277 tok->u.prop.not = 0; 5278 break; 5279 5280 case 'H': 5281 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 5282 tok->type = TK_CHAR_TYPE; 5283 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 5284 tok->u.prop.not = 1; 5285 break; 5286 5287 case 'K': 5288 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) break; 5289 tok->type = TK_KEEP; 5290 break; 5291 5292 case 'R': 5293 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE)) break; 5294 tok->type = TK_GENERAL_NEWLINE; 5295 break; 5296 5297 case 'N': 5298 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT)) break; 5299 tok->type = TK_NO_NEWLINE; 5300 break; 5301 5302 case 'O': 5303 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT)) break; 5304 tok->type = TK_TRUE_ANYCHAR; 5305 break; 5306 5307 case 'X': 5308 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break; 5309 tok->type = TK_TEXT_SEGMENT; 5310 break; 5311 5312 case 'A': 5313 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; 5314 begin_buf: 5315 tok->type = TK_ANCHOR; 5316 tok->u.subtype = ANCR_BEGIN_BUF; 5317 break; 5318 5319 case 'Z': 5320 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; 5321 tok->type = TK_ANCHOR; 5322 tok->u.subtype = ANCR_SEMI_END_BUF; 5323 break; 5324 5325 case 'z': 5326 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; 5327 end_buf: 5328 tok->type = TK_ANCHOR; 5329 tok->u.subtype = ANCR_END_BUF; 5330 break; 5331 5332 case 'G': 5333 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break; 5334 tok->type = TK_ANCHOR; 5335 tok->u.subtype = ANCR_BEGIN_POSITION; 5336 break; 5337 5338 case '`': 5339 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; 5340 goto begin_buf; 5341 break; 5342 5343 case '\'': 5344 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; 5345 goto end_buf; 5346 break; 5347 5348 case 'o': 5349 if (PEND) break; 5350 5351 prev = p; 5352 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { 5353 PINC; 5354 r = scan_octal_number(&p, end, 0, 11, enc, &code); 5355 if (r < 0) return r; 5356 if (!PEND) { 5357 if (IS_CODE_DIGIT_ASCII(enc, PPEEK)) 5358 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; 5359 } 5360 5361 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) { 5362 PINC; 5363 tok->type = TK_CODE_POINT; 5364 tok->u.code = code; 5365 } 5366 else { 5367 /* can't read nothing or invalid format */ 5368 p = prev; 5369 } 5370 } 5371 break; 5372 5373 case 'x': 5374 if (PEND) break; 5375 5376 prev = p; 5377 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { 5378 PINC; 5379 r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code); 5380 if (r < 0) return r; 5381 if (!PEND) { 5382 if (IS_CODE_XDIGIT_ASCII(enc, PPEEK)) 5383 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; 5384 } 5385 5386 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) { 5387 PINC; 5388 tok->type = TK_CODE_POINT; 5389 tok->u.code = code; 5390 } 5391 else { 5392 /* can't read nothing or invalid format */ 5393 p = prev; 5394 } 5395 } 5396 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { 5397 r = 
scan_hexadecimal_number(&p, end, 0, 2, enc, &code); 5398 if (r < 0) return r; 5399 if (p == prev) { /* can't read nothing. */ 5400 code = 0; /* but, it's not error */ 5401 } 5402 tok->type = TK_CRUDE_BYTE; 5403 tok->base = 16; 5404 tok->u.byte = (UChar )code; 5405 } 5406 break; 5407 5408 case 'u': 5409 if (PEND) break; 5410 5411 prev = p; 5412 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { 5413 r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); 5414 if (r < 0) return r; 5415 if (p == prev) { /* can't read nothing. */ 5416 code = 0; /* but, it's not error */ 5417 } 5418 tok->type = TK_CODE_POINT; 5419 tok->base = 16; 5420 tok->u.code = code; 5421 } 5422 break; 5423 5424 case '1': case '2': case '3': case '4': 5425 case '5': case '6': case '7': case '8': case '9': 5426 PUNFETCH; 5427 prev = p; 5428 r = scan_number(&p, end, enc); 5429 if (r < 0 || r > ONIG_MAX_BACKREF_NUM) { 5430 goto skip_backref; 5431 } 5432 5433 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && 5434 (r <= env->num_mem || r <= 9)) { /* This spec. from GNU regex */ 5435 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { 5436 if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node)) 5437 return ONIGERR_INVALID_BACKREF; 5438 } 5439 5440 tok->type = TK_BACKREF; 5441 tok->u.backref.num = 1; 5442 tok->u.backref.ref1 = r; 5443 tok->u.backref.by_name = 0; 5444 #ifdef USE_BACKREF_WITH_LEVEL 5445 tok->u.backref.exist_level = 0; 5446 #endif 5447 break; 5448 } 5449 5450 skip_backref: 5451 if (c == '8' || c == '9') { 5452 /* normal char */ 5453 p = prev; PINC; 5454 break; 5455 } 5456 5457 p = prev; 5458 /* fall through */ 5459 case '0': 5460 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { 5461 prev = p; 5462 r = scan_octal_number(&p, end, 0, (c == '0' ? 2:3), enc, &code); 5463 if (r < 0 || r >= 256) return ONIGERR_TOO_BIG_NUMBER; 5464 if (p == prev) { /* can't read nothing. 
*/ 5465 code = 0; /* but, it's not error */ 5466 } 5467 tok->type = TK_CRUDE_BYTE; 5468 tok->base = 8; 5469 tok->u.byte = (UChar )code; 5470 } 5471 else if (c != '0') { 5472 PINC; 5473 } 5474 break; 5475 5476 case 'k': 5477 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { 5478 PFETCH(c); 5479 if (c == '<' || c == '\'') { 5480 UChar* name_end; 5481 int* backs; 5482 int back_num; 5483 enum REF_NUM num_type; 5484 5485 prev = p; 5486 5487 #ifdef USE_BACKREF_WITH_LEVEL 5488 name_end = NULL_UCHARP; /* no need. escape gcc warning. */ 5489 r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end, 5490 env, &back_num, &tok->u.backref.level, &num_type); 5491 if (r == 1) tok->u.backref.exist_level = 1; 5492 else tok->u.backref.exist_level = 0; 5493 #else 5494 r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, TRUE); 5495 #endif 5496 if (r < 0) return r; 5497 5498 if (num_type != IS_NOT_NUM) { 5499 if (num_type == IS_REL_NUM) { 5500 back_num = backref_rel_to_abs(back_num, env); 5501 } 5502 if (back_num <= 0) 5503 return ONIGERR_INVALID_BACKREF; 5504 5505 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { 5506 if (back_num > env->num_mem || 5507 IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) 5508 return ONIGERR_INVALID_BACKREF; 5509 } 5510 tok->type = TK_BACKREF; 5511 tok->u.backref.by_name = 0; 5512 tok->u.backref.num = 1; 5513 tok->u.backref.ref1 = back_num; 5514 } 5515 else { 5516 int num = name_to_group_numbers(env, prev, name_end, &backs); 5517 if (num <= 0) { 5518 return ONIGERR_UNDEFINED_NAME_REFERENCE; 5519 } 5520 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { 5521 int i; 5522 for (i = 0; i < num; i++) { 5523 if (backs[i] > env->num_mem || 5524 IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) 5525 return ONIGERR_INVALID_BACKREF; 5526 } 5527 } 5528 5529 tok->type = TK_BACKREF; 5530 tok->u.backref.by_name = 1; 5531 if (num == 1) { 5532 tok->u.backref.num = 1; 5533 tok->u.backref.ref1 = backs[0]; 5534 } 5535 else { 
5536 tok->u.backref.num = num; 5537 tok->u.backref.refs = backs; 5538 } 5539 } 5540 } 5541 else 5542 PUNFETCH; 5543 } 5544 break; 5545 5546 #ifdef USE_CALL 5547 case 'g': 5548 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { 5549 PFETCH(c); 5550 if (c == '<' || c == '\'') { 5551 int gnum; 5552 UChar* name_end; 5553 enum REF_NUM num_type; 5554 5555 prev = p; 5556 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, 5557 &gnum, &num_type, TRUE); 5558 if (r < 0) return r; 5559 5560 if (num_type != IS_NOT_NUM) { 5561 if (num_type == IS_REL_NUM) { 5562 gnum = backref_rel_to_abs(gnum, env); 5563 if (gnum < 0) { 5564 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, 5565 prev, name_end); 5566 return ONIGERR_UNDEFINED_GROUP_REFERENCE; 5567 } 5568 } 5569 tok->u.call.by_number = 1; 5570 tok->u.call.gnum = gnum; 5571 } 5572 else { 5573 tok->u.call.by_number = 0; 5574 tok->u.call.gnum = 0; 5575 } 5576 5577 tok->type = TK_CALL; 5578 tok->u.call.name = prev; 5579 tok->u.call.name_end = name_end; 5580 } 5581 else 5582 PUNFETCH; 5583 } 5584 break; 5585 #endif 5586 5587 case 'Q': 5588 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) { 5589 tok->type = TK_QUOTE_OPEN; 5590 } 5591 break; 5592 5593 case 'p': 5594 case 'P': 5595 if (!PEND && PPEEK_IS('{') && 5596 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { 5597 PINC; 5598 tok->type = TK_CHAR_PROPERTY; 5599 tok->u.prop.not = c == 'P'; 5600 5601 if (!PEND && 5602 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { 5603 PFETCH(c); 5604 if (c == '^') { 5605 tok->u.prop.not = tok->u.prop.not == 0; 5606 } 5607 else 5608 PUNFETCH; 5609 } 5610 } 5611 break; 5612 5613 default: 5614 { 5615 OnigCodePoint c2; 5616 5617 PUNFETCH; 5618 r = fetch_escaped_value(&p, end, env, &c2); 5619 if (r < 0) return r; 5620 if (tok->u.code != c2) { 5621 tok->type = TK_CODE_POINT; 5622 tok->u.code = c2; 5623 } 5624 else { /* string */ 5625 p = tok->backp + enclen(enc, tok->backp); 5626 
} 5627 } 5628 break; 5629 } 5630 } 5631 else { 5632 tok->u.code = c; 5633 tok->escaped = 0; 5634 5635 #ifdef USE_VARIABLE_META_CHARS 5636 if ((c != ONIG_INEFFECTIVE_META_CHAR) && 5637 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { 5638 if (c == MC_ANYCHAR(syn)) 5639 goto any_char; 5640 else if (c == MC_ANYTIME(syn)) 5641 goto any_time; 5642 else if (c == MC_ZERO_OR_ONE_TIME(syn)) 5643 goto zero_or_one_time; 5644 else if (c == MC_ONE_OR_MORE_TIME(syn)) 5645 goto one_or_more_time; 5646 else if (c == MC_ANYCHAR_ANYTIME(syn)) { 5647 tok->type = TK_ANYCHAR_ANYTIME; 5648 goto out; 5649 } 5650 } 5651 #endif 5652 5653 switch (c) { 5654 case '.': 5655 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break; 5656 #ifdef USE_VARIABLE_META_CHARS 5657 any_char: 5658 #endif 5659 tok->type = TK_ANYCHAR; 5660 break; 5661 5662 case '*': 5663 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break; 5664 #ifdef USE_VARIABLE_META_CHARS 5665 any_time: 5666 #endif 5667 tok->type = TK_REPEAT; 5668 tok->u.repeat.lower = 0; 5669 tok->u.repeat.upper = INFINITE_REPEAT; 5670 goto greedy_check; 5671 break; 5672 5673 case '+': 5674 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break; 5675 #ifdef USE_VARIABLE_META_CHARS 5676 one_or_more_time: 5677 #endif 5678 tok->type = TK_REPEAT; 5679 tok->u.repeat.lower = 1; 5680 tok->u.repeat.upper = INFINITE_REPEAT; 5681 goto greedy_check; 5682 break; 5683 5684 case '?': 5685 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break; 5686 #ifdef USE_VARIABLE_META_CHARS 5687 zero_or_one_time: 5688 #endif 5689 tok->type = TK_REPEAT; 5690 tok->u.repeat.lower = 0; 5691 tok->u.repeat.upper = 1; 5692 goto greedy_check; 5693 break; 5694 5695 case '{': 5696 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; 5697 r = fetch_interval(&p, end, tok, env); 5698 if (r < 0) return r; /* error */ 5699 if (r == 0) goto greedy_check2; 5700 else if (r == 2) { /* {n} */ 5701 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) 5702 goto possessive_check; 5703 5704 goto greedy_check2; 5705 } 5706 /* r == 1 : normal char */ 5707 break; 5708 5709 case '|': 5710 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break; 5711 tok->type = TK_ALT; 5712 break; 5713 5714 case '(': 5715 if (!PEND && PPEEK_IS('?') && 5716 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { 5717 PINC; 5718 if (! PEND) { 5719 c = PPEEK; 5720 if (c == '#') { 5721 PFETCH(c); 5722 while (1) { 5723 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 5724 PFETCH(c); 5725 if (c == MC_ESC(syn)) { 5726 if (! PEND) PFETCH(c); 5727 } 5728 else { 5729 if (c == ')') break; 5730 } 5731 } 5732 goto start; 5733 } 5734 else if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_PERL_SUBEXP_CALL)) { 5735 int gnum; 5736 UChar* name; 5737 UChar* name_end; 5738 enum REF_NUM num_type; 5739 5740 switch (c) { 5741 case '&': 5742 { 5743 PINC; 5744 name = p; 5745 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, 5746 &gnum, &num_type, FALSE); 5747 if (r < 0) return r; 5748 5749 tok->type = TK_CALL; 5750 tok->u.call.by_number = 0; 5751 tok->u.call.gnum = 0; 5752 tok->u.call.name = name; 5753 tok->u.call.name_end = name_end; 5754 } 5755 break; 5756 5757 case 'R': 5758 tok->type = TK_CALL; 5759 tok->u.call.by_number = 1; 5760 tok->u.call.gnum = 0; 5761 tok->u.call.name = p; 5762 PINC; 5763 if (! PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION; 5764 tok->u.call.name_end = p; 5765 break; 5766 5767 case '-': 5768 case '+': 5769 goto lparen_qmark_num; 5770 break; 5771 default: 5772 if (! 
ONIGENC_IS_CODE_DIGIT(enc, c)) goto lparen_qmark_end; 5773 5774 lparen_qmark_num: 5775 { 5776 name = p; 5777 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, 5778 &gnum, &num_type, TRUE); 5779 if (r < 0) return r; 5780 5781 if (num_type == IS_NOT_NUM) { 5782 return ONIGERR_INVALID_GROUP_NAME; 5783 } 5784 else { 5785 if (num_type == IS_REL_NUM) { 5786 gnum = backref_rel_to_abs(gnum, env); 5787 if (gnum < 0) { 5788 onig_scan_env_set_error_string(env, 5789 ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end); 5790 return ONIGERR_UNDEFINED_GROUP_REFERENCE; 5791 } 5792 } 5793 tok->u.call.by_number = 1; 5794 tok->u.call.gnum = gnum; 5795 } 5796 5797 tok->type = TK_CALL; 5798 tok->u.call.name = name; 5799 tok->u.call.name_end = name_end; 5800 } 5801 break; 5802 } 5803 } 5804 } 5805 lparen_qmark_end: 5806 PUNFETCH; 5807 } 5808 5809 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; 5810 tok->type = TK_SUBEXP_OPEN; 5811 break; 5812 5813 case ')': 5814 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; 5815 tok->type = TK_SUBEXP_CLOSE; 5816 break; 5817 5818 case '^': 5819 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; 5820 tok->type = TK_ANCHOR; 5821 tok->u.subtype = (OPTON_SINGLELINE(env->options) 5822 ? ANCR_BEGIN_BUF : ANCR_BEGIN_LINE); 5823 break; 5824 5825 case '$': 5826 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; 5827 tok->type = TK_ANCHOR; 5828 tok->u.subtype = (OPTON_SINGLELINE(env->options) 5829 ? ANCR_SEMI_END_BUF : ANCR_END_LINE); 5830 break; 5831 5832 case '[': 5833 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break; 5834 tok->type = TK_OPEN_CC; 5835 break; 5836 5837 case ']': 5838 if (*src > env->pattern) /* /].../ is allowed. 
*/ 5839 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); 5840 break; 5841 5842 case '#': 5843 if (OPTON_EXTEND(env->options)) { 5844 while (!PEND) { 5845 PFETCH(c); 5846 if (ONIGENC_IS_CODE_NEWLINE(enc, c)) 5847 break; 5848 } 5849 goto start; 5850 break; 5851 } 5852 break; 5853 5854 case ' ': case '\t': case '\n': case '\r': case '\f': 5855 if (OPTON_EXTEND(env->options)) 5856 goto start; 5857 break; 5858 5859 default: 5860 /* string */ 5861 break; 5862 } 5863 } 5864 5865 #ifdef USE_VARIABLE_META_CHARS 5866 out: 5867 #endif 5868 *src = p; 5869 return tok->type; 5870 } 5871 5872 static int 5873 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not, 5874 OnigEncoding enc ARG_UNUSED, OnigCodePoint sb_out, 5875 const OnigCodePoint mbr[]) 5876 { 5877 int i, r; 5878 OnigCodePoint j; 5879 5880 int n = ONIGENC_CODE_RANGE_NUM(mbr); 5881 5882 if (not == 0) { 5883 for (i = 0; i < n; i++) { 5884 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i); 5885 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) { 5886 if (j >= sb_out) { 5887 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) { 5888 r = add_code_range_to_buf(&(cc->mbuf), j, 5889 ONIGENC_CODE_RANGE_TO(mbr, i)); 5890 if (r != 0) return r; 5891 i++; 5892 } 5893 5894 goto sb_end; 5895 } 5896 BITSET_SET_BIT(cc->bs, j); 5897 } 5898 } 5899 5900 sb_end: 5901 for ( ; i < n; i++) { 5902 r = add_code_range_to_buf(&(cc->mbuf), 5903 ONIGENC_CODE_RANGE_FROM(mbr, i), 5904 ONIGENC_CODE_RANGE_TO(mbr, i)); 5905 if (r != 0) return r; 5906 } 5907 } 5908 else { 5909 OnigCodePoint prev = 0; 5910 5911 for (i = 0; i < n; i++) { 5912 for (j = prev; j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) { 5913 if (j >= sb_out) { 5914 goto sb_end2; 5915 } 5916 BITSET_SET_BIT(cc->bs, j); 5917 } 5918 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; 5919 } 5920 for (j = prev; j < sb_out; j++) { 5921 BITSET_SET_BIT(cc->bs, j); 5922 } 5923 5924 sb_end2: 5925 prev = sb_out; 5926 5927 for (i = 0; i < n; i++) { 5928 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) { 5929 r = 
add_code_range_to_buf(&(cc->mbuf), prev, 5930 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1); 5931 if (r != 0) return r; 5932 } 5933 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; 5934 if (prev == 0) goto end; 5935 } 5936 5937 r = add_code_range_to_buf(&(cc->mbuf), prev, MAX_CODE_POINT); 5938 if (r != 0) return r; 5939 } 5940 5941 end: 5942 return 0; 5943 } 5944 5945 static int 5946 add_ctype_to_cc_by_range_limit(CClassNode* cc, int ctype ARG_UNUSED, int not, 5947 OnigEncoding enc ARG_UNUSED, 5948 OnigCodePoint sb_out, 5949 const OnigCodePoint mbr[], OnigCodePoint limit) 5950 { 5951 int i, r; 5952 OnigCodePoint j; 5953 OnigCodePoint from; 5954 OnigCodePoint to; 5955 5956 int n = ONIGENC_CODE_RANGE_NUM(mbr); 5957 5958 if (not == 0) { 5959 for (i = 0; i < n; i++) { 5960 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i); 5961 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) { 5962 if (j > limit) goto end; 5963 if (j >= sb_out) { 5964 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) { 5965 to = ONIGENC_CODE_RANGE_TO(mbr, i); 5966 if (to > limit) to = limit; 5967 r = add_code_range_to_buf(&(cc->mbuf), j, to); 5968 if (r != 0) return r; 5969 i++; 5970 } 5971 5972 goto sb_end; 5973 } 5974 BITSET_SET_BIT(cc->bs, j); 5975 } 5976 } 5977 5978 sb_end: 5979 for ( ; i < n; i++) { 5980 from = ONIGENC_CODE_RANGE_FROM(mbr, i); 5981 to = ONIGENC_CODE_RANGE_TO(mbr, i); 5982 if (from > limit) break; 5983 if (to > limit) to = limit; 5984 r = add_code_range_to_buf(&(cc->mbuf), from, to); 5985 if (r != 0) return r; 5986 } 5987 } 5988 else { 5989 OnigCodePoint prev = 0; 5990 5991 for (i = 0; i < n; i++) { 5992 from = ONIGENC_CODE_RANGE_FROM(mbr, i); 5993 if (from > limit) { 5994 for (j = prev; j < sb_out; j++) { 5995 BITSET_SET_BIT(cc->bs, j); 5996 } 5997 goto sb_end2; 5998 } 5999 for (j = prev; j < from; j++) { 6000 if (j >= sb_out) goto sb_end2; 6001 BITSET_SET_BIT(cc->bs, j); 6002 } 6003 prev = ONIGENC_CODE_RANGE_TO(mbr, i); 6004 if (prev > limit) prev = limit; 6005 prev++; 6006 if (prev == 0) goto end; 6007 } 6008 for (j = 
prev; j < sb_out; j++) { 6009 BITSET_SET_BIT(cc->bs, j); 6010 } 6011 6012 sb_end2: 6013 prev = sb_out; 6014 6015 for (i = 0; i < n; i++) { 6016 from = ONIGENC_CODE_RANGE_FROM(mbr, i); 6017 if (from > limit) goto last; 6018 6019 if (prev < from) { 6020 r = add_code_range_to_buf(&(cc->mbuf), prev, from - 1); 6021 if (r != 0) return r; 6022 } 6023 prev = ONIGENC_CODE_RANGE_TO(mbr, i); 6024 if (prev > limit) prev = limit; 6025 prev++; 6026 if (prev == 0) goto end; 6027 } 6028 6029 last: 6030 r = add_code_range_to_buf(&(cc->mbuf), prev, MAX_CODE_POINT); 6031 if (r != 0) return r; 6032 } 6033 6034 end: 6035 return 0; 6036 } 6037 6038 static int 6039 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) 6040 { 6041 int c, r; 6042 int ascii_mode; 6043 int is_single; 6044 const OnigCodePoint *ranges; 6045 OnigCodePoint limit; 6046 OnigCodePoint sb_out; 6047 OnigEncoding enc = env->enc; 6048 6049 ascii_mode = OPTON_IS_ASCII_MODE_CTYPE(ctype, env->options); 6050 6051 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges); 6052 if (r == 0) { 6053 if (ascii_mode == 0) 6054 r = add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges); 6055 else 6056 r = add_ctype_to_cc_by_range_limit(cc, ctype, not, env->enc, sb_out, 6057 ranges, ASCII_LIMIT); 6058 return r; 6059 } 6060 else if (r != ONIG_NO_SUPPORT_CONFIG) { 6061 return r; 6062 } 6063 6064 r = 0; 6065 is_single = ONIGENC_IS_SINGLEBYTE(enc); 6066 limit = ascii_mode ? ASCII_LIMIT : SINGLE_BYTE_SIZE; 6067 6068 switch (ctype) { 6069 case ONIGENC_CTYPE_ALPHA: 6070 case ONIGENC_CTYPE_BLANK: 6071 case ONIGENC_CTYPE_CNTRL: 6072 case ONIGENC_CTYPE_DIGIT: 6073 case ONIGENC_CTYPE_LOWER: 6074 case ONIGENC_CTYPE_PUNCT: 6075 case ONIGENC_CTYPE_SPACE: 6076 case ONIGENC_CTYPE_UPPER: 6077 case ONIGENC_CTYPE_XDIGIT: 6078 case ONIGENC_CTYPE_ASCII: 6079 case ONIGENC_CTYPE_ALNUM: 6080 if (not != 0) { 6081 for (c = 0; c < (int )limit; c++) { 6082 if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) { 6083 if (! 
ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) 6084 BITSET_SET_BIT(cc->bs, c); 6085 } 6086 } 6087 for (c = limit; c < SINGLE_BYTE_SIZE; c++) { 6088 if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) 6089 BITSET_SET_BIT(cc->bs, c); 6090 } 6091 6092 if (is_single == 0) 6093 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); 6094 } 6095 else { 6096 for (c = 0; c < (int )limit; c++) { 6097 if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) { 6098 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) 6099 BITSET_SET_BIT(cc->bs, c); 6100 } 6101 } 6102 } 6103 break; 6104 6105 case ONIGENC_CTYPE_GRAPH: 6106 case ONIGENC_CTYPE_PRINT: 6107 case ONIGENC_CTYPE_WORD: 6108 if (not != 0) { 6109 for (c = 0; c < (int )limit; c++) { 6110 /* check invalid code point */ 6111 if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) 6112 && ! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) 6113 BITSET_SET_BIT(cc->bs, c); 6114 } 6115 for (c = limit; c < SINGLE_BYTE_SIZE; c++) { 6116 if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) 6117 BITSET_SET_BIT(cc->bs, c); 6118 } 6119 if (ascii_mode != 0 && is_single == 0) 6120 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); 6121 } 6122 else { 6123 for (c = 0; c < (int )limit; c++) { 6124 if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) 6125 && ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) 6126 BITSET_SET_BIT(cc->bs, c); 6127 } 6128 if (ascii_mode == 0 && is_single == 0) 6129 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); 6130 } 6131 break; 6132 6133 default: 6134 return ONIGERR_PARSER_BUG; 6135 break; 6136 } 6137 6138 return r; 6139 } 6140 6141 static int 6142 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) 6143 { 6144 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20 6145 #define POSIX_BRACKET_NAME_MIN_LEN 4 6146 6147 static PosixBracketEntryType PBS[] = { 6148 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 }, 6149 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 }, 6150 { (UChar* )"blank", 
ONIGENC_CTYPE_BLANK, 5 }, 6151 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 }, 6152 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 }, 6153 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 }, 6154 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 }, 6155 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 }, 6156 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 }, 6157 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 }, 6158 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 }, 6159 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 }, 6160 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 }, 6161 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 }, 6162 { (UChar* )NULL, -1, 0 } 6163 }; 6164 6165 PosixBracketEntryType *pb; 6166 int not, i, r; 6167 OnigCodePoint c; 6168 OnigEncoding enc = env->enc; 6169 UChar *p = *src; 6170 6171 if (PPEEK_IS('^')) { 6172 PINC_S; 6173 not = 1; 6174 } 6175 else 6176 not = 0; 6177 6178 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3) 6179 goto not_posix_bracket; 6180 6181 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { 6182 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) { 6183 p = (UChar* )onigenc_step(enc, p, end, pb->len); 6184 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0) 6185 return ONIGERR_INVALID_POSIX_BRACKET_TYPE; 6186 6187 r = add_ctype_to_cc(cc, pb->ctype, not, env); 6188 if (r != 0) return r; 6189 6190 PINC_S; PINC_S; 6191 *src = p; 6192 return 0; 6193 } 6194 } 6195 6196 not_posix_bracket: 6197 c = 0; 6198 i = 0; 6199 while (!PEND && ((c = PPEEK) != ':') && c != ']') { 6200 PINC_S; 6201 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; 6202 } 6203 if (c == ':' && ! PEND) { 6204 PINC_S; 6205 if (! PEND) { 6206 PFETCH_S(c); 6207 if (c == ']') 6208 return ONIGERR_INVALID_POSIX_BRACKET_TYPE; 6209 } 6210 } 6211 6212 return 1; /* 1: is not POSIX bracket, but no error. 
*/ 6213 } 6214 6215 static int 6216 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) 6217 { 6218 int r; 6219 OnigCodePoint c; 6220 OnigEncoding enc; 6221 UChar *prev, *start, *p; 6222 6223 p = *src; 6224 enc = env->enc; 6225 r = ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; 6226 start = prev = p; 6227 6228 while (!PEND) { 6229 prev = p; 6230 PFETCH_S(c); 6231 if (c == '}') { 6232 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev); 6233 if (r >= 0) { 6234 *src = p; 6235 } 6236 else { 6237 onig_scan_env_set_error_string(env, r, *src, prev); 6238 } 6239 6240 return r; 6241 } 6242 else if (c == '(' || c == ')' || c == '{' || c == '|') { 6243 break; 6244 } 6245 } 6246 6247 return r; 6248 } 6249 6250 static int 6251 parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) 6252 { 6253 int r, ctype; 6254 CClassNode* cc; 6255 6256 ctype = fetch_char_property_to_ctype(src, end, env); 6257 if (ctype < 0) return ctype; 6258 6259 *np = node_new_cclass(); 6260 CHECK_NULL_RETURN_MEMERR(*np); 6261 cc = CCLASS_(*np); 6262 r = add_ctype_to_cc(cc, ctype, FALSE, env); 6263 if (r != 0) return r; 6264 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); 6265 6266 return 0; 6267 } 6268 6269 6270 typedef enum { 6271 CS_VALUE, 6272 CS_RANGE, 6273 CS_COMPLETE, 6274 CS_START 6275 } CSTATE; 6276 6277 typedef enum { 6278 CV_UNDEF, 6279 CV_SB, 6280 CV_MB, 6281 CV_CPROP 6282 } CVAL; 6283 6284 static int 6285 cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state, 6286 ScanEnv* env) 6287 { 6288 int r; 6289 6290 if (*state == CS_RANGE) 6291 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE; 6292 6293 if (*state == CS_VALUE) { 6294 if (*val == CV_SB) 6295 BITSET_SET_BIT(cc->bs, (int )(*pcode)); 6296 else if (*val == CV_MB) { 6297 r = add_code_range(&(cc->mbuf), env, *pcode, *pcode); 6298 if (r < 0) return r; 6299 } 6300 } 6301 6302 *state = CS_VALUE; 6303 *val = CV_CPROP; 6304 return 0; 6305 } 6306 6307 static int 6308 
cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, 6309 int* from_raw, int to_raw, CVAL intype, CVAL* type, 6310 CSTATE* state, ScanEnv* env) 6311 { 6312 int r; 6313 6314 switch (*state) { 6315 case CS_VALUE: 6316 if (*type == CV_SB) { 6317 if (*from > 0xff) 6318 return ONIGERR_INVALID_CODE_POINT_VALUE; 6319 6320 BITSET_SET_BIT(cc->bs, (int )(*from)); 6321 } 6322 else if (*type == CV_MB) { 6323 r = add_code_range(&(cc->mbuf), env, *from, *from); 6324 if (r < 0) return r; 6325 } 6326 break; 6327 6328 case CS_RANGE: 6329 if (intype == *type) { 6330 if (intype == CV_SB) { 6331 if (*from > 0xff || to > 0xff) 6332 return ONIGERR_INVALID_CODE_POINT_VALUE; 6333 6334 if (*from > to) { 6335 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) 6336 goto ccs_range_end; 6337 else 6338 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; 6339 } 6340 bitset_set_range(cc->bs, (int )*from, (int )to); 6341 } 6342 else { 6343 r = add_code_range(&(cc->mbuf), env, *from, to); 6344 if (r < 0) return r; 6345 } 6346 } 6347 else { 6348 if (*from > to) { 6349 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) 6350 goto ccs_range_end; 6351 else 6352 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; 6353 } 6354 bitset_set_range(cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff)); 6355 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*from, to); 6356 if (r < 0) return r; 6357 } 6358 ccs_range_end: 6359 *state = CS_COMPLETE; 6360 break; 6361 6362 case CS_COMPLETE: 6363 case CS_START: 6364 *state = CS_VALUE; 6365 break; 6366 6367 default: 6368 break; 6369 } 6370 6371 *from_raw = to_raw; 6372 *from = to; 6373 *type = intype; 6374 return 0; 6375 } 6376 6377 static int 6378 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, 6379 ScanEnv* env) 6380 { 6381 int in_esc; 6382 OnigCodePoint code; 6383 OnigEncoding enc = env->enc; 6384 UChar* p = from; 6385 6386 in_esc = 0; 6387 while (! 
PEND) { 6388 if (ignore_escaped && in_esc) { 6389 in_esc = 0; 6390 } 6391 else { 6392 PFETCH_S(code); 6393 if (code == c) return 1; 6394 if (code == MC_ESC(env->syntax)) in_esc = 1; 6395 } 6396 } 6397 return 0; 6398 } 6399 6400 static int 6401 parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) 6402 { 6403 int r, neg, len, fetched, and_start; 6404 OnigCodePoint in_code, curr_code; 6405 UChar *p; 6406 Node* node; 6407 CClassNode *cc, *prev_cc; 6408 CClassNode work_cc; 6409 int curr_raw, in_raw; 6410 CSTATE state; 6411 CVAL in_type; 6412 CVAL curr_type; 6413 6414 *np = NULL_NODE; 6415 INC_PARSE_DEPTH(env->parse_depth); 6416 6417 prev_cc = (CClassNode* )NULL; 6418 r = fetch_token_in_cc(tok, src, end, env); 6419 if (r == TK_CHAR && tok->u.code == (OnigCodePoint )'^' && tok->escaped == 0) { 6420 neg = 1; 6421 r = fetch_token_in_cc(tok, src, end, env); 6422 } 6423 else { 6424 neg = 0; 6425 } 6426 6427 if (r < 0) return r; 6428 if (r == TK_CC_CLOSE) { 6429 if (! code_exist_check((OnigCodePoint )']', 6430 *src, env->pattern_end, 1, env)) 6431 return ONIGERR_EMPTY_CHAR_CLASS; 6432 6433 CC_ESC_WARN(env, (UChar* )"]"); 6434 r = tok->type = TK_CHAR; /* allow []...] */ 6435 } 6436 6437 *np = node = node_new_cclass(); 6438 CHECK_NULL_RETURN_MEMERR(node); 6439 cc = CCLASS_(node); 6440 6441 and_start = 0; 6442 state = CS_START; 6443 curr_type = CV_UNDEF; 6444 6445 p = *src; 6446 while (r != TK_CC_CLOSE) { 6447 fetched = 0; 6448 switch (r) { 6449 case TK_CHAR: 6450 any_char_in: 6451 len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code); 6452 if (len < 0) { 6453 r = len; 6454 goto err; 6455 } 6456 in_type = (len == 1) ? CV_SB : CV_MB; 6457 in_code = tok->u.code; 6458 in_raw = 0; 6459 goto val_entry2; 6460 break; 6461 6462 case TK_CRUDE_BYTE: 6463 /* tok->base != 0 : octal or hexadec. */ 6464 if (! 
ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { 6465 int i, j; 6466 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; 6467 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; 6468 UChar* psave = p; 6469 int base = tok->base; 6470 6471 buf[0] = tok->u.byte; 6472 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { 6473 r = fetch_token_in_cc(tok, &p, end, env); 6474 if (r < 0) goto err; 6475 if (r != TK_CRUDE_BYTE || tok->base != base) { 6476 fetched = 1; 6477 break; 6478 } 6479 buf[i] = tok->u.byte; 6480 } 6481 6482 if (i < ONIGENC_MBC_MINLEN(env->enc)) { 6483 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; 6484 goto err; 6485 } 6486 6487 /* clear buf tail */ 6488 for (j = i; j < ONIGENC_CODE_TO_MBC_MAXLEN; j++) buf[j] = '\0'; 6489 6490 len = enclen(env->enc, buf); 6491 if (i < len) { 6492 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; 6493 goto err; 6494 } 6495 else if (i > len) { /* fetch back */ 6496 p = psave; 6497 for (i = 1; i < len; i++) { 6498 r = fetch_token_in_cc(tok, &p, end, env); 6499 } 6500 fetched = 0; 6501 } 6502 6503 if (i == 1) { 6504 in_code = (OnigCodePoint )buf[0]; 6505 goto crude_single; 6506 } 6507 else { 6508 in_code = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); 6509 in_type = CV_MB; 6510 } 6511 } 6512 else { 6513 in_code = (OnigCodePoint )tok->u.byte; 6514 crude_single: 6515 in_type = CV_SB; 6516 } 6517 in_raw = 1; 6518 goto val_entry2; 6519 break; 6520 6521 case TK_CODE_POINT: 6522 in_code = tok->u.code; 6523 in_raw = 1; 6524 val_entry: 6525 len = ONIGENC_CODE_TO_MBCLEN(env->enc, in_code); 6526 if (len < 0) { 6527 if (state != CS_RANGE || 6528 ! IS_SYNTAX_BV(env->syntax, 6529 ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) || 6530 in_code < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { 6531 r = len; 6532 goto err; 6533 } 6534 } 6535 in_type = (len == 1 ? 
CV_SB : CV_MB); 6536 val_entry2: 6537 r = cc_char_next(cc, &curr_code, in_code, &curr_raw, in_raw, in_type, 6538 &curr_type, &state, env); 6539 if (r != 0) goto err; 6540 break; 6541 6542 case TK_CC_POSIX_BRACKET_OPEN: 6543 r = parse_posix_bracket(cc, &p, end, env); 6544 if (r < 0) goto err; 6545 if (r == 1) { /* is not POSIX bracket */ 6546 CC_ESC_WARN(env, (UChar* )"["); 6547 p = tok->backp; 6548 in_code = tok->u.code; 6549 in_raw = 0; 6550 goto val_entry; 6551 } 6552 goto next_cprop; 6553 break; 6554 6555 case TK_CHAR_TYPE: 6556 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env); 6557 if (r != 0) goto err; 6558 6559 next_cprop: 6560 r = cc_cprop_next(cc, &curr_code, &curr_type, &state, env); 6561 if (r != 0) goto err; 6562 break; 6563 6564 case TK_CHAR_PROPERTY: 6565 { 6566 int ctype = fetch_char_property_to_ctype(&p, end, env); 6567 if (ctype < 0) { 6568 r = ctype; 6569 goto err; 6570 } 6571 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env); 6572 if (r != 0) goto err; 6573 goto next_cprop; 6574 } 6575 break; 6576 6577 case TK_CC_RANGE: 6578 if (state == CS_VALUE) { 6579 r = fetch_token_in_cc(tok, &p, end, env); 6580 if (r < 0) goto err; 6581 6582 fetched = 1; 6583 if (r == TK_CC_CLOSE) { /* allow [x-] */ 6584 range_end_val: 6585 in_code = (OnigCodePoint )'-'; 6586 in_raw = 0; 6587 goto val_entry; 6588 } 6589 else if (r == TK_CC_AND) { 6590 CC_ESC_WARN(env, (UChar* )"-"); 6591 goto range_end_val; 6592 } 6593 6594 if (curr_type == CV_CPROP) { 6595 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; 6596 goto err; 6597 } 6598 6599 state = CS_RANGE; 6600 } 6601 else if (state == CS_START) { 6602 /* [-xa] is allowed */ 6603 in_code = tok->u.code; 6604 in_raw = 0; 6605 6606 r = fetch_token_in_cc(tok, &p, end, env); 6607 if (r < 0) goto err; 6608 6609 fetched = 1; 6610 /* [--x] or [a&&-x] is warned. 
*/ 6611 if (r == TK_CC_RANGE || and_start != 0) 6612 CC_ESC_WARN(env, (UChar* )"-"); 6613 6614 goto val_entry; 6615 } 6616 else if (state == CS_RANGE) { 6617 CC_ESC_WARN(env, (UChar* )"-"); 6618 goto any_char_in; /* [!--] is allowed */ 6619 } 6620 else { /* CS_COMPLETE */ 6621 r = fetch_token_in_cc(tok, &p, end, env); 6622 if (r < 0) goto err; 6623 6624 fetched = 1; 6625 if (r == TK_CC_CLOSE) 6626 goto range_end_val; /* allow [a-b-] */ 6627 else if (r == TK_CC_AND) { 6628 CC_ESC_WARN(env, (UChar* )"-"); 6629 goto range_end_val; 6630 } 6631 6632 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) { 6633 CC_ESC_WARN(env, (UChar* )"-"); 6634 goto range_end_val; /* [0-9-a] is allowed as [0-9\-a] */ 6635 } 6636 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; 6637 goto err; 6638 } 6639 break; 6640 6641 case TK_CC_OPEN_CC: /* [ */ 6642 { 6643 Node *anode; 6644 CClassNode* acc; 6645 6646 if (state == CS_VALUE) { 6647 r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, 6648 &state, env); 6649 if (r != 0) goto err; 6650 } 6651 state = CS_COMPLETE; 6652 6653 r = parse_cc(&anode, tok, &p, end, env); 6654 if (r != 0) { 6655 onig_node_free(anode); 6656 goto cc_open_err; 6657 } 6658 acc = CCLASS_(anode); 6659 r = or_cclass(cc, acc, env->enc); 6660 onig_node_free(anode); 6661 6662 cc_open_err: 6663 if (r != 0) goto err; 6664 } 6665 break; 6666 6667 case TK_CC_AND: /* && */ 6668 { 6669 if (state == CS_VALUE) { 6670 r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, 6671 &state, env); 6672 if (r != 0) goto err; 6673 } 6674 /* initialize local variables */ 6675 and_start = 1; 6676 state = CS_START; 6677 6678 if (IS_NOT_NULL(prev_cc)) { 6679 r = and_cclass(prev_cc, cc, env->enc); 6680 if (r != 0) goto err; 6681 bbuf_free(cc->mbuf); 6682 } 6683 else { 6684 prev_cc = cc; 6685 cc = &work_cc; 6686 } 6687 initialize_cclass(cc); 6688 } 6689 break; 6690 6691 case TK_EOT: 6692 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS; 6693 goto 
err; 6694 break; 6695 default: 6696 r = ONIGERR_PARSER_BUG; 6697 goto err; 6698 break; 6699 } 6700 6701 if (fetched) 6702 r = tok->type; 6703 else { 6704 r = fetch_token_in_cc(tok, &p, end, env); 6705 if (r < 0) goto err; 6706 } 6707 } 6708 6709 if (state == CS_VALUE) { 6710 r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, 6711 &state, env); 6712 if (r != 0) goto err; 6713 } 6714 6715 if (IS_NOT_NULL(prev_cc)) { 6716 r = and_cclass(prev_cc, cc, env->enc); 6717 if (r != 0) goto err; 6718 bbuf_free(cc->mbuf); 6719 cc = prev_cc; 6720 } 6721 6722 if (neg != 0) 6723 NCCLASS_SET_NOT(cc); 6724 else 6725 NCCLASS_CLEAR_NOT(cc); 6726 if (IS_NCCLASS_NOT(cc) && 6727 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) { 6728 int is_empty = (IS_NULL(cc->mbuf) ? 1 : 0); 6729 if (is_empty != 0) 6730 BITSET_IS_EMPTY(cc->bs, is_empty); 6731 6732 if (is_empty == 0) { 6733 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { 6734 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) 6735 BITSET_SET_BIT(cc->bs, NEWLINE_CODE); 6736 else 6737 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); 6738 } 6739 } 6740 } 6741 *src = p; 6742 DEC_PARSE_DEPTH(env->parse_depth); 6743 return 0; 6744 6745 err: 6746 if (cc != CCLASS_(*np)) 6747 bbuf_free(cc->mbuf); 6748 return r; 6749 } 6750 6751 static int parse_alts(Node** top, PToken* tok, int term, 6752 UChar** src, UChar* end, ScanEnv* env, int group_head); 6753 6754 #ifdef USE_CALLOUT 6755 6756 /* (?{...}[tag][+-]) (?{{...}}[tag][+-]) */ 6757 static int 6758 parse_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) 6759 { 6760 int r; 6761 int i; 6762 int in; 6763 int num; 6764 OnigCodePoint c; 6765 UChar* code_start; 6766 UChar* code_end; 6767 UChar* contents; 6768 UChar* tag_start; 6769 UChar* tag_end; 6770 int brace_nest; 6771 CalloutListEntry* e; 6772 RegexExt* ext; 6773 OnigEncoding enc = env->enc; 6774 UChar* p = *src; 6775 6776 if (PEND) return 
ONIGERR_INVALID_CALLOUT_PATTERN; 6777 6778 brace_nest = 0; 6779 while (PPEEK_IS('{')) { 6780 brace_nest++; 6781 PINC_S; 6782 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; 6783 } 6784 6785 in = ONIG_CALLOUT_IN_PROGRESS; 6786 code_start = p; 6787 while (1) { 6788 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; 6789 6790 code_end = p; 6791 PFETCH_S(c); 6792 if (c == '}') { 6793 i = brace_nest; 6794 while (i > 0) { 6795 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; 6796 PFETCH_S(c); 6797 if (c == '}') i--; 6798 else break; 6799 } 6800 if (i == 0) break; 6801 } 6802 } 6803 6804 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 6805 6806 PFETCH_S(c); 6807 if (c == '[') { 6808 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 6809 tag_end = tag_start = p; 6810 while (! PEND) { 6811 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 6812 tag_end = p; 6813 PFETCH_S(c); 6814 if (c == ']') break; 6815 } 6816 if (! is_allowed_callout_tag_name(enc, tag_start, tag_end)) 6817 return ONIGERR_INVALID_CALLOUT_TAG_NAME; 6818 6819 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 6820 PFETCH_S(c); 6821 } 6822 else { 6823 tag_start = tag_end = 0; 6824 } 6825 6826 if (c == 'X') { 6827 in |= ONIG_CALLOUT_IN_RETRACTION; 6828 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 6829 PFETCH_S(c); 6830 } 6831 else if (c == '<') { 6832 in = ONIG_CALLOUT_IN_RETRACTION; 6833 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 6834 PFETCH_S(c); 6835 } 6836 else if (c == '>') { /* no needs (default) */ 6837 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 6838 PFETCH_S(c); 6839 } 6840 6841 if (c != cterm) 6842 return ONIGERR_INVALID_CALLOUT_PATTERN; 6843 6844 r = reg_callout_list_entry(env, &num); 6845 if (r != 0) return r; 6846 6847 ext = onig_get_regex_ext(env->reg); 6848 CHECK_NULL_RETURN_MEMERR(ext); 6849 if (IS_NULL(ext->pattern)) { 6850 r = onig_ext_set_pattern(env->reg, env->pattern, env->pattern_end); 6851 if (r != ONIG_NORMAL) return r; 6852 } 6853 6854 if (tag_start != tag_end) { 6855 r = 
callout_tag_entry(env, env->reg, tag_start, tag_end, num); 6856 if (r != ONIG_NORMAL) return r; 6857 } 6858 6859 contents = onigenc_strdup(enc, code_start, code_end); 6860 CHECK_NULL_RETURN_MEMERR(contents); 6861 6862 r = node_new_callout(np, ONIG_CALLOUT_OF_CONTENTS, num, ONIG_NON_NAME_ID, env); 6863 if (r != 0) { 6864 xfree(contents); 6865 return r; 6866 } 6867 6868 e = onig_reg_callout_list_at(env->reg, num); 6869 if (IS_NULL(e)) { 6870 xfree(contents); 6871 return ONIGERR_MEMORY; 6872 } 6873 6874 e->of = ONIG_CALLOUT_OF_CONTENTS; 6875 e->in = in; 6876 e->name_id = ONIG_NON_NAME_ID; 6877 e->u.content.start = contents; 6878 e->u.content.end = contents + (code_end - code_start); 6879 6880 *src = p; 6881 return 0; 6882 } 6883 6884 static long 6885 parse_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* rl) 6886 { 6887 long v; 6888 long d; 6889 int flag; 6890 UChar* p; 6891 OnigCodePoint c; 6892 6893 if (s >= end) return ONIGERR_INVALID_CALLOUT_ARG; 6894 6895 flag = 1; 6896 v = 0; 6897 p = s; 6898 while (p < end) { 6899 c = ONIGENC_MBC_TO_CODE(enc, p, end); 6900 p += ONIGENC_MBC_ENC_LEN(enc, p); 6901 if (c >= '0' && c <= '9') { 6902 d = (long )(c - '0'); 6903 if (v > (max - d) / 10) 6904 return ONIGERR_INVALID_CALLOUT_ARG; 6905 6906 v = v * 10 + d; 6907 } 6908 else if (sign_on != 0 && (c == '-' || c == '+')) { 6909 if (c == '-') flag = -1; 6910 } 6911 else 6912 return ONIGERR_INVALID_CALLOUT_ARG; 6913 6914 sign_on = 0; 6915 } 6916 6917 *rl = flag * v; 6918 return ONIG_NORMAL; 6919 } 6920 6921 static int 6922 parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, 6923 int max_arg_num, unsigned int types[], OnigValue vals[], 6924 ScanEnv* env) 6925 { 6926 #define MAX_CALLOUT_ARG_BYTE_LENGTH 128 6927 6928 int r; 6929 int n; 6930 int esc; 6931 int cn; 6932 UChar* s; 6933 UChar* e; 6934 UChar* eesc; 6935 OnigCodePoint c; 6936 UChar* bufend; 6937 UChar buf[MAX_CALLOUT_ARG_BYTE_LENGTH]; 6938 OnigEncoding enc = env->enc; 6939 UChar* p 
= *src; 6940 6941 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; 6942 6943 c = 0; 6944 n = 0; 6945 while (n < ONIG_CALLOUT_MAX_ARGS_NUM) { 6946 cn = 0; 6947 esc = 0; 6948 eesc = 0; 6949 bufend = buf; 6950 s = e = p; 6951 while (1) { 6952 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; 6953 6954 e = p; 6955 PFETCH_S(c); 6956 if (esc != 0) { 6957 esc = 0; 6958 if (c == '\\' || c == cterm || c == ',') { 6959 /* */ 6960 } 6961 else { 6962 e = eesc; 6963 cn++; 6964 } 6965 goto add_char; 6966 } 6967 else { 6968 if (c == '\\') { 6969 esc = 1; 6970 eesc = e; 6971 } 6972 else if (c == cterm || c == ',') 6973 break; 6974 else { 6975 size_t clen; 6976 6977 add_char: 6978 if (skip_mode == FALSE) { 6979 clen = p - e; 6980 if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH) 6981 return ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */ 6982 6983 xmemcpy(bufend, e, clen); 6984 bufend += clen; 6985 } 6986 cn++; 6987 } 6988 } 6989 } 6990 6991 if (cn != 0) { 6992 if (max_arg_num >= 0 && n >= max_arg_num) 6993 return ONIGERR_INVALID_CALLOUT_ARG; 6994 6995 if (skip_mode == FALSE) { 6996 if ((types[n] & ONIG_TYPE_LONG) != 0) { 6997 int fixed = 0; 6998 if (cn > 0) { 6999 long rl; 7000 r = parse_long(enc, buf, bufend, 1, LONG_MAX, &rl); 7001 if (r == ONIG_NORMAL) { 7002 vals[n].l = rl; 7003 fixed = 1; 7004 types[n] = ONIG_TYPE_LONG; 7005 } 7006 } 7007 7008 if (fixed == 0) { 7009 types[n] = (types[n] & ~ONIG_TYPE_LONG); 7010 if (types[n] == ONIG_TYPE_VOID) 7011 return ONIGERR_INVALID_CALLOUT_ARG; 7012 } 7013 } 7014 7015 switch (types[n]) { 7016 case ONIG_TYPE_LONG: 7017 break; 7018 7019 case ONIG_TYPE_CHAR: 7020 if (cn != 1) return ONIGERR_INVALID_CALLOUT_ARG; 7021 vals[n].c = ONIGENC_MBC_TO_CODE(enc, buf, bufend); 7022 break; 7023 7024 case ONIG_TYPE_STRING: 7025 { 7026 UChar* rs = onigenc_strdup(enc, buf, bufend); 7027 CHECK_NULL_RETURN_MEMERR(rs); 7028 vals[n].s.start = rs; 7029 vals[n].s.end = rs + (e - s); 7030 } 7031 break; 7032 7033 case ONIG_TYPE_TAG: 7034 if (eesc != 0 
|| ! is_allowed_callout_tag_name(enc, s, e)) 7035 return ONIGERR_INVALID_CALLOUT_TAG_NAME; 7036 7037 vals[n].s.start = s; 7038 vals[n].s.end = e; 7039 break; 7040 7041 case ONIG_TYPE_VOID: 7042 case ONIG_TYPE_POINTER: 7043 return ONIGERR_PARSER_BUG; 7044 break; 7045 } 7046 } 7047 7048 n++; 7049 } 7050 7051 if (c == cterm) break; 7052 } 7053 7054 if (c != cterm) return ONIGERR_INVALID_CALLOUT_PATTERN; 7055 7056 *src = p; 7057 return n; 7058 } 7059 7060 /* (*name[TAG]) (*name[TAG]{a,b,..}) */ 7061 static int 7062 parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) 7063 { 7064 int r; 7065 int i; 7066 int in; 7067 int num; 7068 int name_id; 7069 int arg_num; 7070 int max_arg_num; 7071 int opt_arg_num; 7072 int is_not_single; 7073 OnigCodePoint c; 7074 UChar* name_start; 7075 UChar* name_end; 7076 UChar* tag_start; 7077 UChar* tag_end; 7078 Node* node; 7079 CalloutListEntry* e; 7080 RegexExt* ext; 7081 unsigned int types[ONIG_CALLOUT_MAX_ARGS_NUM]; 7082 OnigValue vals[ONIG_CALLOUT_MAX_ARGS_NUM]; 7083 OnigEncoding enc = env->enc; 7084 UChar* p = *src; 7085 7086 /* PFETCH_READY; */ 7087 if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; 7088 7089 node = 0; 7090 name_start = p; 7091 while (1) { 7092 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7093 name_end = p; 7094 PFETCH_S(c); 7095 if (c == cterm || c == '[' || c == '{') break; 7096 } 7097 7098 if (! is_allowed_callout_name(enc, name_start, name_end)) 7099 return ONIGERR_INVALID_CALLOUT_NAME; 7100 7101 if (c == '[') { 7102 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7103 tag_end = tag_start = p; 7104 while (! PEND) { 7105 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7106 tag_end = p; 7107 PFETCH_S(c); 7108 if (c == ']') break; 7109 } 7110 if (! 
is_allowed_callout_tag_name(enc, tag_start, tag_end)) 7111 return ONIGERR_INVALID_CALLOUT_TAG_NAME; 7112 7113 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7114 PFETCH_S(c); 7115 } 7116 else { 7117 tag_start = tag_end = 0; 7118 } 7119 7120 if (c == '{') { 7121 UChar* save; 7122 7123 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7124 7125 /* read for single check only */ 7126 save = p; 7127 arg_num = parse_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env); 7128 if (arg_num < 0) return arg_num; 7129 7130 is_not_single = PPEEK_IS(cterm) ? 0 : 1; 7131 p = save; 7132 r = get_callout_name_id_by_name(enc, is_not_single, name_start, name_end, 7133 &name_id); 7134 if (r != ONIG_NORMAL) return r; 7135 7136 max_arg_num = get_callout_arg_num_by_name_id(name_id); 7137 for (i = 0; i < max_arg_num; i++) { 7138 types[i] = get_callout_arg_type_by_name_id(name_id, i); 7139 } 7140 7141 arg_num = parse_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env); 7142 if (arg_num < 0) return arg_num; 7143 7144 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7145 PFETCH_S(c); 7146 } 7147 else { 7148 arg_num = 0; 7149 7150 is_not_single = 0; 7151 r = get_callout_name_id_by_name(enc, is_not_single, name_start, name_end, 7152 &name_id); 7153 if (r != ONIG_NORMAL) return r; 7154 7155 max_arg_num = get_callout_arg_num_by_name_id(name_id); 7156 for (i = 0; i < max_arg_num; i++) { 7157 types[i] = get_callout_arg_type_by_name_id(name_id, i); 7158 } 7159 } 7160 7161 in = onig_get_callout_in_by_name_id(name_id); 7162 opt_arg_num = get_callout_opt_arg_num_by_name_id(name_id); 7163 if (arg_num > max_arg_num || arg_num < (max_arg_num - opt_arg_num)) 7164 return ONIGERR_INVALID_CALLOUT_ARG; 7165 7166 if (c != cterm) 7167 return ONIGERR_INVALID_CALLOUT_PATTERN; 7168 7169 r = reg_callout_list_entry(env, &num); 7170 if (r != 0) return r; 7171 7172 ext = onig_get_regex_ext(env->reg); 7173 CHECK_NULL_RETURN_MEMERR(ext); 7174 if (IS_NULL(ext->pattern)) { 7175 r = onig_ext_set_pattern(env->reg, 
env->pattern, env->pattern_end); 7176 if (r != ONIG_NORMAL) return r; 7177 } 7178 7179 if (tag_start != tag_end) { 7180 r = callout_tag_entry(env, env->reg, tag_start, tag_end, num); 7181 if (r != ONIG_NORMAL) return r; 7182 } 7183 7184 r = node_new_callout(&node, ONIG_CALLOUT_OF_NAME, num, name_id, env); 7185 if (r != ONIG_NORMAL) return r; 7186 7187 e = onig_reg_callout_list_at(env->reg, num); 7188 CHECK_NULL_RETURN_MEMERR(e); 7189 7190 e->of = ONIG_CALLOUT_OF_NAME; 7191 e->in = in; 7192 e->name_id = name_id; 7193 e->type = onig_get_callout_type_by_name_id(name_id); 7194 e->start_func = onig_get_callout_start_func_by_name_id(name_id); 7195 e->end_func = onig_get_callout_end_func_by_name_id(name_id); 7196 e->u.arg.num = max_arg_num; 7197 e->u.arg.passed_num = arg_num; 7198 for (i = 0; i < max_arg_num; i++) { 7199 e->u.arg.types[i] = types[i]; 7200 if (i < arg_num) 7201 e->u.arg.vals[i] = vals[i]; 7202 else 7203 e->u.arg.vals[i] = get_callout_opt_default_by_name_id(name_id, i); 7204 } 7205 7206 *np = node; 7207 *src = p; 7208 return 0; 7209 } 7210 #endif 7211 7212 static int 7213 parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, 7214 ScanEnv* env) 7215 { 7216 int r, num; 7217 Node *target; 7218 OnigOptionType option; 7219 OnigCodePoint c; 7220 int list_capture; 7221 OnigEncoding enc = env->enc; 7222 7223 UChar* p = *src; 7224 PFETCH_READY; 7225 7226 *np = NULL; 7227 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; 7228 7229 option = env->options; 7230 c = PPEEK; 7231 if (c == '?' && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { 7232 PINC; 7233 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7234 7235 PFETCH(c); 7236 switch (c) { 7237 case ':': /* (?:...) 
grouping only */ 7238 group: 7239 r = fetch_token(tok, &p, end, env); 7240 if (r < 0) return r; 7241 r = parse_alts(np, tok, term, &p, end, env, FALSE); 7242 if (r < 0) return r; 7243 *src = p; 7244 return 1; /* group */ 7245 break; 7246 7247 case '=': 7248 *np = node_new_anchor(ANCR_PREC_READ); 7249 break; 7250 case '!': /* preceding read */ 7251 *np = node_new_anchor(ANCR_PREC_READ_NOT); 7252 break; 7253 case '>': /* (?>...) stop backtrack */ 7254 *np = node_new_bag(BAG_STOP_BACKTRACK); 7255 break; 7256 7257 case '\'': 7258 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { 7259 goto named_group1; 7260 } 7261 else 7262 return ONIGERR_UNDEFINED_GROUP_OPTION; 7263 break; 7264 7265 case '<': /* look behind (?<=...), (?<!...) */ 7266 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; 7267 PFETCH(c); 7268 if (c == '=') 7269 *np = node_new_anchor(ANCR_LOOK_BEHIND); 7270 else if (c == '!') 7271 *np = node_new_anchor(ANCR_LOOK_BEHIND_NOT); 7272 else { 7273 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { 7274 UChar *name; 7275 UChar *name_end; 7276 enum REF_NUM num_type; 7277 7278 PUNFETCH; 7279 c = '<'; 7280 7281 named_group1: 7282 list_capture = 0; 7283 7284 #ifdef USE_CAPTURE_HISTORY 7285 named_group2: 7286 #endif 7287 name = p; 7288 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 7289 &num_type, FALSE); 7290 if (r < 0) return r; 7291 7292 num = scan_env_add_mem_entry(env); 7293 if (num < 0) return num; 7294 if (list_capture != 0 && num >= (int )MEM_STATUS_BITS_NUM) 7295 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; 7296 7297 r = name_add(env->reg, name, name_end, num, env); 7298 if (r != 0) return r; 7299 *np = node_new_memory(1); 7300 CHECK_NULL_RETURN_MEMERR(*np); 7301 BAG_(*np)->m.regnum = num; 7302 if (list_capture != 0) 7303 MEM_STATUS_ON_SIMPLE(env->cap_history, num); 7304 env->num_named++; 7305 } 7306 else { 7307 return ONIGERR_UNDEFINED_GROUP_OPTION; 7308 } 7309 } 7310 break; 7311 
7312 case '~': 7313 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP)) { 7314 Node* absent; 7315 Node* expr; 7316 int head_bar; 7317 int is_range_cutter; 7318 7319 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7320 7321 if (PPEEK_IS('|')) { /* (?~|generator|absent) */ 7322 PINC; 7323 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7324 7325 head_bar = 1; 7326 if (PPEEK_IS(')')) { /* (?~|) : range clear */ 7327 PINC; 7328 r = make_range_clear(np, env); 7329 if (r != 0) return r; 7330 goto end; 7331 } 7332 } 7333 else 7334 head_bar = 0; 7335 7336 r = fetch_token(tok, &p, end, env); 7337 if (r < 0) return r; 7338 r = parse_alts(&absent, tok, term, &p, end, env, TRUE); 7339 if (r < 0) { 7340 onig_node_free(absent); 7341 return r; 7342 } 7343 7344 expr = NULL_NODE; 7345 is_range_cutter = 0; 7346 if (head_bar != 0) { 7347 Node* top = absent; 7348 if (NODE_TYPE(top) != NODE_ALT || IS_NULL(NODE_CDR(top))) { 7349 expr = NULL_NODE; 7350 is_range_cutter = 1; 7351 /* return ONIGERR_INVALID_ABSENT_GROUP_GENERATOR_PATTERN; */ 7352 } 7353 else { 7354 absent = NODE_CAR(top); 7355 expr = NODE_CDR(top); 7356 NODE_CAR(top) = NULL_NODE; 7357 NODE_CDR(top) = NULL_NODE; 7358 onig_node_free(top); 7359 if (IS_NULL(NODE_CDR(expr))) { 7360 top = expr; 7361 expr = NODE_CAR(top); 7362 NODE_CAR(top) = NULL_NODE; 7363 onig_node_free(top); 7364 } 7365 } 7366 } 7367 7368 r = make_absent_tree(np, absent, expr, is_range_cutter, env); 7369 if (r != 0) { 7370 return r; 7371 } 7372 goto end; 7373 } 7374 else { 7375 return ONIGERR_UNDEFINED_GROUP_OPTION; 7376 } 7377 break; 7378 7379 #ifdef USE_CALLOUT 7380 case '{': 7381 if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS)) 7382 return ONIGERR_UNDEFINED_GROUP_OPTION; 7383 7384 r = parse_callout_of_contents(np, ')', &p, end, env); 7385 if (r != 0) return r; 7386 7387 goto end; 7388 break; 7389 #endif 7390 7391 case '(': 7392 /* (?()...) 
*/ 7393 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE)) { 7394 UChar *prev; 7395 Node* condition; 7396 int condition_is_checker; 7397 7398 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7399 PFETCH(c); 7400 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7401 7402 if (IS_CODE_DIGIT_ASCII(enc, c) 7403 || c == '-' || c == '+' || c == '<' || c == '\'') { 7404 UChar* name_end; 7405 int back_num; 7406 int exist_level; 7407 int level; 7408 enum REF_NUM num_type; 7409 int is_enclosed; 7410 7411 is_enclosed = (c == '<' || c == '\'') ? 1 : 0; 7412 if (! is_enclosed) 7413 PUNFETCH; 7414 prev = p; 7415 exist_level = 0; 7416 #ifdef USE_BACKREF_WITH_LEVEL 7417 name_end = NULL_UCHARP; /* no need. escape gcc warning. */ 7418 r = fetch_name_with_level( 7419 (OnigCodePoint )(is_enclosed != 0 ? c : '('), 7420 &p, end, &name_end, 7421 env, &back_num, &level, &num_type); 7422 if (r == 1) exist_level = 1; 7423 #else 7424 r = fetch_name((OnigCodePoint )(is_enclosed != 0 ? c : '('), 7425 &p, end, &name_end, env, &back_num, &num_type, TRUE); 7426 #endif 7427 if (r < 0) { 7428 if (is_enclosed == 0) { 7429 goto any_condition; 7430 } 7431 else 7432 return r; 7433 } 7434 7435 condition_is_checker = 1; 7436 if (num_type != IS_NOT_NUM) { 7437 if (num_type == IS_REL_NUM) { 7438 back_num = backref_rel_to_abs(back_num, env); 7439 } 7440 if (back_num <= 0) 7441 return ONIGERR_INVALID_BACKREF; 7442 7443 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { 7444 if (back_num > env->num_mem || 7445 IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) 7446 return ONIGERR_INVALID_BACKREF; 7447 } 7448 7449 condition = node_new_backref_checker(1, &back_num, FALSE, 7450 #ifdef USE_BACKREF_WITH_LEVEL 7451 exist_level, level, 7452 #endif 7453 env); 7454 } 7455 else { 7456 int num; 7457 int* backs; 7458 7459 num = name_to_group_numbers(env, prev, name_end, &backs); 7460 if (num <= 0) { 7461 return ONIGERR_UNDEFINED_NAME_REFERENCE; 7462 } 7463 if (IS_SYNTAX_BV(env->syntax, 
ONIG_SYN_STRICT_CHECK_BACKREF)) { 7464 int i; 7465 for (i = 0; i < num; i++) { 7466 if (backs[i] > env->num_mem || 7467 IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) 7468 return ONIGERR_INVALID_BACKREF; 7469 } 7470 } 7471 7472 condition = node_new_backref_checker(num, backs, TRUE, 7473 #ifdef USE_BACKREF_WITH_LEVEL 7474 exist_level, level, 7475 #endif 7476 env); 7477 } 7478 7479 if (is_enclosed != 0) { 7480 if (PEND) goto err_if_else; 7481 PFETCH(c); 7482 if (c != ')') goto err_if_else; 7483 } 7484 } 7485 #ifdef USE_CALLOUT 7486 else if (c == '?') { 7487 if (IS_SYNTAX_OP2(env->syntax, 7488 ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS)) { 7489 if (! PEND && PPEEK_IS('{')) { 7490 /* condition part is callouts of contents: (?(?{...})THEN|ELSE) */ 7491 condition_is_checker = 0; 7492 PFETCH(c); 7493 r = parse_callout_of_contents(&condition, ')', &p, end, env); 7494 if (r != 0) return r; 7495 goto end_condition; 7496 } 7497 } 7498 goto any_condition; 7499 } 7500 else if (c == '*' && 7501 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) { 7502 condition_is_checker = 0; 7503 r = parse_callout_of_name(&condition, ')', &p, end, env); 7504 if (r != 0) return r; 7505 goto end_condition; 7506 } 7507 #endif 7508 else { 7509 any_condition: 7510 PUNFETCH; 7511 condition_is_checker = 0; 7512 r = fetch_token(tok, &p, end, env); 7513 if (r < 0) return r; 7514 r = parse_alts(&condition, tok, term, &p, end, env, FALSE); 7515 if (r < 0) { 7516 onig_node_free(condition); 7517 return r; 7518 } 7519 } 7520 7521 #ifdef USE_CALLOUT 7522 end_condition: 7523 #endif 7524 CHECK_NULL_RETURN_MEMERR(condition); 7525 7526 if (PEND) { 7527 err_if_else: 7528 onig_node_free(condition); 7529 return ONIGERR_END_PATTERN_IN_GROUP; 7530 } 7531 7532 if (PPEEK_IS(')')) { /* case: empty body: make backref checker */ 7533 if (condition_is_checker == 0) { 7534 onig_node_free(condition); 7535 return ONIGERR_INVALID_IF_ELSE_SYNTAX; 7536 } 7537 PFETCH(c); 7538 *np = condition; 7539 } 7540 else { /* 
if-else */ 7541 int then_is_empty; 7542 Node *Then, *Else; 7543 7544 Then = 0; 7545 if (PPEEK_IS('|')) { 7546 PFETCH(c); 7547 then_is_empty = 1; 7548 } 7549 else 7550 then_is_empty = 0; 7551 7552 r = fetch_token(tok, &p, end, env); 7553 if (r < 0) { 7554 onig_node_free(condition); 7555 return r; 7556 } 7557 r = parse_alts(&target, tok, term, &p, end, env, TRUE); 7558 if (r < 0) { 7559 onig_node_free(condition); 7560 onig_node_free(target); 7561 return r; 7562 } 7563 7564 if (then_is_empty != 0) { 7565 Else = target; 7566 } 7567 else { 7568 if (NODE_TYPE(target) == NODE_ALT) { 7569 Then = NODE_CAR(target); 7570 if (NODE_CDR(NODE_CDR(target)) == NULL_NODE) { 7571 Else = NODE_CAR(NODE_CDR(target)); 7572 cons_node_free_alone(NODE_CDR(target)); 7573 } 7574 else { 7575 Else = NODE_CDR(target); 7576 } 7577 cons_node_free_alone(target); 7578 } 7579 else { 7580 Then = target; 7581 Else = 0; 7582 } 7583 } 7584 7585 *np = node_new_bag_if_else(condition, Then, Else); 7586 if (IS_NULL(*np)) { 7587 onig_node_free(condition); 7588 onig_node_free(Then); 7589 onig_node_free(Else); 7590 return ONIGERR_MEMORY; 7591 } 7592 } 7593 goto end; 7594 } 7595 else { 7596 return ONIGERR_UNDEFINED_GROUP_OPTION; 7597 } 7598 break; 7599 7600 #ifdef USE_CAPTURE_HISTORY 7601 case '@': 7602 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { 7603 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { 7604 PFETCH(c); 7605 if (c == '<' || c == '\'') { 7606 list_capture = 1; 7607 goto named_group2; /* (?@<name>...) 
*/ 7608 } 7609 PUNFETCH; 7610 } 7611 7612 *np = node_new_memory(0); 7613 CHECK_NULL_RETURN_MEMERR(*np); 7614 num = scan_env_add_mem_entry(env); 7615 if (num < 0) { 7616 return num; 7617 } 7618 else if (num >= (int )MEM_STATUS_BITS_NUM) { 7619 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; 7620 } 7621 BAG_(*np)->m.regnum = num; 7622 MEM_STATUS_ON_SIMPLE(env->cap_history, num); 7623 } 7624 else { 7625 return ONIGERR_UNDEFINED_GROUP_OPTION; 7626 } 7627 break; 7628 #endif 7629 7630 #ifdef USE_POSIXLINE_OPTION 7631 case 'p': 7632 #endif 7633 case '-': case 'i': case 'm': case 's': case 'x': 7634 case 'W': case 'D': case 'S': case 'P': 7635 case 'y': 7636 { 7637 int neg = 0; 7638 7639 while (1) { 7640 switch (c) { 7641 case ':': 7642 case ')': 7643 break; 7644 7645 case '-': neg = 1; break; 7646 case 'x': OPTION_NEGATE(option, ONIG_OPTION_EXTEND, neg); break; 7647 case 'i': OPTION_NEGATE(option, ONIG_OPTION_IGNORECASE, neg); break; 7648 case 's': 7649 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { 7650 OPTION_NEGATE(option, ONIG_OPTION_MULTILINE, neg); 7651 } 7652 else 7653 return ONIGERR_UNDEFINED_GROUP_OPTION; 7654 break; 7655 7656 case 'm': 7657 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { 7658 OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 
TRUE : FALSE)); 7659 } 7660 else if (IS_SYNTAX_OP2(env->syntax, 7661 ONIG_SYN_OP2_OPTION_ONIGURUMA|ONIG_SYN_OP2_OPTION_RUBY)) { 7662 OPTION_NEGATE(option, ONIG_OPTION_MULTILINE, neg); 7663 } 7664 else 7665 return ONIGERR_UNDEFINED_GROUP_OPTION; 7666 break; 7667 #ifdef USE_POSIXLINE_OPTION 7668 case 'p': 7669 OPTION_NEGATE(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg); 7670 break; 7671 #endif 7672 case 'W': OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); break; 7673 case 'D': OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); break; 7674 case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break; 7675 case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break; 7676 7677 case 'y': /* y{g}, y{w} */ 7678 { 7679 if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) 7680 return ONIGERR_UNDEFINED_GROUP_OPTION; 7681 7682 if (neg != 0) return ONIGERR_UNDEFINED_GROUP_OPTION; 7683 7684 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7685 if (! PPEEK_IS('{')) return ONIGERR_UNDEFINED_GROUP_OPTION; 7686 PFETCH(c); 7687 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7688 PFETCH(c); 7689 switch (c) { 7690 case 'g': 7691 if (! ONIGENC_IS_UNICODE_ENCODING(enc)) 7692 return ONIGERR_UNDEFINED_GROUP_OPTION; 7693 7694 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, FALSE); 7695 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, TRUE); 7696 break; 7697 #ifdef USE_UNICODE_WORD_BREAK 7698 case 'w': 7699 if (! 
ONIGENC_IS_UNICODE_ENCODING(enc)) 7700 return ONIGERR_UNDEFINED_GROUP_OPTION; 7701 7702 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, FALSE); 7703 OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, TRUE); 7704 break; 7705 #endif 7706 default: 7707 return ONIGERR_UNDEFINED_GROUP_OPTION; 7708 break; 7709 } 7710 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7711 PFETCH(c); 7712 if (c != '}') 7713 return ONIGERR_UNDEFINED_GROUP_OPTION; 7714 break; 7715 } /* case 'y' */ 7716 7717 default: 7718 return ONIGERR_UNDEFINED_GROUP_OPTION; 7719 } 7720 7721 if (c == ')') { 7722 *np = node_new_option(option); 7723 CHECK_NULL_RETURN_MEMERR(*np); 7724 *src = p; 7725 return 2; /* option only */ 7726 } 7727 else if (c == ':') { 7728 OnigOptionType prev = env->options; 7729 7730 env->options = option; 7731 r = fetch_token(tok, &p, end, env); 7732 if (r < 0) return r; 7733 r = parse_alts(&target, tok, term, &p, end, env, FALSE); 7734 env->options = prev; 7735 if (r < 0) { 7736 onig_node_free(target); 7737 return r; 7738 } 7739 *np = node_new_option(option); 7740 CHECK_NULL_RETURN_MEMERR(*np); 7741 NODE_BODY(*np) = target; 7742 *src = p; 7743 return 0; 7744 } 7745 7746 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 7747 PFETCH(c); 7748 } /* while (1) */ 7749 } 7750 break; 7751 7752 default: 7753 return ONIGERR_UNDEFINED_GROUP_OPTION; 7754 } 7755 } 7756 #ifdef USE_CALLOUT 7757 else if (c == '*' && 7758 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) { 7759 PINC; 7760 r = parse_callout_of_name(np, ')', &p, end, env); 7761 if (r != 0) return r; 7762 7763 goto end; 7764 } 7765 #endif 7766 else { 7767 if (OPTON_DONT_CAPTURE_GROUP(env->options)) 7768 goto group; 7769 7770 *np = node_new_memory(0); 7771 CHECK_NULL_RETURN_MEMERR(*np); 7772 num = scan_env_add_mem_entry(env); 7773 if (num < 0) return num; 7774 BAG_(*np)->m.regnum = num; 7775 } 7776 7777 CHECK_NULL_RETURN_MEMERR(*np); 7778 r = fetch_token(tok, &p, end, env); 7779 if (r < 0) return r; 
7780 r = parse_alts(&target, tok, term, &p, end, env, FALSE); 7781 if (r < 0) { 7782 onig_node_free(target); 7783 return r; 7784 } 7785 7786 NODE_BODY(*np) = target; 7787 7788 if (NODE_TYPE(*np) == NODE_BAG) { 7789 if (BAG_(*np)->type == BAG_MEMORY) { 7790 /* Don't move this to previous of parse_alts() */ 7791 r = scan_env_set_mem_node(env, BAG_(*np)->m.regnum, *np); 7792 if (r != 0) return r; 7793 } 7794 } 7795 7796 end: 7797 *src = p; 7798 return 0; 7799 } 7800 7801 static const char* PopularQStr[] = { 7802 "?", "*", "+", "??", "*?", "+?" 7803 }; 7804 7805 static const char* ReduceQStr[] = { 7806 "", "", "*", "*?", "??", "+ and ??", "+? and ?" 7807 }; 7808 7809 static int 7810 assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env) 7811 { 7812 QuantNode* qn; 7813 7814 qn = QUANT_(qnode); 7815 if (qn->lower == 1 && qn->upper == 1) 7816 return 1; 7817 7818 switch (NODE_TYPE(target)) { 7819 case NODE_STRING: 7820 if (group == 0) { 7821 if (str_node_can_be_split(target, env->enc)) { 7822 Node* n = str_node_split_last_char(target, env->enc); 7823 if (IS_NOT_NULL(n)) { 7824 NODE_BODY(qnode) = n; 7825 return 2; 7826 } 7827 } 7828 } 7829 break; 7830 7831 case NODE_QUANT: 7832 { /* check redundant double repeat. */ 7833 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... 
*/ 7834 QuantNode* qnt = QUANT_(target); 7835 int nestq_num = quantifier_type_num(qn); 7836 int targetq_num = quantifier_type_num(qnt); 7837 7838 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR 7839 if (targetq_num >= 0 && nestq_num >= 0 && 7840 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { 7841 UChar buf[WARN_BUFSIZE]; 7842 7843 switch(ReduceTypeTable[targetq_num][nestq_num]) { 7844 case RQ_ASIS: 7845 break; 7846 7847 case RQ_DEL: 7848 if (onig_verb_warn != onig_null_warn) { 7849 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, 7850 env->pattern, env->pattern_end, 7851 (UChar* )"redundant nested repeat operator"); 7852 (*onig_verb_warn)((char* )buf); 7853 } 7854 goto warn_exit; 7855 break; 7856 7857 default: 7858 if (onig_verb_warn != onig_null_warn) { 7859 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, 7860 env->pattern, env->pattern_end, 7861 (UChar* )"nested repeat operator %s and %s was replaced with '%s'", 7862 PopularQStr[targetq_num], PopularQStr[nestq_num], 7863 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); 7864 (*onig_verb_warn)((char* )buf); 7865 } 7866 goto warn_exit; 7867 break; 7868 } 7869 } 7870 7871 warn_exit: 7872 #endif 7873 if (targetq_num >= 0 && nestq_num < 0) { 7874 if (targetq_num == 1 || targetq_num == 2) { /* * or + */ 7875 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ 7876 if (! IS_INFINITE_REPEAT(qn->upper) && qn->upper > 1 && qn->greedy) { 7877 qn->upper = (qn->lower == 0 ? 
1 : qn->lower); 7878 } 7879 } 7880 } 7881 else { 7882 int r; 7883 7884 NODE_BODY(qnode) = target; 7885 r = onig_reduce_nested_quantifier(qnode); 7886 return r; 7887 } 7888 } 7889 break; 7890 7891 default: 7892 break; 7893 } 7894 7895 NODE_BODY(qnode) = target; 7896 return 0; 7897 } 7898 7899 7900 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 7901 static int 7902 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) 7903 { 7904 BBuf *tbuf; 7905 int r; 7906 7907 if (IS_NCCLASS_NOT(cc)) { 7908 bitset_invert(cc->bs); 7909 7910 if (! ONIGENC_IS_SINGLEBYTE(enc)) { 7911 r = not_code_range_buf(enc, cc->mbuf, &tbuf); 7912 if (r != 0) return r; 7913 7914 bbuf_free(cc->mbuf); 7915 cc->mbuf = tbuf; 7916 } 7917 7918 NCCLASS_CLEAR_NOT(cc); 7919 } 7920 7921 return 0; 7922 } 7923 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ 7924 7925 #define ADD_CODE_INTO_CC(cc, code, enc) do {\ 7926 if (ONIGENC_MBC_MINLEN(enc) > 1 || ONIGENC_CODE_TO_MBCLEN(enc, code) != 1) {\ 7927 add_code_range_to_buf(&((cc)->mbuf), code, code);\ 7928 }\ 7929 else {\ 7930 BITSET_SET_BIT((cc)->bs, code);\ 7931 }\ 7932 } while (0) 7933 7934 extern int 7935 onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, 7936 int n, OnigCodePoint codes[]) 7937 { 7938 int i; 7939 Node* node; 7940 CClassNode* cc; 7941 7942 *rnode = NULL_NODE; 7943 7944 node = node_new_cclass(); 7945 CHECK_NULL_RETURN_MEMERR(node); 7946 7947 cc = CCLASS_(node); 7948 7949 for (i = 0; i < n; i++) { 7950 ADD_CODE_INTO_CC(cc, codes[i], enc); 7951 } 7952 7953 *rnode = node; 7954 return 0; 7955 } 7956 7957 typedef struct { 7958 ScanEnv* env; 7959 CClassNode* cc; 7960 Node* alt_root; 7961 Node** ptail; 7962 } IApplyCaseFoldArg; 7963 7964 static int 7965 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) 7966 { 7967 IApplyCaseFoldArg* iarg; 7968 ScanEnv* env; 7969 CClassNode* cc; 7970 7971 iarg = (IApplyCaseFoldArg* )arg; 7972 env = iarg->env; 7973 cc = iarg->cc; 7974 7975 if (to_len == 1) { 
7976 int is_in = onig_is_code_in_cc(env->enc, from, cc); 7977 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 7978 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || 7979 (is_in == 0 && IS_NCCLASS_NOT(cc))) { 7980 ADD_CODE_INTO_CC(cc, *to, env->enc); 7981 } 7982 #else 7983 if (is_in != 0) { 7984 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || 7985 ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) { 7986 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); 7987 add_code_range(&(cc->mbuf), env, *to, *to); 7988 } 7989 else { 7990 if (IS_NCCLASS_NOT(cc)) { 7991 BITSET_CLEAR_BIT(cc->bs, *to); 7992 } 7993 else 7994 BITSET_SET_BIT(cc->bs, *to); 7995 } 7996 } 7997 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ 7998 } 7999 else { 8000 int r, i, len; 8001 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; 8002 8003 if (onig_is_code_in_cc(env->enc, from, cc) 8004 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 8005 && !IS_NCCLASS_NOT(cc) 8006 #endif 8007 ) { 8008 int n, j, m, index; 8009 Node* list_node; 8010 Node* ns[3]; 8011 8012 n = 0; 8013 for (i = 0; i < to_len; i++) { 8014 OnigCodePoint code; 8015 Node* csnode; 8016 CClassNode* cs_cc; 8017 8018 index = onigenc_unicode_fold1_key(&to[i]); 8019 if (index >= 0) { 8020 csnode = node_new_cclass(); 8021 cs_cc = CCLASS_(csnode); 8022 if (IS_NULL(csnode)) { 8023 err_free_ns: 8024 for (j = 0; j < n; j++) onig_node_free(ns[j]); 8025 return ONIGERR_MEMORY; 8026 } 8027 m = FOLDS1_UNFOLDS_NUM(index); 8028 for (j = 0; j < m; j++) { 8029 code = FOLDS1_UNFOLDS(index)[j]; 8030 ADD_CODE_INTO_CC(cs_cc, code, env->enc); 8031 } 8032 ADD_CODE_INTO_CC(cs_cc, to[i], env->enc); 8033 ns[n++] = csnode; 8034 } 8035 else { 8036 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); 8037 if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) { 8038 csnode = node_new_str(buf, buf + len); 8039 if (IS_NULL(csnode)) goto err_free_ns; 8040 8041 NODE_STRING_SET_CASE_EXPANDED(csnode); 8042 ns[n++] = csnode; 8043 } 8044 else { 8045 r = onig_node_str_cat(ns[n-1], buf, buf + len); 
8046 if (r < 0) goto err_free_ns; 8047 } 8048 } 8049 } 8050 8051 if (n == 1) 8052 list_node = ns[0]; 8053 else 8054 list_node = make_list(n, ns); 8055 8056 *(iarg->ptail) = onig_node_new_alt(list_node, NULL_NODE); 8057 if (IS_NULL(*(iarg->ptail))) { 8058 onig_node_free(list_node); 8059 return ONIGERR_MEMORY; 8060 } 8061 iarg->ptail = &(NODE_CDR((*(iarg->ptail)))); 8062 } 8063 } 8064 8065 return 0; 8066 } 8067 8068 static int 8069 parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, 8070 ScanEnv* env, int group_head) 8071 { 8072 int r, len, group; 8073 Node* qn; 8074 Node** tp; 8075 unsigned int parse_depth; 8076 8077 retry: 8078 group = 0; 8079 *np = NULL; 8080 if (tok->type == (enum TokenSyms )term) 8081 goto end_of_token; 8082 8083 parse_depth = env->parse_depth; 8084 8085 switch (tok->type) { 8086 case TK_ALT: 8087 case TK_EOT: 8088 end_of_token: 8089 *np = node_new_empty(); 8090 CHECK_NULL_RETURN_MEMERR(*np); 8091 return tok->type; 8092 break; 8093 8094 case TK_SUBEXP_OPEN: 8095 r = parse_bag(np, tok, TK_SUBEXP_CLOSE, src, end, env); 8096 if (r < 0) return r; 8097 if (r == 1) { /* group */ 8098 if (group_head == 0) 8099 group = 1; 8100 else { 8101 Node* target = *np; 8102 *np = node_new_group(target); 8103 if (IS_NULL(*np)) { 8104 onig_node_free(target); 8105 return ONIGERR_MEMORY; 8106 } 8107 group = 2; 8108 } 8109 } 8110 else if (r == 2) { /* option only */ 8111 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH)) { 8112 env->options = BAG_(*np)->o.options; 8113 r = fetch_token(tok, src, end, env); 8114 if (r < 0) return r; 8115 onig_node_free(*np); 8116 goto retry; 8117 } 8118 else { 8119 Node* target; 8120 OnigOptionType prev = env->options; 8121 8122 env->options = BAG_(*np)->o.options; 8123 r = fetch_token(tok, src, end, env); 8124 if (r < 0) return r; 8125 r = parse_alts(&target, tok, term, src, end, env, FALSE); 8126 env->options = prev; 8127 if (r < 0) { 8128 onig_node_free(target); 8129 return r; 8130 } 8131 
NODE_BODY(*np) = target; 8132 } 8133 return tok->type; 8134 } 8135 break; 8136 8137 case TK_SUBEXP_CLOSE: 8138 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) 8139 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; 8140 8141 if (tok->escaped) goto tk_crude_byte; 8142 else goto tk_byte; 8143 break; 8144 8145 case TK_STRING: 8146 tk_byte: 8147 { 8148 *np = node_new_str_with_options(tok->backp, *src, env->options); 8149 CHECK_NULL_RETURN_MEMERR(*np); 8150 8151 while (1) { 8152 r = fetch_token(tok, src, end, env); 8153 if (r < 0) return r; 8154 if (r != TK_STRING) break; 8155 8156 r = onig_node_str_cat(*np, tok->backp, *src); 8157 if (r < 0) return r; 8158 } 8159 8160 string_end: 8161 tp = np; 8162 goto repeat; 8163 } 8164 break; 8165 8166 case TK_CRUDE_BYTE: 8167 tk_crude_byte: 8168 { 8169 *np = node_new_str_crude_char(tok->u.byte, env->options); 8170 CHECK_NULL_RETURN_MEMERR(*np); 8171 len = 1; 8172 while (1) { 8173 if (len >= ONIGENC_MBC_MINLEN(env->enc)) { 8174 if (len == enclen(env->enc, STR_(*np)->s)) { 8175 r = fetch_token(tok, src, end, env); 8176 goto tk_crude_byte_end; 8177 } 8178 } 8179 8180 r = fetch_token(tok, src, end, env); 8181 if (r < 0) return r; 8182 if (r != TK_CRUDE_BYTE) 8183 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; 8184 8185 r = node_str_cat_char(*np, tok->u.byte); 8186 if (r < 0) return r; 8187 8188 len++; 8189 } 8190 8191 tk_crude_byte_end: 8192 if (! 
ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end)) 8193 return ONIGERR_INVALID_WIDE_CHAR_VALUE; 8194 8195 NODE_STRING_CLEAR_CRUDE(*np); 8196 goto string_end; 8197 } 8198 break; 8199 8200 case TK_CODE_POINT: 8201 { 8202 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; 8203 len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); 8204 if (len < 0) return len; 8205 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG 8206 *np = node_new_str_crude(buf, buf + len, env->options); 8207 #else 8208 *np = node_new_str_with_options(buf, buf + len, env->options); 8209 #endif 8210 CHECK_NULL_RETURN_MEMERR(*np); 8211 } 8212 break; 8213 8214 case TK_QUOTE_OPEN: 8215 { 8216 OnigCodePoint end_op[2]; 8217 UChar *qstart, *qend, *nextp; 8218 8219 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax); 8220 end_op[1] = (OnigCodePoint )'E'; 8221 qstart = *src; 8222 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); 8223 if (IS_NULL(qend)) { 8224 nextp = qend = end; 8225 } 8226 *np = node_new_str_with_options(qstart, qend, env->options); 8227 CHECK_NULL_RETURN_MEMERR(*np); 8228 *src = nextp; 8229 } 8230 break; 8231 8232 case TK_CHAR_TYPE: 8233 { 8234 switch (tok->u.prop.ctype) { 8235 case ONIGENC_CTYPE_WORD: 8236 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not, env->options); 8237 CHECK_NULL_RETURN_MEMERR(*np); 8238 break; 8239 8240 case ONIGENC_CTYPE_SPACE: 8241 case ONIGENC_CTYPE_DIGIT: 8242 case ONIGENC_CTYPE_XDIGIT: 8243 { 8244 CClassNode* cc; 8245 8246 *np = node_new_cclass(); 8247 CHECK_NULL_RETURN_MEMERR(*np); 8248 cc = CCLASS_(*np); 8249 add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env); 8250 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); 8251 } 8252 break; 8253 8254 default: 8255 return ONIGERR_PARSER_BUG; 8256 break; 8257 } 8258 } 8259 break; 8260 8261 case TK_CHAR_PROPERTY: 8262 r = parse_char_property(np, tok, src, end, env); 8263 if (r != 0) return r; 8264 break; 8265 8266 case TK_OPEN_CC: 8267 { 8268 CClassNode* cc; 8269 8270 r = parse_cc(np, tok, src, end, env); 
8271 if (r != 0) return r; 8272 8273 cc = CCLASS_(*np); 8274 if (OPTON_IGNORECASE(env->options)) { 8275 IApplyCaseFoldArg iarg; 8276 8277 iarg.env = env; 8278 iarg.cc = cc; 8279 iarg.alt_root = NULL_NODE; 8280 iarg.ptail = &(iarg.alt_root); 8281 8282 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag, 8283 i_apply_case_fold, &iarg); 8284 if (r != 0) { 8285 onig_node_free(iarg.alt_root); 8286 return r; 8287 } 8288 if (IS_NOT_NULL(iarg.alt_root)) { 8289 Node* work = onig_node_new_alt(*np, iarg.alt_root); 8290 if (IS_NULL(work)) { 8291 onig_node_free(iarg.alt_root); 8292 return ONIGERR_MEMORY; 8293 } 8294 *np = work; 8295 } 8296 } 8297 } 8298 break; 8299 8300 case TK_ANYCHAR: 8301 *np = node_new_anychar(env->options); 8302 CHECK_NULL_RETURN_MEMERR(*np); 8303 break; 8304 8305 case TK_ANYCHAR_ANYTIME: 8306 *np = node_new_anychar(env->options); 8307 CHECK_NULL_RETURN_MEMERR(*np); 8308 qn = node_new_quantifier(0, INFINITE_REPEAT, FALSE); 8309 CHECK_NULL_RETURN_MEMERR(qn); 8310 NODE_BODY(qn) = *np; 8311 *np = qn; 8312 break; 8313 8314 case TK_BACKREF: 8315 len = tok->u.backref.num; 8316 *np = node_new_backref(len, 8317 (len > 1 ? 
tok->u.backref.refs : &(tok->u.backref.ref1)), 8318 tok->u.backref.by_name, 8319 #ifdef USE_BACKREF_WITH_LEVEL 8320 tok->u.backref.exist_level, 8321 tok->u.backref.level, 8322 #endif 8323 env); 8324 CHECK_NULL_RETURN_MEMERR(*np); 8325 break; 8326 8327 #ifdef USE_CALL 8328 case TK_CALL: 8329 { 8330 int gnum = tok->u.call.gnum; 8331 8332 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, 8333 gnum, tok->u.call.by_number); 8334 CHECK_NULL_RETURN_MEMERR(*np); 8335 env->num_call++; 8336 if (tok->u.call.by_number != 0 && gnum == 0) { 8337 env->has_call_zero = 1; 8338 } 8339 } 8340 break; 8341 #endif 8342 8343 case TK_ANCHOR: 8344 *np = node_new_anchor_with_options(tok->u.anchor, env->options); 8345 CHECK_NULL_RETURN_MEMERR(*np); 8346 break; 8347 8348 case TK_REPEAT: 8349 case TK_INTERVAL: 8350 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) { 8351 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS)) 8352 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED; 8353 else { 8354 *np = node_new_empty(); 8355 CHECK_NULL_RETURN_MEMERR(*np); 8356 } 8357 } 8358 else { 8359 goto tk_byte; 8360 } 8361 break; 8362 8363 case TK_KEEP: 8364 r = node_new_keep(np, env); 8365 if (r < 0) return r; 8366 break; 8367 8368 case TK_GENERAL_NEWLINE: 8369 r = node_new_general_newline(np, env); 8370 if (r < 0) return r; 8371 break; 8372 8373 case TK_NO_NEWLINE: 8374 r = node_new_no_newline(np, env); 8375 if (r < 0) return r; 8376 break; 8377 8378 case TK_TRUE_ANYCHAR: 8379 r = node_new_true_anychar(np); 8380 if (r < 0) return r; 8381 break; 8382 8383 case TK_TEXT_SEGMENT: 8384 r = make_text_segment(np, env); 8385 if (r < 0) return r; 8386 break; 8387 8388 default: 8389 return ONIGERR_PARSER_BUG; 8390 break; 8391 } 8392 8393 { 8394 tp = np; 8395 8396 re_entry: 8397 r = fetch_token(tok, src, end, env); 8398 if (r < 0) return r; 8399 8400 repeat: 8401 if (r == TK_REPEAT || r == TK_INTERVAL) { 8402 Node* target; 8403 8404 if (is_invalid_quantifier_target(*tp)) 
8405 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; 8406 8407 INC_PARSE_DEPTH(parse_depth); 8408 8409 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, 8410 r == TK_INTERVAL); 8411 CHECK_NULL_RETURN_MEMERR(qn); 8412 QUANT_(qn)->greedy = tok->u.repeat.greedy; 8413 if (group == 2) { 8414 target = node_drop_group(*tp); 8415 *tp = NULL_NODE; 8416 } 8417 else { 8418 target = *tp; 8419 } 8420 r = assign_quantifier_body(qn, target, group, env); 8421 if (r < 0) { 8422 onig_node_free(qn); 8423 *tp = NULL_NODE; 8424 return r; 8425 } 8426 8427 if (tok->u.repeat.possessive != 0) { 8428 Node* en; 8429 en = node_new_bag(BAG_STOP_BACKTRACK); 8430 if (IS_NULL(en)) { 8431 onig_node_free(qn); 8432 return ONIGERR_MEMORY; 8433 } 8434 NODE_BODY(en) = qn; 8435 qn = en; 8436 } 8437 8438 if (r == 0) { 8439 *tp = qn; 8440 } 8441 else if (r == 1) { /* x{1,1} ==> x */ 8442 onig_node_free(qn); 8443 *tp = target; 8444 } 8445 else if (r == 2) { /* split case: /abc+/ */ 8446 Node *tmp; 8447 8448 *tp = node_new_list(*tp, NULL); 8449 if (IS_NULL(*tp)) { 8450 onig_node_free(qn); 8451 return ONIGERR_MEMORY; 8452 } 8453 tmp = NODE_CDR(*tp) = node_new_list(qn, NULL); 8454 if (IS_NULL(tmp)) { 8455 onig_node_free(qn); 8456 return ONIGERR_MEMORY; 8457 } 8458 tp = &(NODE_CAR(tmp)); 8459 } 8460 group = 0; 8461 goto re_entry; 8462 } 8463 } 8464 8465 return r; 8466 } 8467 8468 static int 8469 parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, 8470 ScanEnv* env, int group_head) 8471 { 8472 int r; 8473 Node *node, **headp; 8474 8475 *top = NULL; 8476 INC_PARSE_DEPTH(env->parse_depth); 8477 8478 r = parse_exp(&node, tok, term, src, end, env, group_head); 8479 if (r < 0) { 8480 onig_node_free(node); 8481 return r; 8482 } 8483 8484 if (r == TK_EOT || r == term || r == TK_ALT) { 8485 *top = node; 8486 } 8487 else { 8488 *top = node_new_list(node, NULL); 8489 if (IS_NULL(*top)) { 8490 onig_node_free(node); 8491 return ONIGERR_MEMORY; 8492 } 8493 8494 headp = 
&(NODE_CDR(*top)); 8495 while (r != TK_EOT && r != term && r != TK_ALT) { 8496 r = parse_exp(&node, tok, term, src, end, env, FALSE); 8497 if (r < 0) { 8498 onig_node_free(node); 8499 return r; 8500 } 8501 8502 if (NODE_TYPE(node) == NODE_LIST) { 8503 *headp = node; 8504 while (IS_NOT_NULL(NODE_CDR(node))) node = NODE_CDR(node); 8505 headp = &(NODE_CDR(node)); 8506 } 8507 else { 8508 *headp = node_new_list(node, NULL); 8509 headp = &(NODE_CDR(*headp)); 8510 } 8511 } 8512 } 8513 8514 DEC_PARSE_DEPTH(env->parse_depth); 8515 return r; 8516 } 8517 8518 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ 8519 static int 8520 parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, 8521 ScanEnv* env, int group_head) 8522 { 8523 int r; 8524 Node *node, **headp; 8525 OnigOptionType save_options; 8526 8527 *top = NULL; 8528 INC_PARSE_DEPTH(env->parse_depth); 8529 save_options = env->options; 8530 8531 r = parse_branch(&node, tok, term, src, end, env, group_head); 8532 if (r < 0) { 8533 onig_node_free(node); 8534 return r; 8535 } 8536 8537 if (r == term) { 8538 *top = node; 8539 } 8540 else if (r == TK_ALT) { 8541 *top = onig_node_new_alt(node, NULL); 8542 if (IS_NULL(*top)) { 8543 onig_node_free(node); 8544 return ONIGERR_MEMORY; 8545 } 8546 8547 headp = &(NODE_CDR(*top)); 8548 while (r == TK_ALT) { 8549 r = fetch_token(tok, src, end, env); 8550 if (r < 0) return r; 8551 r = parse_branch(&node, tok, term, src, end, env, FALSE); 8552 if (r < 0) { 8553 onig_node_free(node); 8554 return r; 8555 } 8556 *headp = onig_node_new_alt(node, NULL); 8557 if (IS_NULL(*headp)) { 8558 onig_node_free(node); 8559 onig_node_free(*top); 8560 return ONIGERR_MEMORY; 8561 } 8562 8563 headp = &(NODE_CDR(*headp)); 8564 } 8565 8566 if (tok->type != (enum TokenSyms )term) 8567 goto err; 8568 } 8569 else { 8570 onig_node_free(node); 8571 err: 8572 if (term == TK_SUBEXP_CLOSE) 8573 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; 8574 else 8575 return ONIGERR_PARSER_BUG; 8576 } 8577 8578 
env->options = save_options; 8579 DEC_PARSE_DEPTH(env->parse_depth); 8580 return r; 8581 } 8582 8583 static int 8584 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) 8585 { 8586 int r; 8587 PToken tok; 8588 8589 r = fetch_token(&tok, src, end, env); 8590 if (r < 0) return r; 8591 r = parse_alts(top, &tok, TK_EOT, src, end, env, FALSE); 8592 if (r < 0) return r; 8593 8594 return 0; 8595 } 8596 8597 #ifdef USE_CALL 8598 static int 8599 make_call_zero_body(Node* node, ScanEnv* env, Node** rnode) 8600 { 8601 int r; 8602 8603 Node* x = node_new_memory(0 /* 0: is not named */); 8604 CHECK_NULL_RETURN_MEMERR(x); 8605 8606 NODE_BODY(x) = node; 8607 BAG_(x)->m.regnum = 0; 8608 r = scan_env_set_mem_node(env, 0, x); 8609 if (r != 0) { 8610 onig_node_free(x); 8611 return r; 8612 } 8613 8614 *rnode = x; 8615 return 0; 8616 } 8617 #endif 8618 8619 extern int 8620 onig_parse_tree(Node** root, const UChar* pattern, const UChar* end, 8621 regex_t* reg, ScanEnv* env) 8622 { 8623 int r; 8624 UChar* p; 8625 #ifdef USE_CALLOUT 8626 RegexExt* ext; 8627 #endif 8628 8629 names_clear(reg); 8630 8631 scan_env_clear(env); 8632 env->options = reg->options; 8633 env->case_fold_flag = reg->case_fold_flag; 8634 env->enc = reg->enc; 8635 env->syntax = reg->syntax; 8636 env->pattern = (UChar* )pattern; 8637 env->pattern_end = (UChar* )end; 8638 env->reg = reg; 8639 8640 *root = NULL; 8641 8642 if (! 
ONIGENC_IS_VALID_MBC_STRING(env->enc, pattern, end)) 8643 return ONIGERR_INVALID_WIDE_CHAR_VALUE; 8644 8645 p = (UChar* )pattern; 8646 r = parse_regexp(root, &p, (UChar* )end, env); 8647 8648 #ifdef USE_CALL 8649 if (r != 0) return r; 8650 8651 if (env->has_call_zero != 0) { 8652 Node* zero_node; 8653 r = make_call_zero_body(*root, env, &zero_node); 8654 if (r != 0) return r; 8655 8656 *root = zero_node; 8657 } 8658 #endif 8659 8660 reg->num_mem = env->num_mem; 8661 8662 #ifdef USE_CALLOUT 8663 ext = reg->extp; 8664 if (IS_NOT_NULL(ext) && ext->callout_num > 0) { 8665 r = setup_ext_callout_list_values(reg); 8666 } 8667 #endif 8668 8669 return r; 8670 } 8671 8672 extern void 8673 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED, 8674 UChar* arg, UChar* arg_end) 8675 { 8676 env->error = arg; 8677 env->error_end = arg_end; 8678 } 8679