1 /* Extended regular expression matching and search library. 2 Copyright (C) 2002-2018 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU General Public 8 License as published by the Free Software Foundation; either 9 version 3 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public 17 License along with the GNU C Library; if not, see 18 <https://www.gnu.org/licenses/>. */ 19 20 #ifdef _LIBC 21 # include <locale/weight.h> 22 #endif 23 24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 25 size_t length, reg_syntax_t syntax); 26 static void re_compile_fastmap_iter (regex_t *bufp, 27 const re_dfastate_t *init_state, 28 char *fastmap); 29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 30 #ifdef RE_ENABLE_I18N 31 static void free_charset (re_charset_t *cset); 32 #endif /* RE_ENABLE_I18N */ 33 static void free_workarea_compile (regex_t *preg); 34 static reg_errcode_t create_initial_state (re_dfa_t *dfa); 35 #ifdef RE_ENABLE_I18N 36 static void optimize_utf8 (re_dfa_t *dfa); 37 #endif 38 static reg_errcode_t analyze (regex_t *preg); 39 static reg_errcode_t preorder (bin_tree_t *root, 40 reg_errcode_t (fn (void *, bin_tree_t *)), 41 void *extra); 42 static reg_errcode_t postorder (bin_tree_t *root, 43 reg_errcode_t (fn (void *, bin_tree_t *)), 44 void *extra); 45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 48 bin_tree_t *node); 49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint); 53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 54 unsigned int constraint); 55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 57 Idx node, bool root); 58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 59 static Idx fetch_number (re_string_t *input, re_token_t *token, 60 reg_syntax_t syntax); 61 static int peek_token (re_token_t *token, re_string_t *input, 62 reg_syntax_t syntax); 63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 64 reg_syntax_t syntax, reg_errcode_t *err); 65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 66 re_token_t *token, reg_syntax_t syntax, 67 Idx nest, reg_errcode_t *err); 68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 69 re_token_t *token, reg_syntax_t syntax, 70 Idx nest, reg_errcode_t *err); 71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 72 re_token_t *token, reg_syntax_t syntax, 73 Idx nest, reg_errcode_t *err); 74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 75 re_token_t *token, reg_syntax_t syntax, 76 Idx nest, reg_errcode_t *err); 77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 78 re_dfa_t *dfa, re_token_t *token, 79 reg_syntax_t syntax, reg_errcode_t *err); 80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 81 re_token_t *token, reg_syntax_t syntax, 82 reg_errcode_t *err); 83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 84 re_string_t *regexp, 85 re_token_t *token, int token_len, 86 re_dfa_t *dfa, 87 reg_syntax_t syntax, 88 bool accept_hyphen); 89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 90 re_string_t *regexp, 91 re_token_t *token); 92 #ifdef RE_ENABLE_I18N 93 static reg_errcode_t build_equiv_class (bitset_t sbcset, 94 re_charset_t *mbcset, 95 Idx *equiv_class_alloc, 96 const unsigned char *name); 97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 98 bitset_t sbcset, 99 re_charset_t *mbcset, 100 Idx *char_class_alloc, 101 const char *class_name, 102 reg_syntax_t syntax); 103 #else /* not RE_ENABLE_I18N */ 104 static reg_errcode_t build_equiv_class (bitset_t sbcset, 105 const unsigned char *name); 106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 107 bitset_t sbcset, 108 const char *class_name, 109 reg_syntax_t syntax); 110 #endif /* not RE_ENABLE_I18N */ 111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 112 RE_TRANSLATE_TYPE trans, 113 const char *class_name, 114 const char *extra, 115 bool non_match, reg_errcode_t *err); 116 static bin_tree_t *create_tree (re_dfa_t *dfa, 117 bin_tree_t *left, bin_tree_t *right, 118 re_token_type_t type); 119 static bin_tree_t *create_token_tree (re_dfa_t *dfa, 120 bin_tree_t *left, bin_tree_t *right, 121 const re_token_t *token); 122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 123 static void free_token (re_token_t *node); 124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 126 127 /* This table gives an error message for each of the error codes listed 128 in regex.h. Obviously the order here has to be same as there. 129 POSIX doesn't require that we do anything for REG_NOERROR, 130 but why not be nice? */ 131 132 static const char __re_error_msgid[] = 133 { 134 #define REG_NOERROR_IDX 0 135 gettext_noop ("Success") /* REG_NOERROR */ 136 "\0" 137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 138 gettext_noop ("No match") /* REG_NOMATCH */ 139 "\0" 140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 142 "\0" 143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 145 "\0" 146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 148 "\0" 149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 151 "\0" 152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 154 "\0" 155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */ 157 "\0" 158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=") 159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 160 "\0" 161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 163 "\0" 164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 166 "\0" 167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 168 gettext_noop ("Invalid range end") /* REG_ERANGE */ 169 "\0" 170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 171 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 172 "\0" 173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 175 "\0" 176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 177 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 178 "\0" 179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 180 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 181 "\0" 182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 184 }; 185 186 static const size_t __re_error_msgid_idx[] = 187 { 188 REG_NOERROR_IDX, 189 REG_NOMATCH_IDX, 190 REG_BADPAT_IDX, 191 REG_ECOLLATE_IDX, 192 REG_ECTYPE_IDX, 193 REG_EESCAPE_IDX, 194 REG_ESUBREG_IDX, 195 REG_EBRACK_IDX, 196 REG_EPAREN_IDX, 197 REG_EBRACE_IDX, 198 REG_BADBR_IDX, 199 REG_ERANGE_IDX, 200 REG_ESPACE_IDX, 201 REG_BADRPT_IDX, 202 REG_EEND_IDX, 203 REG_ESIZE_IDX, 204 REG_ERPAREN_IDX 205 }; 206 207 /* Entry points for GNU code. */ 208 209 /* re_compile_pattern is the GNU regular expression compiler: it 210 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 211 Returns 0 if the pattern was valid, otherwise an error string. 212 213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields 214 are set in BUFP on entry. */ 215 216 const char * 217 re_compile_pattern (const char *pattern, size_t length, 218 struct re_pattern_buffer *bufp) 219 { 220 reg_errcode_t ret; 221 222 /* And GNU code determines whether or not to get register information 223 by passing null for the REGS argument to re_match, etc., not by 224 setting no_sub, unless RE_NO_SUB is set. */ 225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 226 227 /* Match anchors at newline. */ 228 bufp->newline_anchor = 1; 229 230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 231 232 if (!ret) 233 return NULL; 234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 235 } 236 #ifdef _LIBC 237 weak_alias (__re_compile_pattern, re_compile_pattern) 238 #endif 239 240 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can 241 also be assigned to arbitrarily: each pattern buffer stores its own 242 syntax, so it can be changed between regex compilations. */ 243 /* This has no initializer because initialized variables in Emacs 244 become read-only after dumping. */ 245 reg_syntax_t re_syntax_options; 246 247 248 /* Specify the precise syntax of regexps for compilation. This provides 249 for compatibility for various utilities which historically have 250 different, incompatible syntaxes. 251 252 The argument SYNTAX is a bit mask comprised of the various bits 253 defined in regex.h. We return the old syntax. */ 254 255 reg_syntax_t 256 re_set_syntax (reg_syntax_t syntax) 257 { 258 reg_syntax_t ret = re_syntax_options; 259 260 re_syntax_options = syntax; 261 return ret; 262 } 263 #ifdef _LIBC 264 weak_alias (__re_set_syntax, re_set_syntax) 265 #endif 266 267 int 268 re_compile_fastmap (struct re_pattern_buffer *bufp) 269 { 270 re_dfa_t *dfa = bufp->buffer; 271 char *fastmap = bufp->fastmap; 272 273 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 275 if (dfa->init_state != dfa->init_state_word) 276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 277 if (dfa->init_state != dfa->init_state_nl) 278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 279 if (dfa->init_state != dfa->init_state_begbuf) 280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 281 bufp->fastmap_accurate = 1; 282 return 0; 283 } 284 #ifdef _LIBC 285 weak_alias (__re_compile_fastmap, re_compile_fastmap) 286 #endif 287 288 static inline void 289 __attribute__ ((always_inline)) 290 re_set_fastmap (char *fastmap, bool icase, int ch) 291 { 292 fastmap[ch] = 1; 293 if (icase) 294 fastmap[tolower (ch)] = 1; 295 } 296 297 /* Helper function for re_compile_fastmap. 298 Compile fastmap for the initial_state INIT_STATE. */ 299 300 static void 301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 302 char *fastmap) 303 { 304 re_dfa_t *dfa = bufp->buffer; 305 Idx node_cnt; 306 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 308 { 309 Idx node = init_state->nodes.elems[node_cnt]; 310 re_token_type_t type = dfa->nodes[node].type; 311 312 if (type == CHARACTER) 313 { 314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 315 #ifdef RE_ENABLE_I18N 316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 317 { 318 unsigned char buf[MB_LEN_MAX]; 319 unsigned char *p; 320 wchar_t wc; 321 mbstate_t state; 322 323 p = buf; 324 *p++ = dfa->nodes[node].opr.c; 325 while (++node < dfa->nodes_len 326 && dfa->nodes[node].type == CHARACTER 327 && dfa->nodes[node].mb_partial) 328 *p++ = dfa->nodes[node].opr.c; 329 memset (&state, '\0', sizeof (state)); 330 if (__mbrtowc (&wc, (const char *) buf, p - buf, 331 &state) == p - buf 332 && (__wcrtomb ((char *) buf, __towlower (wc), &state) 333 != (size_t) -1)) 334 re_set_fastmap (fastmap, false, buf[0]); 335 } 336 #endif 337 } 338 else if (type == SIMPLE_BRACKET) 339 { 340 int i, ch; 341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 342 { 343 int j; 344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 346 if (w & ((bitset_word_t) 1 << j)) 347 re_set_fastmap (fastmap, icase, ch); 348 } 349 } 350 #ifdef RE_ENABLE_I18N 351 else if (type == COMPLEX_BRACKET) 352 { 353 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 354 Idx i; 355 356 # ifdef _LIBC 357 /* See if we have to try all bytes which start multiple collation 358 elements. 359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid 360 collation element, and don't catch 'b' since 'b' is 361 the only collation element which starts from 'b' (and 362 it is caught by SIMPLE_BRACKET). */ 363 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 364 && (cset->ncoll_syms || cset->nranges)) 365 { 366 const int32_t *table = (const int32_t *) 367 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 368 for (i = 0; i < SBC_MAX; ++i) 369 if (table[i] < 0) 370 re_set_fastmap (fastmap, icase, i); 371 } 372 # endif /* _LIBC */ 373 374 /* See if we have to start the match at all multibyte characters, 375 i.e. where we would not find an invalid sequence. This only 376 applies to multibyte character sets; for single byte character 377 sets, the SIMPLE_BRACKET again suffices. */ 378 if (dfa->mb_cur_max > 1 379 && (cset->nchar_classes || cset->non_match || cset->nranges 380 # ifdef _LIBC 381 || cset->nequiv_classes 382 # endif /* _LIBC */ 383 )) 384 { 385 unsigned char c = 0; 386 do 387 { 388 mbstate_t mbs; 389 memset (&mbs, 0, sizeof (mbs)); 390 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 391 re_set_fastmap (fastmap, false, (int) c); 392 } 393 while (++c != 0); 394 } 395 396 else 397 { 398 /* ... Else catch all bytes which can start the mbchars. */ 399 for (i = 0; i < cset->nmbchars; ++i) 400 { 401 char buf[256]; 402 mbstate_t state; 403 memset (&state, '\0', sizeof (state)); 404 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 405 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 406 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 407 { 408 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state) 409 != (size_t) -1) 410 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 411 } 412 } 413 } 414 } 415 #endif /* RE_ENABLE_I18N */ 416 else if (type == OP_PERIOD 417 #ifdef RE_ENABLE_I18N 418 || type == OP_UTF8_PERIOD 419 #endif /* RE_ENABLE_I18N */ 420 || type == END_OF_RE) 421 { 422 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 423 if (type == END_OF_RE) 424 bufp->can_be_null = 1; 425 return; 426 } 427 } 428 } 429 430 /* Entry point for POSIX code. */ 431 /* regcomp takes a regular expression as a string and compiles it. 432 433 PREG is a regex_t *. We do not expect any fields to be initialized, 434 since POSIX says we shouldn't. Thus, we set 435 436 'buffer' to the compiled pattern; 437 'used' to the length of the compiled pattern; 438 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the 439 REG_EXTENDED bit in CFLAGS is set; otherwise, to 440 RE_SYNTAX_POSIX_BASIC; 441 'newline_anchor' to REG_NEWLINE being set in CFLAGS; 442 'fastmap' to an allocated space for the fastmap; 443 'fastmap_accurate' to zero; 444 're_nsub' to the number of subexpressions in PATTERN. 445 446 PATTERN is the address of the pattern string. 447 448 CFLAGS is a series of bits which affect compilation. 449 450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 451 use POSIX basic syntax. 452 453 If REG_NEWLINE is set, then . and [^...] don't match newline. 454 Also, regexec will try a match beginning after every newline. 455 456 If REG_ICASE is set, then we considers upper- and lowercase 457 versions of letters to be equivalent when matching. 458 459 If REG_NOSUB is set, then when PREG is passed to regexec, that 460 routine will report only success or failure, and nothing about the 461 registers. 462 463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 464 the return codes and their meanings.) */ 465 466 int 467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags) 468 { 469 reg_errcode_t ret; 470 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 471 : RE_SYNTAX_POSIX_BASIC); 472 473 preg->buffer = NULL; 474 preg->allocated = 0; 475 preg->used = 0; 476 477 /* Try to allocate space for the fastmap. */ 478 preg->fastmap = re_malloc (char, SBC_MAX); 479 if (__glibc_unlikely (preg->fastmap == NULL)) 480 return REG_ESPACE; 481 482 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 483 484 /* If REG_NEWLINE is set, newlines are treated differently. */ 485 if (cflags & REG_NEWLINE) 486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 487 syntax &= ~RE_DOT_NEWLINE; 488 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 489 /* It also changes the matching behavior. */ 490 preg->newline_anchor = 1; 491 } 492 else 493 preg->newline_anchor = 0; 494 preg->no_sub = !!(cflags & REG_NOSUB); 495 preg->translate = NULL; 496 497 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 498 499 /* POSIX doesn't distinguish between an unmatched open-group and an 500 unmatched close-group: both are REG_EPAREN. */ 501 if (ret == REG_ERPAREN) 502 ret = REG_EPAREN; 503 504 /* We have already checked preg->fastmap != NULL. */ 505 if (__glibc_likely (ret == REG_NOERROR)) 506 /* Compute the fastmap now, since regexec cannot modify the pattern 507 buffer. This function never fails in this implementation. */ 508 (void) re_compile_fastmap (preg); 509 else 510 { 511 /* Some error occurred while compiling the expression. */ 512 re_free (preg->fastmap); 513 preg->fastmap = NULL; 514 } 515 516 return (int) ret; 517 } 518 #ifdef _LIBC 519 libc_hidden_def (__regcomp) 520 weak_alias (__regcomp, regcomp) 521 #endif 522 523 /* Returns a message corresponding to an error code, ERRCODE, returned 524 from either regcomp or regexec. We don't use PREG here. */ 525 526 size_t 527 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf, 528 size_t errbuf_size) 529 { 530 const char *msg; 531 size_t msg_size; 532 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0]; 533 534 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes)) 535 /* Only error codes returned by the rest of the code should be passed 536 to this routine. If we are given anything else, or if other regex 537 code generates an invalid error code, then the program has a bug. 538 Dump core so we can fix it. */ 539 abort (); 540 541 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 542 543 msg_size = strlen (msg) + 1; /* Includes the null. */ 544 545 if (__glibc_likely (errbuf_size != 0)) 546 { 547 size_t cpy_size = msg_size; 548 if (__glibc_unlikely (msg_size > errbuf_size)) 549 { 550 cpy_size = errbuf_size - 1; 551 errbuf[cpy_size] = '\0'; 552 } 553 memcpy (errbuf, msg, cpy_size); 554 } 555 556 return msg_size; 557 } 558 #ifdef _LIBC 559 weak_alias (__regerror, regerror) 560 #endif 561 562 563 #ifdef RE_ENABLE_I18N 564 /* This static array is used for the map to single-byte characters when 565 UTF-8 is used. Otherwise we would allocate memory just to initialize 566 it the same all the time. UTF-8 is the preferred encoding so this is 567 a worthwhile optimization. */ 568 static const bitset_t utf8_sb_map = 569 { 570 /* Set the first 128 bits. */ 571 # if defined __GNUC__ && !defined __STRICT_ANSI__ 572 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX 573 # else 574 # if 4 * BITSET_WORD_BITS < ASCII_CHARS 575 # error "bitset_word_t is narrower than 32 bits" 576 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS 577 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, 578 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS 579 BITSET_WORD_MAX, BITSET_WORD_MAX, 580 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS 581 BITSET_WORD_MAX, 582 # endif 583 (BITSET_WORD_MAX 584 >> (SBC_MAX % BITSET_WORD_BITS == 0 585 ? 0 586 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) 587 # endif 588 }; 589 #endif 590 591 592 static void 593 free_dfa_content (re_dfa_t *dfa) 594 { 595 Idx i, j; 596 597 if (dfa->nodes) 598 for (i = 0; i < dfa->nodes_len; ++i) 599 free_token (dfa->nodes + i); 600 re_free (dfa->nexts); 601 for (i = 0; i < dfa->nodes_len; ++i) 602 { 603 if (dfa->eclosures != NULL) 604 re_node_set_free (dfa->eclosures + i); 605 if (dfa->inveclosures != NULL) 606 re_node_set_free (dfa->inveclosures + i); 607 if (dfa->edests != NULL) 608 re_node_set_free (dfa->edests + i); 609 } 610 re_free (dfa->edests); 611 re_free (dfa->eclosures); 612 re_free (dfa->inveclosures); 613 re_free (dfa->nodes); 614 615 if (dfa->state_table) 616 for (i = 0; i <= dfa->state_hash_mask; ++i) 617 { 618 struct re_state_table_entry *entry = dfa->state_table + i; 619 for (j = 0; j < entry->num; ++j) 620 { 621 re_dfastate_t *state = entry->array[j]; 622 free_state (state); 623 } 624 re_free (entry->array); 625 } 626 re_free (dfa->state_table); 627 #ifdef RE_ENABLE_I18N 628 if (dfa->sb_char != utf8_sb_map) 629 re_free (dfa->sb_char); 630 #endif 631 re_free (dfa->subexp_map); 632 #ifdef DEBUG 633 re_free (dfa->re_str); 634 #endif 635 636 re_free (dfa); 637 } 638 639 640 /* Free dynamically allocated space used by PREG. */ 641 642 void 643 regfree (regex_t *preg) 644 { 645 re_dfa_t *dfa = preg->buffer; 646 if (__glibc_likely (dfa != NULL)) 647 { 648 lock_fini (dfa->lock); 649 free_dfa_content (dfa); 650 } 651 preg->buffer = NULL; 652 preg->allocated = 0; 653 654 re_free (preg->fastmap); 655 preg->fastmap = NULL; 656 657 re_free (preg->translate); 658 preg->translate = NULL; 659 } 660 #ifdef _LIBC 661 libc_hidden_def (__regfree) 662 weak_alias (__regfree, regfree) 663 #endif 664 665 /* Entry points compatible with 4.2 BSD regex library. We don't define 666 them unless specifically requested. */ 667 668 #if defined _REGEX_RE_COMP || defined _LIBC 669 670 /* BSD has one and only one pattern buffer. */ 671 static struct re_pattern_buffer re_comp_buf; 672 673 char * 674 # ifdef _LIBC 675 /* Make these definitions weak in libc, so POSIX programs can redefine 676 these names if they don't use our functions, and still use 677 regcomp/regexec above without link errors. */ 678 weak_function 679 # endif 680 re_comp (const char *s) 681 { 682 reg_errcode_t ret; 683 char *fastmap; 684 685 if (!s) 686 { 687 if (!re_comp_buf.buffer) 688 return gettext ("No previous regular expression"); 689 return 0; 690 } 691 692 if (re_comp_buf.buffer) 693 { 694 fastmap = re_comp_buf.fastmap; 695 re_comp_buf.fastmap = NULL; 696 __regfree (&re_comp_buf); 697 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 698 re_comp_buf.fastmap = fastmap; 699 } 700 701 if (re_comp_buf.fastmap == NULL) 702 { 703 re_comp_buf.fastmap = re_malloc (char, SBC_MAX); 704 if (re_comp_buf.fastmap == NULL) 705 return (char *) gettext (__re_error_msgid 706 + __re_error_msgid_idx[(int) REG_ESPACE]); 707 } 708 709 /* Since 're_exec' always passes NULL for the 'regs' argument, we 710 don't need to initialize the pattern buffer fields which affect it. */ 711 712 /* Match anchors at newlines. */ 713 re_comp_buf.newline_anchor = 1; 714 715 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 716 717 if (!ret) 718 return NULL; 719 720 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */ 721 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 722 } 723 724 #ifdef _LIBC 725 libc_freeres_fn (free_mem) 726 { 727 __regfree (&re_comp_buf); 728 } 729 #endif 730 731 #endif /* _REGEX_RE_COMP */ 732 733 /* Internal entry point. 734 Compile the regular expression PATTERN, whose length is LENGTH. 735 SYNTAX indicate regular expression's syntax. */ 736 737 static reg_errcode_t 738 re_compile_internal (regex_t *preg, const char * pattern, size_t length, 739 reg_syntax_t syntax) 740 { 741 reg_errcode_t err = REG_NOERROR; 742 re_dfa_t *dfa; 743 re_string_t regexp; 744 745 /* Initialize the pattern buffer. */ 746 preg->fastmap_accurate = 0; 747 preg->syntax = syntax; 748 preg->not_bol = preg->not_eol = 0; 749 preg->used = 0; 750 preg->re_nsub = 0; 751 preg->can_be_null = 0; 752 preg->regs_allocated = REGS_UNALLOCATED; 753 754 /* Initialize the dfa. */ 755 dfa = preg->buffer; 756 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t))) 757 { 758 /* If zero allocated, but buffer is non-null, try to realloc 759 enough space. This loses if buffer's address is bogus, but 760 that is the user's responsibility. If ->buffer is NULL this 761 is a simple allocation. */ 762 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 763 if (dfa == NULL) 764 return REG_ESPACE; 765 preg->allocated = sizeof (re_dfa_t); 766 preg->buffer = dfa; 767 } 768 preg->used = sizeof (re_dfa_t); 769 770 err = init_dfa (dfa, length); 771 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0)) 772 err = REG_ESPACE; 773 if (__glibc_unlikely (err != REG_NOERROR)) 774 { 775 free_dfa_content (dfa); 776 preg->buffer = NULL; 777 preg->allocated = 0; 778 return err; 779 } 780 #ifdef DEBUG 781 /* Note: length+1 will not overflow since it is checked in init_dfa. */ 782 dfa->re_str = re_malloc (char, length + 1); 783 strncpy (dfa->re_str, pattern, length + 1); 784 #endif 785 786 err = re_string_construct (®exp, pattern, length, preg->translate, 787 (syntax & RE_ICASE) != 0, dfa); 788 if (__glibc_unlikely (err != REG_NOERROR)) 789 { 790 re_compile_internal_free_return: 791 free_workarea_compile (preg); 792 re_string_destruct (®exp); 793 lock_fini (dfa->lock); 794 free_dfa_content (dfa); 795 preg->buffer = NULL; 796 preg->allocated = 0; 797 return err; 798 } 799 800 /* Parse the regular expression, and build a structure tree. */ 801 preg->re_nsub = 0; 802 dfa->str_tree = parse (®exp, preg, syntax, &err); 803 if (__glibc_unlikely (dfa->str_tree == NULL)) 804 goto re_compile_internal_free_return; 805 806 /* Analyze the tree and create the nfa. */ 807 err = analyze (preg); 808 if (__glibc_unlikely (err != REG_NOERROR)) 809 goto re_compile_internal_free_return; 810 811 #ifdef RE_ENABLE_I18N 812 /* If possible, do searching in single byte encoding to speed things up. */ 813 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 814 optimize_utf8 (dfa); 815 #endif 816 817 /* Then create the initial state of the dfa. */ 818 err = create_initial_state (dfa); 819 820 /* Release work areas. */ 821 free_workarea_compile (preg); 822 re_string_destruct (®exp); 823 824 if (__glibc_unlikely (err != REG_NOERROR)) 825 { 826 lock_fini (dfa->lock); 827 free_dfa_content (dfa); 828 preg->buffer = NULL; 829 preg->allocated = 0; 830 } 831 832 return err; 833 } 834 835 /* Initialize DFA. We use the length of the regular expression PAT_LEN 836 as the initial length of some arrays. */ 837 838 static reg_errcode_t 839 init_dfa (re_dfa_t *dfa, size_t pat_len) 840 { 841 __re_size_t table_size; 842 #ifndef _LIBC 843 const char *codeset_name; 844 #endif 845 #ifdef RE_ENABLE_I18N 846 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); 847 #else 848 size_t max_i18n_object_size = 0; 849 #endif 850 size_t max_object_size = 851 MAX (sizeof (struct re_state_table_entry), 852 MAX (sizeof (re_token_t), 853 MAX (sizeof (re_node_set), 854 MAX (sizeof (regmatch_t), 855 max_i18n_object_size)))); 856 857 memset (dfa, '\0', sizeof (re_dfa_t)); 858 859 /* Force allocation of str_tree_storage the first time. */ 860 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 861 862 /* Avoid overflows. The extra "/ 2" is for the table_size doubling 863 calculation below, and for similar doubling calculations 864 elsewhere. And it's <= rather than <, because some of the 865 doubling calculations add 1 afterwards. */ 866 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 867 <= pat_len)) 868 return REG_ESPACE; 869 870 dfa->nodes_alloc = pat_len + 1; 871 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 872 873 /* table_size = 2 ^ ceil(log pat_len) */ 874 for (table_size = 1; ; table_size <<= 1) 875 if (table_size > pat_len) 876 break; 877 878 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 879 dfa->state_hash_mask = table_size - 1; 880 881 dfa->mb_cur_max = MB_CUR_MAX; 882 #ifdef _LIBC 883 if (dfa->mb_cur_max == 6 884 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 885 dfa->is_utf8 = 1; 886 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 887 != 0); 888 #else 889 codeset_name = nl_langinfo (CODESET); 890 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u') 891 && (codeset_name[1] == 'T' || codeset_name[1] == 't') 892 && (codeset_name[2] == 'F' || codeset_name[2] == 'f') 893 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0) 894 dfa->is_utf8 = 1; 895 896 /* We check exhaustively in the loop below if this charset is a 897 superset of ASCII. */ 898 dfa->map_notascii = 0; 899 #endif 900 901 #ifdef RE_ENABLE_I18N 902 if (dfa->mb_cur_max > 1) 903 { 904 if (dfa->is_utf8) 905 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 906 else 907 { 908 int i, j, ch; 909 910 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 911 if (__glibc_unlikely (dfa->sb_char == NULL)) 912 return REG_ESPACE; 913 914 /* Set the bits corresponding to single byte chars. */ 915 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 916 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 917 { 918 wint_t wch = __btowc (ch); 919 if (wch != WEOF) 920 dfa->sb_char[i] |= (bitset_word_t) 1 << j; 921 # ifndef _LIBC 922 if (isascii (ch) && wch != ch) 923 dfa->map_notascii = 1; 924 # endif 925 } 926 } 927 } 928 #endif 929 930 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL)) 931 return REG_ESPACE; 932 return REG_NOERROR; 933 } 934 935 /* Initialize WORD_CHAR table, which indicate which character is 936 "word". In this case "word" means that it is the word construction 937 character used by some operators like "\<", "\>", etc. */ 938 939 static void 940 init_word_char (re_dfa_t *dfa) 941 { 942 int i = 0; 943 int j; 944 int ch = 0; 945 dfa->word_ops_used = 1; 946 if (__glibc_likely (dfa->map_notascii == 0)) 947 { 948 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack 949 them, an issue when this code is used in Gnulib. */ 950 bitset_word_t bits0 = 0x00000000; 951 bitset_word_t bits1 = 0x03ff0000; 952 bitset_word_t bits2 = 0x87fffffe; 953 bitset_word_t bits3 = 0x07fffffe; 954 if (BITSET_WORD_BITS == 64) 955 { 956 /* Pacify gcc -Woverflow on 32-bit platformns. */ 957 dfa->word_char[0] = bits1 << 31 << 1 | bits0; 958 dfa->word_char[1] = bits3 << 31 << 1 | bits2; 959 i = 2; 960 } 961 else if (BITSET_WORD_BITS == 32) 962 { 963 dfa->word_char[0] = bits0; 964 dfa->word_char[1] = bits1; 965 dfa->word_char[2] = bits2; 966 dfa->word_char[3] = bits3; 967 i = 4; 968 } 969 else 970 goto general_case; 971 ch = 128; 972 973 if (__glibc_likely (dfa->is_utf8)) 974 { 975 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); 976 return; 977 } 978 } 979 980 general_case: 981 for (; i < BITSET_WORDS; ++i) 982 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 983 if (isalnum (ch) || ch == '_') 984 dfa->word_char[i] |= (bitset_word_t) 1 << j; 985 } 986 987 /* Free the work area which are only used while compiling. */ 988 989 static void 990 free_workarea_compile (regex_t *preg) 991 { 992 re_dfa_t *dfa = preg->buffer; 993 bin_tree_storage_t *storage, *next; 994 for (storage = dfa->str_tree_storage; storage; storage = next) 995 { 996 next = storage->next; 997 re_free (storage); 998 } 999 dfa->str_tree_storage = NULL; 1000 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 1001 dfa->str_tree = NULL; 1002 re_free (dfa->org_indices); 1003 dfa->org_indices = NULL; 1004 } 1005 1006 /* Create initial states for all contexts. */ 1007 1008 static reg_errcode_t 1009 create_initial_state (re_dfa_t *dfa) 1010 { 1011 Idx first, i; 1012 reg_errcode_t err; 1013 re_node_set init_nodes; 1014 1015 /* Initial states have the epsilon closure of the node which is 1016 the first node of the regular expression. */ 1017 first = dfa->str_tree->first->node_idx; 1018 dfa->init_node = first; 1019 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 1020 if (__glibc_unlikely (err != REG_NOERROR)) 1021 return err; 1022 1023 /* The back-references which are in initial states can epsilon transit, 1024 since in this case all of the subexpressions can be null. 1025 Then we add epsilon closures of the nodes which are the next nodes of 1026 the back-references. */ 1027 if (dfa->nbackref > 0) 1028 for (i = 0; i < init_nodes.nelem; ++i) 1029 { 1030 Idx node_idx = init_nodes.elems[i]; 1031 re_token_type_t type = dfa->nodes[node_idx].type; 1032 1033 Idx clexp_idx; 1034 if (type != OP_BACK_REF) 1035 continue; 1036 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1037 { 1038 re_token_t *clexp_node; 1039 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1040 if (clexp_node->type == OP_CLOSE_SUBEXP 1041 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1042 break; 1043 } 1044 if (clexp_idx == init_nodes.nelem) 1045 continue; 1046 1047 if (type == OP_BACK_REF) 1048 { 1049 Idx dest_idx = dfa->edests[node_idx].elems[0]; 1050 if (!re_node_set_contains (&init_nodes, dest_idx)) 1051 { 1052 reg_errcode_t merge_err 1053 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 1054 if (merge_err != REG_NOERROR) 1055 return merge_err; 1056 i = 0; 1057 } 1058 } 1059 } 1060 1061 /* It must be the first time to invoke acquire_state. */ 1062 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1063 /* We don't check ERR here, since the initial state must not be NULL. */ 1064 if (__glibc_unlikely (dfa->init_state == NULL)) 1065 return err; 1066 if (dfa->init_state->has_constraint) 1067 { 1068 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1069 CONTEXT_WORD); 1070 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1071 CONTEXT_NEWLINE); 1072 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1073 &init_nodes, 1074 CONTEXT_NEWLINE 1075 | CONTEXT_BEGBUF); 1076 if (__glibc_unlikely (dfa->init_state_word == NULL 1077 || dfa->init_state_nl == NULL 1078 || dfa->init_state_begbuf == NULL)) 1079 return err; 1080 } 1081 else 1082 dfa->init_state_word = dfa->init_state_nl 1083 = dfa->init_state_begbuf = dfa->init_state; 1084 1085 re_node_set_free (&init_nodes); 1086 return REG_NOERROR; 1087 } 1088 1089 #ifdef RE_ENABLE_I18N 1090 /* If it is possible to do searching in single byte encoding instead of UTF-8 1091 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1092 DFA nodes where needed. */ 1093 1094 static void 1095 optimize_utf8 (re_dfa_t *dfa) 1096 { 1097 Idx node; 1098 int i; 1099 bool mb_chars = false; 1100 bool has_period = false; 1101 1102 for (node = 0; node < dfa->nodes_len; ++node) 1103 switch (dfa->nodes[node].type) 1104 { 1105 case CHARACTER: 1106 if (dfa->nodes[node].opr.c >= ASCII_CHARS) 1107 mb_chars = true; 1108 break; 1109 case ANCHOR: 1110 switch (dfa->nodes[node].opr.ctx_type) 1111 { 1112 case LINE_FIRST: 1113 case LINE_LAST: 1114 case BUF_FIRST: 1115 case BUF_LAST: 1116 break; 1117 default: 1118 /* Word anchors etc. cannot be handled. It's okay to test 1119 opr.ctx_type since constraints (for all DFA nodes) are 1120 created by ORing one or more opr.ctx_type values. */ 1121 return; 1122 } 1123 break; 1124 case OP_PERIOD: 1125 has_period = true; 1126 break; 1127 case OP_BACK_REF: 1128 case OP_ALT: 1129 case END_OF_RE: 1130 case OP_DUP_ASTERISK: 1131 case OP_OPEN_SUBEXP: 1132 case OP_CLOSE_SUBEXP: 1133 break; 1134 case COMPLEX_BRACKET: 1135 return; 1136 case SIMPLE_BRACKET: 1137 /* Just double check. */ 1138 { 1139 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0 1140 ? 0 1141 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS); 1142 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1143 { 1144 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) 1145 return; 1146 rshift = 0; 1147 } 1148 } 1149 break; 1150 default: 1151 abort (); 1152 } 1153 1154 if (mb_chars || has_period) 1155 for (node = 0; node < dfa->nodes_len; ++node) 1156 { 1157 if (dfa->nodes[node].type == CHARACTER 1158 && dfa->nodes[node].opr.c >= ASCII_CHARS) 1159 dfa->nodes[node].mb_partial = 0; 1160 else if (dfa->nodes[node].type == OP_PERIOD) 1161 dfa->nodes[node].type = OP_UTF8_PERIOD; 1162 } 1163 1164 /* The search can be in single byte locale. */ 1165 dfa->mb_cur_max = 1; 1166 dfa->is_utf8 = 0; 1167 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1168 } 1169 #endif 1170 1171 /* Analyze the structure tree, and calculate "first", "next", "edest", 1172 "eclosure", and "inveclosure". */ 1173 1174 static reg_errcode_t 1175 analyze (regex_t *preg) 1176 { 1177 re_dfa_t *dfa = preg->buffer; 1178 reg_errcode_t ret; 1179 1180 /* Allocate arrays. */ 1181 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); 1182 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); 1183 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1184 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1185 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL 1186 || dfa->edests == NULL || dfa->eclosures == NULL)) 1187 return REG_ESPACE; 1188 1189 dfa->subexp_map = re_malloc (Idx, preg->re_nsub); 1190 if (dfa->subexp_map != NULL) 1191 { 1192 Idx i; 1193 for (i = 0; i < preg->re_nsub; i++) 1194 dfa->subexp_map[i] = i; 1195 preorder (dfa->str_tree, optimize_subexps, dfa); 1196 for (i = 0; i < preg->re_nsub; i++) 1197 if (dfa->subexp_map[i] != i) 1198 break; 1199 if (i == preg->re_nsub) 1200 { 1201 re_free (dfa->subexp_map); 1202 dfa->subexp_map = NULL; 1203 } 1204 } 1205 1206 ret = postorder (dfa->str_tree, lower_subexps, preg); 1207 if (__glibc_unlikely (ret != REG_NOERROR)) 1208 return ret; 1209 ret = postorder (dfa->str_tree, calc_first, dfa); 1210 if (__glibc_unlikely (ret != REG_NOERROR)) 1211 return ret; 1212 preorder (dfa->str_tree, calc_next, dfa); 1213 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1214 if (__glibc_unlikely (ret != REG_NOERROR)) 1215 return ret; 1216 ret = calc_eclosure (dfa); 1217 if (__glibc_unlikely (ret != REG_NOERROR)) 1218 return ret; 1219 1220 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1221 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1222 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1223 || dfa->nbackref) 1224 { 1225 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1226 if (__glibc_unlikely (dfa->inveclosures == NULL)) 1227 return REG_ESPACE; 1228 ret = calc_inveclosure (dfa); 1229 } 1230 1231 return ret; 1232 } 1233 1234 /* Our parse trees are very unbalanced, so we cannot use a stack to 1235 implement parse tree visits. Instead, we use parent pointers and 1236 some hairy code in these two functions. */ 1237 static reg_errcode_t 1238 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1239 void *extra) 1240 { 1241 bin_tree_t *node, *prev; 1242 1243 for (node = root; ; ) 1244 { 1245 /* Descend down the tree, preferably to the left (or to the right 1246 if that's the only child). */ 1247 while (node->left || node->right) 1248 if (node->left) 1249 node = node->left; 1250 else 1251 node = node->right; 1252 1253 do 1254 { 1255 reg_errcode_t err = fn (extra, node); 1256 if (__glibc_unlikely (err != REG_NOERROR)) 1257 return err; 1258 if (node->parent == NULL) 1259 return REG_NOERROR; 1260 prev = node; 1261 node = node->parent; 1262 } 1263 /* Go up while we have a node that is reached from the right. */ 1264 while (node->right == prev || node->right == NULL); 1265 node = node->right; 1266 } 1267 } 1268 1269 static reg_errcode_t 1270 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1271 void *extra) 1272 { 1273 bin_tree_t *node; 1274 1275 for (node = root; ; ) 1276 { 1277 reg_errcode_t err = fn (extra, node); 1278 if (__glibc_unlikely (err != REG_NOERROR)) 1279 return err; 1280 1281 /* Go to the left node, or up and to the right. */ 1282 if (node->left) 1283 node = node->left; 1284 else 1285 { 1286 bin_tree_t *prev = NULL; 1287 while (node->right == prev || node->right == NULL) 1288 { 1289 prev = node; 1290 node = node->parent; 1291 if (!node) 1292 return REG_NOERROR; 1293 } 1294 node = node->right; 1295 } 1296 } 1297 } 1298 1299 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1300 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1301 backreferences as well. Requires a preorder visit. */ 1302 static reg_errcode_t 1303 optimize_subexps (void *extra, bin_tree_t *node) 1304 { 1305 re_dfa_t *dfa = (re_dfa_t *) extra; 1306 1307 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1308 { 1309 int idx = node->token.opr.idx; 1310 node->token.opr.idx = dfa->subexp_map[idx]; 1311 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1312 } 1313 1314 else if (node->token.type == SUBEXP 1315 && node->left && node->left->token.type == SUBEXP) 1316 { 1317 Idx other_idx = node->left->token.opr.idx; 1318 1319 node->left = node->left->left; 1320 if (node->left) 1321 node->left->parent = node; 1322 1323 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1324 if (other_idx < BITSET_WORD_BITS) 1325 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1326 } 1327 1328 return REG_NOERROR; 1329 } 1330 1331 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1332 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1333 static reg_errcode_t 1334 lower_subexps (void *extra, bin_tree_t *node) 1335 { 1336 regex_t *preg = (regex_t *) extra; 1337 reg_errcode_t err = REG_NOERROR; 1338 1339 if (node->left && node->left->token.type == SUBEXP) 1340 { 1341 node->left = lower_subexp (&err, preg, node->left); 1342 if (node->left) 1343 node->left->parent = node; 1344 } 1345 if (node->right && node->right->token.type == SUBEXP) 1346 { 1347 node->right = lower_subexp (&err, preg, node->right); 1348 if (node->right) 1349 node->right->parent = node; 1350 } 1351 1352 return err; 1353 } 1354 1355 static bin_tree_t * 1356 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1357 { 1358 re_dfa_t *dfa = preg->buffer; 1359 bin_tree_t *body = node->left; 1360 bin_tree_t *op, *cls, *tree1, *tree; 1361 1362 if (preg->no_sub 1363 /* We do not optimize empty subexpressions, because otherwise we may 1364 have bad CONCAT nodes with NULL children. This is obviously not 1365 very common, so we do not lose much. An example that triggers 1366 this case is the sed "script" /\(\)/x. */ 1367 && node->left != NULL 1368 && (node->token.opr.idx >= BITSET_WORD_BITS 1369 || !(dfa->used_bkref_map 1370 & ((bitset_word_t) 1 << node->token.opr.idx)))) 1371 return node->left; 1372 1373 /* Convert the SUBEXP node to the concatenation of an 1374 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1375 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1376 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1377 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1378 tree = create_tree (dfa, op, tree1, CONCAT); 1379 if (__glibc_unlikely (tree == NULL || tree1 == NULL 1380 || op == NULL || cls == NULL)) 1381 { 1382 *err = REG_ESPACE; 1383 return NULL; 1384 } 1385 1386 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1387 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1388 return tree; 1389 } 1390 1391 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1392 nodes. Requires a postorder visit. */ 1393 static reg_errcode_t 1394 calc_first (void *extra, bin_tree_t *node) 1395 { 1396 re_dfa_t *dfa = (re_dfa_t *) extra; 1397 if (node->token.type == CONCAT) 1398 { 1399 node->first = node->left->first; 1400 node->node_idx = node->left->node_idx; 1401 } 1402 else 1403 { 1404 node->first = node; 1405 node->node_idx = re_dfa_add_node (dfa, node->token); 1406 if (__glibc_unlikely (node->node_idx == -1)) 1407 return REG_ESPACE; 1408 if (node->token.type == ANCHOR) 1409 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1410 } 1411 return REG_NOERROR; 1412 } 1413 1414 /* Pass 2: compute NEXT on the tree. Preorder visit. */ 1415 static reg_errcode_t 1416 calc_next (void *extra, bin_tree_t *node) 1417 { 1418 switch (node->token.type) 1419 { 1420 case OP_DUP_ASTERISK: 1421 node->left->next = node; 1422 break; 1423 case CONCAT: 1424 node->left->next = node->right->first; 1425 node->right->next = node->next; 1426 break; 1427 default: 1428 if (node->left) 1429 node->left->next = node->next; 1430 if (node->right) 1431 node->right->next = node->next; 1432 break; 1433 } 1434 return REG_NOERROR; 1435 } 1436 1437 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1438 static reg_errcode_t 1439 link_nfa_nodes (void *extra, bin_tree_t *node) 1440 { 1441 re_dfa_t *dfa = (re_dfa_t *) extra; 1442 Idx idx = node->node_idx; 1443 reg_errcode_t err = REG_NOERROR; 1444 1445 switch (node->token.type) 1446 { 1447 case CONCAT: 1448 break; 1449 1450 case END_OF_RE: 1451 assert (node->next == NULL); 1452 break; 1453 1454 case OP_DUP_ASTERISK: 1455 case OP_ALT: 1456 { 1457 Idx left, right; 1458 dfa->has_plural_match = 1; 1459 if (node->left != NULL) 1460 left = node->left->first->node_idx; 1461 else 1462 left = node->next->node_idx; 1463 if (node->right != NULL) 1464 right = node->right->first->node_idx; 1465 else 1466 right = node->next->node_idx; 1467 assert (left > -1); 1468 assert (right > -1); 1469 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1470 } 1471 break; 1472 1473 case ANCHOR: 1474 case OP_OPEN_SUBEXP: 1475 case OP_CLOSE_SUBEXP: 1476 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1477 break; 1478 1479 case OP_BACK_REF: 1480 dfa->nexts[idx] = node->next->node_idx; 1481 if (node->token.type == OP_BACK_REF) 1482 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1483 break; 1484 1485 default: 1486 assert (!IS_EPSILON_NODE (node->token.type)); 1487 dfa->nexts[idx] = node->next->node_idx; 1488 break; 1489 } 1490 1491 return err; 1492 } 1493 1494 /* Duplicate the epsilon closure of the node ROOT_NODE. 1495 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1496 to their own constraint. */ 1497 1498 static reg_errcode_t 1499 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, 1500 Idx root_node, unsigned int init_constraint) 1501 { 1502 Idx org_node, clone_node; 1503 bool ok; 1504 unsigned int constraint = init_constraint; 1505 for (org_node = top_org_node, clone_node = top_clone_node;;) 1506 { 1507 Idx org_dest, clone_dest; 1508 if (dfa->nodes[org_node].type == OP_BACK_REF) 1509 { 1510 /* If the back reference epsilon-transit, its destination must 1511 also have the constraint. Then duplicate the epsilon closure 1512 of the destination of the back reference, and store it in 1513 edests of the back reference. */ 1514 org_dest = dfa->nexts[org_node]; 1515 re_node_set_empty (dfa->edests + clone_node); 1516 clone_dest = duplicate_node (dfa, org_dest, constraint); 1517 if (__glibc_unlikely (clone_dest == -1)) 1518 return REG_ESPACE; 1519 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1520 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1521 if (__glibc_unlikely (! ok)) 1522 return REG_ESPACE; 1523 } 1524 else if (dfa->edests[org_node].nelem == 0) 1525 { 1526 /* In case of the node can't epsilon-transit, don't duplicate the 1527 destination and store the original destination as the 1528 destination of the node. */ 1529 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1530 break; 1531 } 1532 else if (dfa->edests[org_node].nelem == 1) 1533 { 1534 /* In case of the node can epsilon-transit, and it has only one 1535 destination. */ 1536 org_dest = dfa->edests[org_node].elems[0]; 1537 re_node_set_empty (dfa->edests + clone_node); 1538 /* If the node is root_node itself, it means the epsilon closure 1539 has a loop. Then tie it to the destination of the root_node. */ 1540 if (org_node == root_node && clone_node != org_node) 1541 { 1542 ok = re_node_set_insert (dfa->edests + clone_node, org_dest); 1543 if (__glibc_unlikely (! ok)) 1544 return REG_ESPACE; 1545 break; 1546 } 1547 /* In case the node has another constraint, append it. */ 1548 constraint |= dfa->nodes[org_node].constraint; 1549 clone_dest = duplicate_node (dfa, org_dest, constraint); 1550 if (__glibc_unlikely (clone_dest == -1)) 1551 return REG_ESPACE; 1552 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1553 if (__glibc_unlikely (! ok)) 1554 return REG_ESPACE; 1555 } 1556 else /* dfa->edests[org_node].nelem == 2 */ 1557 { 1558 /* In case of the node can epsilon-transit, and it has two 1559 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1560 org_dest = dfa->edests[org_node].elems[0]; 1561 re_node_set_empty (dfa->edests + clone_node); 1562 /* Search for a duplicated node which satisfies the constraint. */ 1563 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1564 if (clone_dest == -1) 1565 { 1566 /* There is no such duplicated node, create a new one. */ 1567 reg_errcode_t err; 1568 clone_dest = duplicate_node (dfa, org_dest, constraint); 1569 if (__glibc_unlikely (clone_dest == -1)) 1570 return REG_ESPACE; 1571 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1572 if (__glibc_unlikely (! ok)) 1573 return REG_ESPACE; 1574 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1575 root_node, constraint); 1576 if (__glibc_unlikely (err != REG_NOERROR)) 1577 return err; 1578 } 1579 else 1580 { 1581 /* There is a duplicated node which satisfies the constraint, 1582 use it to avoid infinite loop. */ 1583 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1584 if (__glibc_unlikely (! ok)) 1585 return REG_ESPACE; 1586 } 1587 1588 org_dest = dfa->edests[org_node].elems[1]; 1589 clone_dest = duplicate_node (dfa, org_dest, constraint); 1590 if (__glibc_unlikely (clone_dest == -1)) 1591 return REG_ESPACE; 1592 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1593 if (__glibc_unlikely (! ok)) 1594 return REG_ESPACE; 1595 } 1596 org_node = org_dest; 1597 clone_node = clone_dest; 1598 } 1599 return REG_NOERROR; 1600 } 1601 1602 /* Search for a node which is duplicated from the node ORG_NODE, and 1603 satisfies the constraint CONSTRAINT. */ 1604 1605 static Idx 1606 search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 1607 unsigned int constraint) 1608 { 1609 Idx idx; 1610 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1611 { 1612 if (org_node == dfa->org_indices[idx] 1613 && constraint == dfa->nodes[idx].constraint) 1614 return idx; /* Found. */ 1615 } 1616 return -1; /* Not found. */ 1617 } 1618 1619 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1620 Return the index of the new node, or -1 if insufficient storage is 1621 available. */ 1622 1623 static Idx 1624 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) 1625 { 1626 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1627 if (__glibc_likely (dup_idx != -1)) 1628 { 1629 dfa->nodes[dup_idx].constraint = constraint; 1630 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1631 dfa->nodes[dup_idx].duplicated = 1; 1632 1633 /* Store the index of the original node. */ 1634 dfa->org_indices[dup_idx] = org_idx; 1635 } 1636 return dup_idx; 1637 } 1638 1639 static reg_errcode_t 1640 calc_inveclosure (re_dfa_t *dfa) 1641 { 1642 Idx src, idx; 1643 bool ok; 1644 for (idx = 0; idx < dfa->nodes_len; ++idx) 1645 re_node_set_init_empty (dfa->inveclosures + idx); 1646 1647 for (src = 0; src < dfa->nodes_len; ++src) 1648 { 1649 Idx *elems = dfa->eclosures[src].elems; 1650 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1651 { 1652 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1653 if (__glibc_unlikely (! ok)) 1654 return REG_ESPACE; 1655 } 1656 } 1657 1658 return REG_NOERROR; 1659 } 1660 1661 /* Calculate "eclosure" for all the node in DFA. */ 1662 1663 static reg_errcode_t 1664 calc_eclosure (re_dfa_t *dfa) 1665 { 1666 Idx node_idx; 1667 bool incomplete; 1668 #ifdef DEBUG 1669 assert (dfa->nodes_len > 0); 1670 #endif 1671 incomplete = false; 1672 /* For each nodes, calculate epsilon closure. */ 1673 for (node_idx = 0; ; ++node_idx) 1674 { 1675 reg_errcode_t err; 1676 re_node_set eclosure_elem; 1677 if (node_idx == dfa->nodes_len) 1678 { 1679 if (!incomplete) 1680 break; 1681 incomplete = false; 1682 node_idx = 0; 1683 } 1684 1685 #ifdef DEBUG 1686 assert (dfa->eclosures[node_idx].nelem != -1); 1687 #endif 1688 1689 /* If we have already calculated, skip it. */ 1690 if (dfa->eclosures[node_idx].nelem != 0) 1691 continue; 1692 /* Calculate epsilon closure of 'node_idx'. */ 1693 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); 1694 if (__glibc_unlikely (err != REG_NOERROR)) 1695 return err; 1696 1697 if (dfa->eclosures[node_idx].nelem == 0) 1698 { 1699 incomplete = true; 1700 re_node_set_free (&eclosure_elem); 1701 } 1702 } 1703 return REG_NOERROR; 1704 } 1705 1706 /* Calculate epsilon closure of NODE. */ 1707 1708 static reg_errcode_t 1709 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) 1710 { 1711 reg_errcode_t err; 1712 Idx i; 1713 re_node_set eclosure; 1714 bool ok; 1715 bool incomplete = false; 1716 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1717 if (__glibc_unlikely (err != REG_NOERROR)) 1718 return err; 1719 1720 /* This indicates that we are calculating this node now. 1721 We reference this value to avoid infinite loop. */ 1722 dfa->eclosures[node].nelem = -1; 1723 1724 /* If the current node has constraints, duplicate all nodes 1725 since they must inherit the constraints. */ 1726 if (dfa->nodes[node].constraint 1727 && dfa->edests[node].nelem 1728 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1729 { 1730 err = duplicate_node_closure (dfa, node, node, node, 1731 dfa->nodes[node].constraint); 1732 if (__glibc_unlikely (err != REG_NOERROR)) 1733 return err; 1734 } 1735 1736 /* Expand each epsilon destination nodes. */ 1737 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1738 for (i = 0; i < dfa->edests[node].nelem; ++i) 1739 { 1740 re_node_set eclosure_elem; 1741 Idx edest = dfa->edests[node].elems[i]; 1742 /* If calculating the epsilon closure of 'edest' is in progress, 1743 return intermediate result. */ 1744 if (dfa->eclosures[edest].nelem == -1) 1745 { 1746 incomplete = true; 1747 continue; 1748 } 1749 /* If we haven't calculated the epsilon closure of 'edest' yet, 1750 calculate now. Otherwise use calculated epsilon closure. */ 1751 if (dfa->eclosures[edest].nelem == 0) 1752 { 1753 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); 1754 if (__glibc_unlikely (err != REG_NOERROR)) 1755 return err; 1756 } 1757 else 1758 eclosure_elem = dfa->eclosures[edest]; 1759 /* Merge the epsilon closure of 'edest'. */ 1760 err = re_node_set_merge (&eclosure, &eclosure_elem); 1761 if (__glibc_unlikely (err != REG_NOERROR)) 1762 return err; 1763 /* If the epsilon closure of 'edest' is incomplete, 1764 the epsilon closure of this node is also incomplete. */ 1765 if (dfa->eclosures[edest].nelem == 0) 1766 { 1767 incomplete = true; 1768 re_node_set_free (&eclosure_elem); 1769 } 1770 } 1771 1772 /* An epsilon closure includes itself. */ 1773 ok = re_node_set_insert (&eclosure, node); 1774 if (__glibc_unlikely (! ok)) 1775 return REG_ESPACE; 1776 if (incomplete && !root) 1777 dfa->eclosures[node].nelem = 0; 1778 else 1779 dfa->eclosures[node] = eclosure; 1780 *new_set = eclosure; 1781 return REG_NOERROR; 1782 } 1783 1784 /* Functions for token which are used in the parser. */ 1785 1786 /* Fetch a token from INPUT. 1787 We must not use this function inside bracket expressions. */ 1788 1789 static void 1790 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1791 { 1792 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1793 } 1794 1795 /* Peek a token from INPUT, and return the length of the token. 1796 We must not use this function inside bracket expressions. */ 1797 1798 static int 1799 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1800 { 1801 unsigned char c; 1802 1803 if (re_string_eoi (input)) 1804 { 1805 token->type = END_OF_RE; 1806 return 0; 1807 } 1808 1809 c = re_string_peek_byte (input, 0); 1810 token->opr.c = c; 1811 1812 token->word_char = 0; 1813 #ifdef RE_ENABLE_I18N 1814 token->mb_partial = 0; 1815 if (input->mb_cur_max > 1 && 1816 !re_string_first_byte (input, re_string_cur_idx (input))) 1817 { 1818 token->type = CHARACTER; 1819 token->mb_partial = 1; 1820 return 1; 1821 } 1822 #endif 1823 if (c == '\\') 1824 { 1825 unsigned char c2; 1826 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1827 { 1828 token->type = BACK_SLASH; 1829 return 1; 1830 } 1831 1832 c2 = re_string_peek_byte_case (input, 1); 1833 token->opr.c = c2; 1834 token->type = CHARACTER; 1835 #ifdef RE_ENABLE_I18N 1836 if (input->mb_cur_max > 1) 1837 { 1838 wint_t wc = re_string_wchar_at (input, 1839 re_string_cur_idx (input) + 1); 1840 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1841 } 1842 else 1843 #endif 1844 token->word_char = IS_WORD_CHAR (c2) != 0; 1845 1846 switch (c2) 1847 { 1848 case '|': 1849 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1850 token->type = OP_ALT; 1851 break; 1852 case '1': case '2': case '3': case '4': case '5': 1853 case '6': case '7': case '8': case '9': 1854 if (!(syntax & RE_NO_BK_REFS)) 1855 { 1856 token->type = OP_BACK_REF; 1857 token->opr.idx = c2 - '1'; 1858 } 1859 break; 1860 case '<': 1861 if (!(syntax & RE_NO_GNU_OPS)) 1862 { 1863 token->type = ANCHOR; 1864 token->opr.ctx_type = WORD_FIRST; 1865 } 1866 break; 1867 case '>': 1868 if (!(syntax & RE_NO_GNU_OPS)) 1869 { 1870 token->type = ANCHOR; 1871 token->opr.ctx_type = WORD_LAST; 1872 } 1873 break; 1874 case 'b': 1875 if (!(syntax & RE_NO_GNU_OPS)) 1876 { 1877 token->type = ANCHOR; 1878 token->opr.ctx_type = WORD_DELIM; 1879 } 1880 break; 1881 case 'B': 1882 if (!(syntax & RE_NO_GNU_OPS)) 1883 { 1884 token->type = ANCHOR; 1885 token->opr.ctx_type = NOT_WORD_DELIM; 1886 } 1887 break; 1888 case 'w': 1889 if (!(syntax & RE_NO_GNU_OPS)) 1890 token->type = OP_WORD; 1891 break; 1892 case 'W': 1893 if (!(syntax & RE_NO_GNU_OPS)) 1894 token->type = OP_NOTWORD; 1895 break; 1896 case 's': 1897 if (!(syntax & RE_NO_GNU_OPS)) 1898 token->type = OP_SPACE; 1899 break; 1900 case 'S': 1901 if (!(syntax & RE_NO_GNU_OPS)) 1902 token->type = OP_NOTSPACE; 1903 break; 1904 case '`': 1905 if (!(syntax & RE_NO_GNU_OPS)) 1906 { 1907 token->type = ANCHOR; 1908 token->opr.ctx_type = BUF_FIRST; 1909 } 1910 break; 1911 case '\'': 1912 if (!(syntax & RE_NO_GNU_OPS)) 1913 { 1914 token->type = ANCHOR; 1915 token->opr.ctx_type = BUF_LAST; 1916 } 1917 break; 1918 case '(': 1919 if (!(syntax & RE_NO_BK_PARENS)) 1920 token->type = OP_OPEN_SUBEXP; 1921 break; 1922 case ')': 1923 if (!(syntax & RE_NO_BK_PARENS)) 1924 token->type = OP_CLOSE_SUBEXP; 1925 break; 1926 case '+': 1927 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1928 token->type = OP_DUP_PLUS; 1929 break; 1930 case '?': 1931 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1932 token->type = OP_DUP_QUESTION; 1933 break; 1934 case '{': 1935 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1936 token->type = OP_OPEN_DUP_NUM; 1937 break; 1938 case '}': 1939 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1940 token->type = OP_CLOSE_DUP_NUM; 1941 break; 1942 default: 1943 break; 1944 } 1945 return 2; 1946 } 1947 1948 token->type = CHARACTER; 1949 #ifdef RE_ENABLE_I18N 1950 if (input->mb_cur_max > 1) 1951 { 1952 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1953 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1954 } 1955 else 1956 #endif 1957 token->word_char = IS_WORD_CHAR (token->opr.c); 1958 1959 switch (c) 1960 { 1961 case '\n': 1962 if (syntax & RE_NEWLINE_ALT) 1963 token->type = OP_ALT; 1964 break; 1965 case '|': 1966 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1967 token->type = OP_ALT; 1968 break; 1969 case '*': 1970 token->type = OP_DUP_ASTERISK; 1971 break; 1972 case '+': 1973 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1974 token->type = OP_DUP_PLUS; 1975 break; 1976 case '?': 1977 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1978 token->type = OP_DUP_QUESTION; 1979 break; 1980 case '{': 1981 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1982 token->type = OP_OPEN_DUP_NUM; 1983 break; 1984 case '}': 1985 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1986 token->type = OP_CLOSE_DUP_NUM; 1987 break; 1988 case '(': 1989 if (syntax & RE_NO_BK_PARENS) 1990 token->type = OP_OPEN_SUBEXP; 1991 break; 1992 case ')': 1993 if (syntax & RE_NO_BK_PARENS) 1994 token->type = OP_CLOSE_SUBEXP; 1995 break; 1996 case '[': 1997 token->type = OP_OPEN_BRACKET; 1998 break; 1999 case '.': 2000 token->type = OP_PERIOD; 2001 break; 2002 case '^': 2003 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 2004 re_string_cur_idx (input) != 0) 2005 { 2006 char prev = re_string_peek_byte (input, -1); 2007 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 2008 break; 2009 } 2010 token->type = ANCHOR; 2011 token->opr.ctx_type = LINE_FIRST; 2012 break; 2013 case '$': 2014 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 2015 re_string_cur_idx (input) + 1 != re_string_length (input)) 2016 { 2017 re_token_t next; 2018 re_string_skip_bytes (input, 1); 2019 peek_token (&next, input, syntax); 2020 re_string_skip_bytes (input, -1); 2021 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 2022 break; 2023 } 2024 token->type = ANCHOR; 2025 token->opr.ctx_type = LINE_LAST; 2026 break; 2027 default: 2028 break; 2029 } 2030 return 1; 2031 } 2032 2033 /* Peek a token from INPUT, and return the length of the token. 2034 We must not use this function out of bracket expressions. */ 2035 2036 static int 2037 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 2038 { 2039 unsigned char c; 2040 if (re_string_eoi (input)) 2041 { 2042 token->type = END_OF_RE; 2043 return 0; 2044 } 2045 c = re_string_peek_byte (input, 0); 2046 token->opr.c = c; 2047 2048 #ifdef RE_ENABLE_I18N 2049 if (input->mb_cur_max > 1 && 2050 !re_string_first_byte (input, re_string_cur_idx (input))) 2051 { 2052 token->type = CHARACTER; 2053 return 1; 2054 } 2055 #endif /* RE_ENABLE_I18N */ 2056 2057 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2058 && re_string_cur_idx (input) + 1 < re_string_length (input)) 2059 { 2060 /* In this case, '\' escape a character. */ 2061 unsigned char c2; 2062 re_string_skip_bytes (input, 1); 2063 c2 = re_string_peek_byte (input, 0); 2064 token->opr.c = c2; 2065 token->type = CHARACTER; 2066 return 1; 2067 } 2068 if (c == '[') /* '[' is a special char in a bracket exps. */ 2069 { 2070 unsigned char c2; 2071 int token_len; 2072 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2073 c2 = re_string_peek_byte (input, 1); 2074 else 2075 c2 = 0; 2076 token->opr.c = c2; 2077 token_len = 2; 2078 switch (c2) 2079 { 2080 case '.': 2081 token->type = OP_OPEN_COLL_ELEM; 2082 break; 2083 2084 case '=': 2085 token->type = OP_OPEN_EQUIV_CLASS; 2086 break; 2087 2088 case ':': 2089 if (syntax & RE_CHAR_CLASSES) 2090 { 2091 token->type = OP_OPEN_CHAR_CLASS; 2092 break; 2093 } 2094 FALLTHROUGH; 2095 default: 2096 token->type = CHARACTER; 2097 token->opr.c = c; 2098 token_len = 1; 2099 break; 2100 } 2101 return token_len; 2102 } 2103 switch (c) 2104 { 2105 case '-': 2106 token->type = OP_CHARSET_RANGE; 2107 break; 2108 case ']': 2109 token->type = OP_CLOSE_BRACKET; 2110 break; 2111 case '^': 2112 token->type = OP_NON_MATCH_LIST; 2113 break; 2114 default: 2115 token->type = CHARACTER; 2116 } 2117 return 1; 2118 } 2119 2120 /* Functions for parser. */ 2121 2122 /* Entry point of the parser. 2123 Parse the regular expression REGEXP and return the structure tree. 2124 If an error occurs, ERR is set by error code, and return NULL. 2125 This function build the following tree, from regular expression <reg_exp>: 2126 CAT 2127 / \ 2128 / \ 2129 <reg_exp> EOR 2130 2131 CAT means concatenation. 2132 EOR means end of regular expression. */ 2133 2134 static bin_tree_t * 2135 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2136 reg_errcode_t *err) 2137 { 2138 re_dfa_t *dfa = preg->buffer; 2139 bin_tree_t *tree, *eor, *root; 2140 re_token_t current_token; 2141 dfa->syntax = syntax; 2142 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2143 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2144 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) 2145 return NULL; 2146 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2147 if (tree != NULL) 2148 root = create_tree (dfa, tree, eor, CONCAT); 2149 else 2150 root = eor; 2151 if (__glibc_unlikely (eor == NULL || root == NULL)) 2152 { 2153 *err = REG_ESPACE; 2154 return NULL; 2155 } 2156 return root; 2157 } 2158 2159 /* This function build the following tree, from regular expression 2160 <branch1>|<branch2>: 2161 ALT 2162 / \ 2163 / \ 2164 <branch1> <branch2> 2165 2166 ALT means alternative, which represents the operator '|'. */ 2167 2168 static bin_tree_t * 2169 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2170 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2171 { 2172 re_dfa_t *dfa = preg->buffer; 2173 bin_tree_t *tree, *branch = NULL; 2174 bitset_word_t initial_bkref_map = dfa->completed_bkref_map; 2175 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2176 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) 2177 return NULL; 2178 2179 while (token->type == OP_ALT) 2180 { 2181 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2182 if (token->type != OP_ALT && token->type != END_OF_RE 2183 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2184 { 2185 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map; 2186 dfa->completed_bkref_map = initial_bkref_map; 2187 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2188 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL)) 2189 { 2190 if (tree != NULL) 2191 postorder (tree, free_tree, NULL); 2192 return NULL; 2193 } 2194 dfa->completed_bkref_map |= accumulated_bkref_map; 2195 } 2196 else 2197 branch = NULL; 2198 tree = create_tree (dfa, tree, branch, OP_ALT); 2199 if (__glibc_unlikely (tree == NULL)) 2200 { 2201 *err = REG_ESPACE; 2202 return NULL; 2203 } 2204 } 2205 return tree; 2206 } 2207 2208 /* This function build the following tree, from regular expression 2209 <exp1><exp2>: 2210 CAT 2211 / \ 2212 / \ 2213 <exp1> <exp2> 2214 2215 CAT means concatenation. */ 2216 2217 static bin_tree_t * 2218 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2219 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2220 { 2221 bin_tree_t *tree, *expr; 2222 re_dfa_t *dfa = preg->buffer; 2223 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2224 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) 2225 return NULL; 2226 2227 while (token->type != OP_ALT && token->type != END_OF_RE 2228 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2229 { 2230 expr = parse_expression (regexp, preg, token, syntax, nest, err); 2231 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL)) 2232 { 2233 if (tree != NULL) 2234 postorder (tree, free_tree, NULL); 2235 return NULL; 2236 } 2237 if (tree != NULL && expr != NULL) 2238 { 2239 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT); 2240 if (newtree == NULL) 2241 { 2242 postorder (expr, free_tree, NULL); 2243 postorder (tree, free_tree, NULL); 2244 *err = REG_ESPACE; 2245 return NULL; 2246 } 2247 tree = newtree; 2248 } 2249 else if (tree == NULL) 2250 tree = expr; 2251 /* Otherwise expr == NULL, we don't need to create new tree. */ 2252 } 2253 return tree; 2254 } 2255 2256 /* This function build the following tree, from regular expression a*: 2257 * 2258 | 2259 a 2260 */ 2261 2262 static bin_tree_t * 2263 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2264 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2265 { 2266 re_dfa_t *dfa = preg->buffer; 2267 bin_tree_t *tree; 2268 switch (token->type) 2269 { 2270 case CHARACTER: 2271 tree = create_token_tree (dfa, NULL, NULL, token); 2272 if (__glibc_unlikely (tree == NULL)) 2273 { 2274 *err = REG_ESPACE; 2275 return NULL; 2276 } 2277 #ifdef RE_ENABLE_I18N 2278 if (dfa->mb_cur_max > 1) 2279 { 2280 while (!re_string_eoi (regexp) 2281 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2282 { 2283 bin_tree_t *mbc_remain; 2284 fetch_token (token, regexp, syntax); 2285 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2286 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2287 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL)) 2288 { 2289 *err = REG_ESPACE; 2290 return NULL; 2291 } 2292 } 2293 } 2294 #endif 2295 break; 2296 2297 case OP_OPEN_SUBEXP: 2298 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2299 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) 2300 return NULL; 2301 break; 2302 2303 case OP_OPEN_BRACKET: 2304 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2305 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) 2306 return NULL; 2307 break; 2308 2309 case OP_BACK_REF: 2310 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx))) 2311 { 2312 *err = REG_ESUBREG; 2313 return NULL; 2314 } 2315 dfa->used_bkref_map |= 1 << token->opr.idx; 2316 tree = create_token_tree (dfa, NULL, NULL, token); 2317 if (__glibc_unlikely (tree == NULL)) 2318 { 2319 *err = REG_ESPACE; 2320 return NULL; 2321 } 2322 ++dfa->nbackref; 2323 dfa->has_mb_node = 1; 2324 break; 2325 2326 case OP_OPEN_DUP_NUM: 2327 if (syntax & RE_CONTEXT_INVALID_DUP) 2328 { 2329 *err = REG_BADRPT; 2330 return NULL; 2331 } 2332 FALLTHROUGH; 2333 case OP_DUP_ASTERISK: 2334 case OP_DUP_PLUS: 2335 case OP_DUP_QUESTION: 2336 if (syntax & RE_CONTEXT_INVALID_OPS) 2337 { 2338 *err = REG_BADRPT; 2339 return NULL; 2340 } 2341 else if (syntax & RE_CONTEXT_INDEP_OPS) 2342 { 2343 fetch_token (token, regexp, syntax); 2344 return parse_expression (regexp, preg, token, syntax, nest, err); 2345 } 2346 FALLTHROUGH; 2347 case OP_CLOSE_SUBEXP: 2348 if ((token->type == OP_CLOSE_SUBEXP) && 2349 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2350 { 2351 *err = REG_ERPAREN; 2352 return NULL; 2353 } 2354 FALLTHROUGH; 2355 case OP_CLOSE_DUP_NUM: 2356 /* We treat it as a normal character. */ 2357 2358 /* Then we can these characters as normal characters. */ 2359 token->type = CHARACTER; 2360 /* mb_partial and word_char bits should be initialized already 2361 by peek_token. */ 2362 tree = create_token_tree (dfa, NULL, NULL, token); 2363 if (__glibc_unlikely (tree == NULL)) 2364 { 2365 *err = REG_ESPACE; 2366 return NULL; 2367 } 2368 break; 2369 2370 case ANCHOR: 2371 if ((token->opr.ctx_type 2372 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2373 && dfa->word_ops_used == 0) 2374 init_word_char (dfa); 2375 if (token->opr.ctx_type == WORD_DELIM 2376 || token->opr.ctx_type == NOT_WORD_DELIM) 2377 { 2378 bin_tree_t *tree_first, *tree_last; 2379 if (token->opr.ctx_type == WORD_DELIM) 2380 { 2381 token->opr.ctx_type = WORD_FIRST; 2382 tree_first = create_token_tree (dfa, NULL, NULL, token); 2383 token->opr.ctx_type = WORD_LAST; 2384 } 2385 else 2386 { 2387 token->opr.ctx_type = INSIDE_WORD; 2388 tree_first = create_token_tree (dfa, NULL, NULL, token); 2389 token->opr.ctx_type = INSIDE_NOTWORD; 2390 } 2391 tree_last = create_token_tree (dfa, NULL, NULL, token); 2392 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2393 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL 2394 || tree == NULL)) 2395 { 2396 *err = REG_ESPACE; 2397 return NULL; 2398 } 2399 } 2400 else 2401 { 2402 tree = create_token_tree (dfa, NULL, NULL, token); 2403 if (__glibc_unlikely (tree == NULL)) 2404 { 2405 *err = REG_ESPACE; 2406 return NULL; 2407 } 2408 } 2409 /* We must return here, since ANCHORs can't be followed 2410 by repetition operators. 2411 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2412 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2413 fetch_token (token, regexp, syntax); 2414 return tree; 2415 2416 case OP_PERIOD: 2417 tree = create_token_tree (dfa, NULL, NULL, token); 2418 if (__glibc_unlikely (tree == NULL)) 2419 { 2420 *err = REG_ESPACE; 2421 return NULL; 2422 } 2423 if (dfa->mb_cur_max > 1) 2424 dfa->has_mb_node = 1; 2425 break; 2426 2427 case OP_WORD: 2428 case OP_NOTWORD: 2429 tree = build_charclass_op (dfa, regexp->trans, 2430 "alnum", 2431 "_", 2432 token->type == OP_NOTWORD, err); 2433 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) 2434 return NULL; 2435 break; 2436 2437 case OP_SPACE: 2438 case OP_NOTSPACE: 2439 tree = build_charclass_op (dfa, regexp->trans, 2440 "space", 2441 "", 2442 token->type == OP_NOTSPACE, err); 2443 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) 2444 return NULL; 2445 break; 2446 2447 case OP_ALT: 2448 case END_OF_RE: 2449 return NULL; 2450 2451 case BACK_SLASH: 2452 *err = REG_EESCAPE; 2453 return NULL; 2454 2455 default: 2456 /* Must not happen? */ 2457 #ifdef DEBUG 2458 assert (0); 2459 #endif 2460 return NULL; 2461 } 2462 fetch_token (token, regexp, syntax); 2463 2464 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2465 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2466 { 2467 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, 2468 syntax, err); 2469 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL)) 2470 { 2471 if (tree != NULL) 2472 postorder (tree, free_tree, NULL); 2473 return NULL; 2474 } 2475 tree = dup_tree; 2476 /* In BRE consecutive duplications are not allowed. */ 2477 if ((syntax & RE_CONTEXT_INVALID_DUP) 2478 && (token->type == OP_DUP_ASTERISK 2479 || token->type == OP_OPEN_DUP_NUM)) 2480 { 2481 if (tree != NULL) 2482 postorder (tree, free_tree, NULL); 2483 *err = REG_BADRPT; 2484 return NULL; 2485 } 2486 } 2487 2488 return tree; 2489 } 2490 2491 /* This function build the following tree, from regular expression 2492 (<reg_exp>): 2493 SUBEXP 2494 | 2495 <reg_exp> 2496 */ 2497 2498 static bin_tree_t * 2499 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2500 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2501 { 2502 re_dfa_t *dfa = preg->buffer; 2503 bin_tree_t *tree; 2504 size_t cur_nsub; 2505 cur_nsub = preg->re_nsub++; 2506 2507 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2508 2509 /* The subexpression may be a null string. */ 2510 if (token->type == OP_CLOSE_SUBEXP) 2511 tree = NULL; 2512 else 2513 { 2514 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2515 if (__glibc_unlikely (*err == REG_NOERROR 2516 && token->type != OP_CLOSE_SUBEXP)) 2517 { 2518 if (tree != NULL) 2519 postorder (tree, free_tree, NULL); 2520 *err = REG_EPAREN; 2521 } 2522 if (__glibc_unlikely (*err != REG_NOERROR)) 2523 return NULL; 2524 } 2525 2526 if (cur_nsub <= '9' - '1') 2527 dfa->completed_bkref_map |= 1 << cur_nsub; 2528 2529 tree = create_tree (dfa, tree, NULL, SUBEXP); 2530 if (__glibc_unlikely (tree == NULL)) 2531 { 2532 *err = REG_ESPACE; 2533 return NULL; 2534 } 2535 tree->token.opr.idx = cur_nsub; 2536 return tree; 2537 } 2538 2539 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2540 2541 static bin_tree_t * 2542 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2543 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2544 { 2545 bin_tree_t *tree = NULL, *old_tree = NULL; 2546 Idx i, start, end, start_idx = re_string_cur_idx (regexp); 2547 re_token_t start_token = *token; 2548 2549 if (token->type == OP_OPEN_DUP_NUM) 2550 { 2551 end = 0; 2552 start = fetch_number (regexp, token, syntax); 2553 if (start == -1) 2554 { 2555 if (token->type == CHARACTER && token->opr.c == ',') 2556 start = 0; /* We treat "{,m}" as "{0,m}". */ 2557 else 2558 { 2559 *err = REG_BADBR; /* <re>{} is invalid. */ 2560 return NULL; 2561 } 2562 } 2563 if (__glibc_likely (start != -2)) 2564 { 2565 /* We treat "{n}" as "{n,n}". */ 2566 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2567 : ((token->type == CHARACTER && token->opr.c == ',') 2568 ? fetch_number (regexp, token, syntax) : -2)); 2569 } 2570 if (__glibc_unlikely (start == -2 || end == -2)) 2571 { 2572 /* Invalid sequence. */ 2573 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD))) 2574 { 2575 if (token->type == END_OF_RE) 2576 *err = REG_EBRACE; 2577 else 2578 *err = REG_BADBR; 2579 2580 return NULL; 2581 } 2582 2583 /* If the syntax bit is set, rollback. */ 2584 re_string_set_index (regexp, start_idx); 2585 *token = start_token; 2586 token->type = CHARACTER; 2587 /* mb_partial and word_char bits should be already initialized by 2588 peek_token. */ 2589 return elem; 2590 } 2591 2592 if (__glibc_unlikely ((end != -1 && start > end) 2593 || token->type != OP_CLOSE_DUP_NUM)) 2594 { 2595 /* First number greater than second. */ 2596 *err = REG_BADBR; 2597 return NULL; 2598 } 2599 2600 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end))) 2601 { 2602 *err = REG_ESIZE; 2603 return NULL; 2604 } 2605 } 2606 else 2607 { 2608 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2609 end = (token->type == OP_DUP_QUESTION) ? 1 : -1; 2610 } 2611 2612 fetch_token (token, regexp, syntax); 2613 2614 if (__glibc_unlikely (elem == NULL)) 2615 return NULL; 2616 if (__glibc_unlikely (start == 0 && end == 0)) 2617 { 2618 postorder (elem, free_tree, NULL); 2619 return NULL; 2620 } 2621 2622 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2623 if (__glibc_unlikely (start > 0)) 2624 { 2625 tree = elem; 2626 for (i = 2; i <= start; ++i) 2627 { 2628 elem = duplicate_tree (elem, dfa); 2629 tree = create_tree (dfa, tree, elem, CONCAT); 2630 if (__glibc_unlikely (elem == NULL || tree == NULL)) 2631 goto parse_dup_op_espace; 2632 } 2633 2634 if (start == end) 2635 return tree; 2636 2637 /* Duplicate ELEM before it is marked optional. */ 2638 elem = duplicate_tree (elem, dfa); 2639 if (__glibc_unlikely (elem == NULL)) 2640 goto parse_dup_op_espace; 2641 old_tree = tree; 2642 } 2643 else 2644 old_tree = NULL; 2645 2646 if (elem->token.type == SUBEXP) 2647 { 2648 uintptr_t subidx = elem->token.opr.idx; 2649 postorder (elem, mark_opt_subexp, (void *) subidx); 2650 } 2651 2652 tree = create_tree (dfa, elem, NULL, 2653 (end == -1 ? OP_DUP_ASTERISK : OP_ALT)); 2654 if (__glibc_unlikely (tree == NULL)) 2655 goto parse_dup_op_espace; 2656 2657 /* This loop is actually executed only when end != -1, 2658 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2659 already created the start+1-th copy. */ 2660 if (TYPE_SIGNED (Idx) || end != -1) 2661 for (i = start + 2; i <= end; ++i) 2662 { 2663 elem = duplicate_tree (elem, dfa); 2664 tree = create_tree (dfa, tree, elem, CONCAT); 2665 if (__glibc_unlikely (elem == NULL || tree == NULL)) 2666 goto parse_dup_op_espace; 2667 2668 tree = create_tree (dfa, tree, NULL, OP_ALT); 2669 if (__glibc_unlikely (tree == NULL)) 2670 goto parse_dup_op_espace; 2671 } 2672 2673 if (old_tree) 2674 tree = create_tree (dfa, old_tree, tree, CONCAT); 2675 2676 return tree; 2677 2678 parse_dup_op_espace: 2679 *err = REG_ESPACE; 2680 return NULL; 2681 } 2682 2683 /* Size of the names for collating symbol/equivalence_class/character_class. 2684 I'm not sure, but maybe enough. */ 2685 #define BRACKET_NAME_BUF_SIZE 32 2686 2687 #ifndef _LIBC 2688 2689 # ifdef RE_ENABLE_I18N 2690 /* Convert the byte B to the corresponding wide character. In a 2691 unibyte locale, treat B as itself. In a multibyte locale, return 2692 WEOF if B is an encoding error. */ 2693 static wint_t 2694 parse_byte (unsigned char b, re_charset_t *mbcset) 2695 { 2696 return mbcset == NULL ? b : __btowc (b); 2697 } 2698 # endif 2699 2700 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2701 Build the range expression which starts from START_ELEM, and ends 2702 at END_ELEM. The result are written to MBCSET and SBCSET. 2703 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2704 mbcset->range_ends, is a pointer argument since we may 2705 update it. */ 2706 2707 static reg_errcode_t 2708 # ifdef RE_ENABLE_I18N 2709 build_range_exp (const reg_syntax_t syntax, 2710 bitset_t sbcset, 2711 re_charset_t *mbcset, 2712 Idx *range_alloc, 2713 const bracket_elem_t *start_elem, 2714 const bracket_elem_t *end_elem) 2715 # else /* not RE_ENABLE_I18N */ 2716 build_range_exp (const reg_syntax_t syntax, 2717 bitset_t sbcset, 2718 const bracket_elem_t *start_elem, 2719 const bracket_elem_t *end_elem) 2720 # endif /* not RE_ENABLE_I18N */ 2721 { 2722 unsigned int start_ch, end_ch; 2723 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2724 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS 2725 || start_elem->type == CHAR_CLASS 2726 || end_elem->type == EQUIV_CLASS 2727 || end_elem->type == CHAR_CLASS)) 2728 return REG_ERANGE; 2729 2730 /* We can handle no multi character collating elements without libc 2731 support. */ 2732 if (__glibc_unlikely ((start_elem->type == COLL_SYM 2733 && strlen ((char *) start_elem->opr.name) > 1) 2734 || (end_elem->type == COLL_SYM 2735 && strlen ((char *) end_elem->opr.name) > 1))) 2736 return REG_ECOLLATE; 2737 2738 # ifdef RE_ENABLE_I18N 2739 { 2740 wchar_t wc; 2741 wint_t start_wc; 2742 wint_t end_wc; 2743 2744 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2745 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2746 : 0)); 2747 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2748 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2749 : 0)); 2750 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2751 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch); 2752 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2753 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch); 2754 if (start_wc == WEOF || end_wc == WEOF) 2755 return REG_ECOLLATE; 2756 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) 2757 && start_wc > end_wc)) 2758 return REG_ERANGE; 2759 2760 /* Got valid collation sequence values, add them as a new entry. 2761 However, for !_LIBC we have no collation elements: if the 2762 character set is single byte, the single byte character set 2763 that we build below suffices. parse_bracket_exp passes 2764 no MBCSET if dfa->mb_cur_max == 1. */ 2765 if (mbcset) 2766 { 2767 /* Check the space of the arrays. */ 2768 if (__glibc_unlikely (*range_alloc == mbcset->nranges)) 2769 { 2770 /* There is not enough space, need realloc. */ 2771 wchar_t *new_array_start, *new_array_end; 2772 Idx new_nranges; 2773 2774 /* +1 in case of mbcset->nranges is 0. */ 2775 new_nranges = 2 * mbcset->nranges + 1; 2776 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2777 are NULL if *range_alloc == 0. */ 2778 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2779 new_nranges); 2780 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2781 new_nranges); 2782 2783 if (__glibc_unlikely (new_array_start == NULL 2784 || new_array_end == NULL)) 2785 { 2786 re_free (new_array_start); 2787 re_free (new_array_end); 2788 return REG_ESPACE; 2789 } 2790 2791 mbcset->range_starts = new_array_start; 2792 mbcset->range_ends = new_array_end; 2793 *range_alloc = new_nranges; 2794 } 2795 2796 mbcset->range_starts[mbcset->nranges] = start_wc; 2797 mbcset->range_ends[mbcset->nranges++] = end_wc; 2798 } 2799 2800 /* Build the table for single byte characters. */ 2801 for (wc = 0; wc < SBC_MAX; ++wc) 2802 { 2803 if (start_wc <= wc && wc <= end_wc) 2804 bitset_set (sbcset, wc); 2805 } 2806 } 2807 # else /* not RE_ENABLE_I18N */ 2808 { 2809 unsigned int ch; 2810 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2811 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2812 : 0)); 2813 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2814 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2815 : 0)); 2816 if (start_ch > end_ch) 2817 return REG_ERANGE; 2818 /* Build the table for single byte characters. */ 2819 for (ch = 0; ch < SBC_MAX; ++ch) 2820 if (start_ch <= ch && ch <= end_ch) 2821 bitset_set (sbcset, ch); 2822 } 2823 # endif /* not RE_ENABLE_I18N */ 2824 return REG_NOERROR; 2825 } 2826 #endif /* not _LIBC */ 2827 2828 #ifndef _LIBC 2829 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2830 Build the collating element which is represented by NAME. 2831 The result are written to MBCSET and SBCSET. 2832 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2833 pointer argument since we may update it. */ 2834 2835 static reg_errcode_t 2836 # ifdef RE_ENABLE_I18N 2837 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, 2838 Idx *coll_sym_alloc, const unsigned char *name) 2839 # else /* not RE_ENABLE_I18N */ 2840 build_collating_symbol (bitset_t sbcset, const unsigned char *name) 2841 # endif /* not RE_ENABLE_I18N */ 2842 { 2843 size_t name_len = strlen ((const char *) name); 2844 if (__glibc_unlikely (name_len != 1)) 2845 return REG_ECOLLATE; 2846 else 2847 { 2848 bitset_set (sbcset, name[0]); 2849 return REG_NOERROR; 2850 } 2851 } 2852 #endif /* not _LIBC */ 2853 2854 /* This function parse bracket expression like "[abc]", "[a-c]", 2855 "[[.a-a.]]" etc. */ 2856 2857 static bin_tree_t * 2858 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2859 reg_syntax_t syntax, reg_errcode_t *err) 2860 { 2861 #ifdef _LIBC 2862 const unsigned char *collseqmb; 2863 const char *collseqwc; 2864 uint32_t nrules; 2865 int32_t table_size; 2866 const int32_t *symb_table; 2867 const unsigned char *extra; 2868 2869 /* Local function for parse_bracket_exp used in _LIBC environment. 2870 Seek the collating symbol entry corresponding to NAME. 2871 Return the index of the symbol in the SYMB_TABLE, 2872 or -1 if not found. */ 2873 2874 auto inline int32_t 2875 __attribute__ ((always_inline)) 2876 seek_collating_symbol_entry (const unsigned char *name, size_t name_len) 2877 { 2878 int32_t elem; 2879 2880 for (elem = 0; elem < table_size; elem++) 2881 if (symb_table[2 * elem] != 0) 2882 { 2883 int32_t idx = symb_table[2 * elem + 1]; 2884 /* Skip the name of collating element name. */ 2885 idx += 1 + extra[idx]; 2886 if (/* Compare the length of the name. */ 2887 name_len == extra[idx] 2888 /* Compare the name. */ 2889 && memcmp (name, &extra[idx + 1], name_len) == 0) 2890 /* Yep, this is the entry. */ 2891 return elem; 2892 } 2893 return -1; 2894 } 2895 2896 /* Local function for parse_bracket_exp used in _LIBC environment. 2897 Look up the collation sequence value of BR_ELEM. 2898 Return the value if succeeded, UINT_MAX otherwise. */ 2899 2900 auto inline unsigned int 2901 __attribute__ ((always_inline)) 2902 lookup_collation_sequence_value (bracket_elem_t *br_elem) 2903 { 2904 if (br_elem->type == SB_CHAR) 2905 { 2906 /* 2907 if (MB_CUR_MAX == 1) 2908 */ 2909 if (nrules == 0) 2910 return collseqmb[br_elem->opr.ch]; 2911 else 2912 { 2913 wint_t wc = __btowc (br_elem->opr.ch); 2914 return __collseq_table_lookup (collseqwc, wc); 2915 } 2916 } 2917 else if (br_elem->type == MB_CHAR) 2918 { 2919 if (nrules != 0) 2920 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2921 } 2922 else if (br_elem->type == COLL_SYM) 2923 { 2924 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2925 if (nrules != 0) 2926 { 2927 int32_t elem, idx; 2928 elem = seek_collating_symbol_entry (br_elem->opr.name, 2929 sym_name_len); 2930 if (elem != -1) 2931 { 2932 /* We found the entry. */ 2933 idx = symb_table[2 * elem + 1]; 2934 /* Skip the name of collating element name. */ 2935 idx += 1 + extra[idx]; 2936 /* Skip the byte sequence of the collating element. */ 2937 idx += 1 + extra[idx]; 2938 /* Adjust for the alignment. */ 2939 idx = (idx + 3) & ~3; 2940 /* Skip the multibyte collation sequence value. */ 2941 idx += sizeof (unsigned int); 2942 /* Skip the wide char sequence of the collating element. */ 2943 idx += sizeof (unsigned int) * 2944 (1 + *(unsigned int *) (extra + idx)); 2945 /* Return the collation sequence value. */ 2946 return *(unsigned int *) (extra + idx); 2947 } 2948 else if (sym_name_len == 1) 2949 { 2950 /* No valid character. Match it as a single byte 2951 character. */ 2952 return collseqmb[br_elem->opr.name[0]]; 2953 } 2954 } 2955 else if (sym_name_len == 1) 2956 return collseqmb[br_elem->opr.name[0]]; 2957 } 2958 return UINT_MAX; 2959 } 2960 2961 /* Local function for parse_bracket_exp used in _LIBC environment. 2962 Build the range expression which starts from START_ELEM, and ends 2963 at END_ELEM. The result are written to MBCSET and SBCSET. 2964 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2965 mbcset->range_ends, is a pointer argument since we may 2966 update it. */ 2967 2968 auto inline reg_errcode_t 2969 __attribute__ ((always_inline)) 2970 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, 2971 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2972 { 2973 unsigned int ch; 2974 uint32_t start_collseq; 2975 uint32_t end_collseq; 2976 2977 /* Equivalence Classes and Character Classes can't be a range 2978 start/end. */ 2979 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS 2980 || start_elem->type == CHAR_CLASS 2981 || end_elem->type == EQUIV_CLASS 2982 || end_elem->type == CHAR_CLASS)) 2983 return REG_ERANGE; 2984 2985 /* FIXME: Implement rational ranges here, too. */ 2986 start_collseq = lookup_collation_sequence_value (start_elem); 2987 end_collseq = lookup_collation_sequence_value (end_elem); 2988 /* Check start/end collation sequence values. */ 2989 if (__glibc_unlikely (start_collseq == UINT_MAX 2990 || end_collseq == UINT_MAX)) 2991 return REG_ECOLLATE; 2992 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) 2993 && start_collseq > end_collseq)) 2994 return REG_ERANGE; 2995 2996 /* Got valid collation sequence values, add them as a new entry. 2997 However, if we have no collation elements, and the character set 2998 is single byte, the single byte character set that we 2999 build below suffices. */ 3000 if (nrules > 0 || dfa->mb_cur_max > 1) 3001 { 3002 /* Check the space of the arrays. */ 3003 if (__glibc_unlikely (*range_alloc == mbcset->nranges)) 3004 { 3005 /* There is not enough space, need realloc. */ 3006 uint32_t *new_array_start; 3007 uint32_t *new_array_end; 3008 Idx new_nranges; 3009 3010 /* +1 in case of mbcset->nranges is 0. */ 3011 new_nranges = 2 * mbcset->nranges + 1; 3012 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 3013 new_nranges); 3014 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 3015 new_nranges); 3016 3017 if (__glibc_unlikely (new_array_start == NULL 3018 || new_array_end == NULL)) 3019 return REG_ESPACE; 3020 3021 mbcset->range_starts = new_array_start; 3022 mbcset->range_ends = new_array_end; 3023 *range_alloc = new_nranges; 3024 } 3025 3026 mbcset->range_starts[mbcset->nranges] = start_collseq; 3027 mbcset->range_ends[mbcset->nranges++] = end_collseq; 3028 } 3029 3030 /* Build the table for single byte characters. */ 3031 for (ch = 0; ch < SBC_MAX; ch++) 3032 { 3033 uint32_t ch_collseq; 3034 /* 3035 if (MB_CUR_MAX == 1) 3036 */ 3037 if (nrules == 0) 3038 ch_collseq = collseqmb[ch]; 3039 else 3040 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 3041 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 3042 bitset_set (sbcset, ch); 3043 } 3044 return REG_NOERROR; 3045 } 3046 3047 /* Local function for parse_bracket_exp used in _LIBC environment. 3048 Build the collating element which is represented by NAME. 3049 The result are written to MBCSET and SBCSET. 3050 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 3051 pointer argument since we may update it. */ 3052 3053 auto inline reg_errcode_t 3054 __attribute__ ((always_inline)) 3055 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, 3056 Idx *coll_sym_alloc, const unsigned char *name) 3057 { 3058 int32_t elem, idx; 3059 size_t name_len = strlen ((const char *) name); 3060 if (nrules != 0) 3061 { 3062 elem = seek_collating_symbol_entry (name, name_len); 3063 if (elem != -1) 3064 { 3065 /* We found the entry. */ 3066 idx = symb_table[2 * elem + 1]; 3067 /* Skip the name of collating element name. */ 3068 idx += 1 + extra[idx]; 3069 } 3070 else if (name_len == 1) 3071 { 3072 /* No valid character, treat it as a normal 3073 character. */ 3074 bitset_set (sbcset, name[0]); 3075 return REG_NOERROR; 3076 } 3077 else 3078 return REG_ECOLLATE; 3079 3080 /* Got valid collation sequence, add it as a new entry. */ 3081 /* Check the space of the arrays. */ 3082 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) 3083 { 3084 /* Not enough, realloc it. */ 3085 /* +1 in case of mbcset->ncoll_syms is 0. */ 3086 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 3087 /* Use realloc since mbcset->coll_syms is NULL 3088 if *alloc == 0. */ 3089 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 3090 new_coll_sym_alloc); 3091 if (__glibc_unlikely (new_coll_syms == NULL)) 3092 return REG_ESPACE; 3093 mbcset->coll_syms = new_coll_syms; 3094 *coll_sym_alloc = new_coll_sym_alloc; 3095 } 3096 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3097 return REG_NOERROR; 3098 } 3099 else 3100 { 3101 if (__glibc_unlikely (name_len != 1)) 3102 return REG_ECOLLATE; 3103 else 3104 { 3105 bitset_set (sbcset, name[0]); 3106 return REG_NOERROR; 3107 } 3108 } 3109 } 3110 #endif 3111 3112 re_token_t br_token; 3113 re_bitset_ptr_t sbcset; 3114 #ifdef RE_ENABLE_I18N 3115 re_charset_t *mbcset; 3116 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3117 Idx equiv_class_alloc = 0, char_class_alloc = 0; 3118 #endif /* not RE_ENABLE_I18N */ 3119 bool non_match = false; 3120 bin_tree_t *work_tree; 3121 int token_len; 3122 bool first_round = true; 3123 #ifdef _LIBC 3124 collseqmb = (const unsigned char *) 3125 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3126 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3127 if (nrules) 3128 { 3129 /* 3130 if (MB_CUR_MAX > 1) 3131 */ 3132 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3133 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3134 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3135 _NL_COLLATE_SYMB_TABLEMB); 3136 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3137 _NL_COLLATE_SYMB_EXTRAMB); 3138 } 3139 #endif 3140 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3141 #ifdef RE_ENABLE_I18N 3142 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3143 #endif /* RE_ENABLE_I18N */ 3144 #ifdef RE_ENABLE_I18N 3145 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL)) 3146 #else 3147 if (__glibc_unlikely (sbcset == NULL)) 3148 #endif /* RE_ENABLE_I18N */ 3149 { 3150 re_free (sbcset); 3151 #ifdef RE_ENABLE_I18N 3152 re_free (mbcset); 3153 #endif 3154 *err = REG_ESPACE; 3155 return NULL; 3156 } 3157 3158 token_len = peek_token_bracket (token, regexp, syntax); 3159 if (__glibc_unlikely (token->type == END_OF_RE)) 3160 { 3161 *err = REG_BADPAT; 3162 goto parse_bracket_exp_free_return; 3163 } 3164 if (token->type == OP_NON_MATCH_LIST) 3165 { 3166 #ifdef RE_ENABLE_I18N 3167 mbcset->non_match = 1; 3168 #endif /* not RE_ENABLE_I18N */ 3169 non_match = true; 3170 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3171 bitset_set (sbcset, '\n'); 3172 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3173 token_len = peek_token_bracket (token, regexp, syntax); 3174 if (__glibc_unlikely (token->type == END_OF_RE)) 3175 { 3176 *err = REG_BADPAT; 3177 goto parse_bracket_exp_free_return; 3178 } 3179 } 3180 3181 /* We treat the first ']' as a normal character. */ 3182 if (token->type == OP_CLOSE_BRACKET) 3183 token->type = CHARACTER; 3184 3185 while (1) 3186 { 3187 bracket_elem_t start_elem, end_elem; 3188 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3189 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3190 reg_errcode_t ret; 3191 int token_len2 = 0; 3192 bool is_range_exp = false; 3193 re_token_t token2; 3194 3195 start_elem.opr.name = start_name_buf; 3196 start_elem.type = COLL_SYM; 3197 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3198 syntax, first_round); 3199 if (__glibc_unlikely (ret != REG_NOERROR)) 3200 { 3201 *err = ret; 3202 goto parse_bracket_exp_free_return; 3203 } 3204 first_round = false; 3205 3206 /* Get information about the next token. We need it in any case. */ 3207 token_len = peek_token_bracket (token, regexp, syntax); 3208 3209 /* Do not check for ranges if we know they are not allowed. */ 3210 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3211 { 3212 if (__glibc_unlikely (token->type == END_OF_RE)) 3213 { 3214 *err = REG_EBRACK; 3215 goto parse_bracket_exp_free_return; 3216 } 3217 if (token->type == OP_CHARSET_RANGE) 3218 { 3219 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3220 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3221 if (__glibc_unlikely (token2.type == END_OF_RE)) 3222 { 3223 *err = REG_EBRACK; 3224 goto parse_bracket_exp_free_return; 3225 } 3226 if (token2.type == OP_CLOSE_BRACKET) 3227 { 3228 /* We treat the last '-' as a normal character. */ 3229 re_string_skip_bytes (regexp, -token_len); 3230 token->type = CHARACTER; 3231 } 3232 else 3233 is_range_exp = true; 3234 } 3235 } 3236 3237 if (is_range_exp == true) 3238 { 3239 end_elem.opr.name = end_name_buf; 3240 end_elem.type = COLL_SYM; 3241 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3242 dfa, syntax, true); 3243 if (__glibc_unlikely (ret != REG_NOERROR)) 3244 { 3245 *err = ret; 3246 goto parse_bracket_exp_free_return; 3247 } 3248 3249 token_len = peek_token_bracket (token, regexp, syntax); 3250 3251 #ifdef _LIBC 3252 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3253 &start_elem, &end_elem); 3254 #else 3255 # ifdef RE_ENABLE_I18N 3256 *err = build_range_exp (syntax, sbcset, 3257 dfa->mb_cur_max > 1 ? mbcset : NULL, 3258 &range_alloc, &start_elem, &end_elem); 3259 # else 3260 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); 3261 # endif 3262 #endif /* RE_ENABLE_I18N */ 3263 if (__glibc_unlikely (*err != REG_NOERROR)) 3264 goto parse_bracket_exp_free_return; 3265 } 3266 else 3267 { 3268 switch (start_elem.type) 3269 { 3270 case SB_CHAR: 3271 bitset_set (sbcset, start_elem.opr.ch); 3272 break; 3273 #ifdef RE_ENABLE_I18N 3274 case MB_CHAR: 3275 /* Check whether the array has enough space. */ 3276 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars)) 3277 { 3278 wchar_t *new_mbchars; 3279 /* Not enough, realloc it. */ 3280 /* +1 in case of mbcset->nmbchars is 0. */ 3281 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3282 /* Use realloc since array is NULL if *alloc == 0. */ 3283 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3284 mbchar_alloc); 3285 if (__glibc_unlikely (new_mbchars == NULL)) 3286 goto parse_bracket_exp_espace; 3287 mbcset->mbchars = new_mbchars; 3288 } 3289 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3290 break; 3291 #endif /* RE_ENABLE_I18N */ 3292 case EQUIV_CLASS: 3293 *err = build_equiv_class (sbcset, 3294 #ifdef RE_ENABLE_I18N 3295 mbcset, &equiv_class_alloc, 3296 #endif /* RE_ENABLE_I18N */ 3297 start_elem.opr.name); 3298 if (__glibc_unlikely (*err != REG_NOERROR)) 3299 goto parse_bracket_exp_free_return; 3300 break; 3301 case COLL_SYM: 3302 *err = build_collating_symbol (sbcset, 3303 #ifdef RE_ENABLE_I18N 3304 mbcset, &coll_sym_alloc, 3305 #endif /* RE_ENABLE_I18N */ 3306 start_elem.opr.name); 3307 if (__glibc_unlikely (*err != REG_NOERROR)) 3308 goto parse_bracket_exp_free_return; 3309 break; 3310 case CHAR_CLASS: 3311 *err = build_charclass (regexp->trans, sbcset, 3312 #ifdef RE_ENABLE_I18N 3313 mbcset, &char_class_alloc, 3314 #endif /* RE_ENABLE_I18N */ 3315 (const char *) start_elem.opr.name, 3316 syntax); 3317 if (__glibc_unlikely (*err != REG_NOERROR)) 3318 goto parse_bracket_exp_free_return; 3319 break; 3320 default: 3321 assert (0); 3322 break; 3323 } 3324 } 3325 if (__glibc_unlikely (token->type == END_OF_RE)) 3326 { 3327 *err = REG_EBRACK; 3328 goto parse_bracket_exp_free_return; 3329 } 3330 if (token->type == OP_CLOSE_BRACKET) 3331 break; 3332 } 3333 3334 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3335 3336 /* If it is non-matching list. */ 3337 if (non_match) 3338 bitset_not (sbcset); 3339 3340 #ifdef RE_ENABLE_I18N 3341 /* Ensure only single byte characters are set. */ 3342 if (dfa->mb_cur_max > 1) 3343 bitset_mask (sbcset, dfa->sb_char); 3344 3345 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3346 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3347 || mbcset->non_match))) 3348 { 3349 bin_tree_t *mbc_tree; 3350 int sbc_idx; 3351 /* Build a tree for complex bracket. */ 3352 dfa->has_mb_node = 1; 3353 br_token.type = COMPLEX_BRACKET; 3354 br_token.opr.mbcset = mbcset; 3355 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3356 if (__glibc_unlikely (mbc_tree == NULL)) 3357 goto parse_bracket_exp_espace; 3358 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3359 if (sbcset[sbc_idx]) 3360 break; 3361 /* If there are no bits set in sbcset, there is no point 3362 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3363 if (sbc_idx < BITSET_WORDS) 3364 { 3365 /* Build a tree for simple bracket. */ 3366 br_token.type = SIMPLE_BRACKET; 3367 br_token.opr.sbcset = sbcset; 3368 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3369 if (__glibc_unlikely (work_tree == NULL)) 3370 goto parse_bracket_exp_espace; 3371 3372 /* Then join them by ALT node. */ 3373 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3374 if (__glibc_unlikely (work_tree == NULL)) 3375 goto parse_bracket_exp_espace; 3376 } 3377 else 3378 { 3379 re_free (sbcset); 3380 work_tree = mbc_tree; 3381 } 3382 } 3383 else 3384 #endif /* not RE_ENABLE_I18N */ 3385 { 3386 #ifdef RE_ENABLE_I18N 3387 free_charset (mbcset); 3388 #endif 3389 /* Build a tree for simple bracket. */ 3390 br_token.type = SIMPLE_BRACKET; 3391 br_token.opr.sbcset = sbcset; 3392 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3393 if (__glibc_unlikely (work_tree == NULL)) 3394 goto parse_bracket_exp_espace; 3395 } 3396 return work_tree; 3397 3398 parse_bracket_exp_espace: 3399 *err = REG_ESPACE; 3400 parse_bracket_exp_free_return: 3401 re_free (sbcset); 3402 #ifdef RE_ENABLE_I18N 3403 free_charset (mbcset); 3404 #endif /* RE_ENABLE_I18N */ 3405 return NULL; 3406 } 3407 3408 /* Parse an element in the bracket expression. */ 3409 3410 static reg_errcode_t 3411 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3412 re_token_t *token, int token_len, re_dfa_t *dfa, 3413 reg_syntax_t syntax, bool accept_hyphen) 3414 { 3415 #ifdef RE_ENABLE_I18N 3416 int cur_char_size; 3417 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3418 if (cur_char_size > 1) 3419 { 3420 elem->type = MB_CHAR; 3421 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3422 re_string_skip_bytes (regexp, cur_char_size); 3423 return REG_NOERROR; 3424 } 3425 #endif /* RE_ENABLE_I18N */ 3426 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3427 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3428 || token->type == OP_OPEN_EQUIV_CLASS) 3429 return parse_bracket_symbol (elem, regexp, token); 3430 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen) 3431 { 3432 /* A '-' must only appear as anything but a range indicator before 3433 the closing bracket. Everything else is an error. */ 3434 re_token_t token2; 3435 (void) peek_token_bracket (&token2, regexp, syntax); 3436 if (token2.type != OP_CLOSE_BRACKET) 3437 /* The actual error value is not standardized since this whole 3438 case is undefined. But ERANGE makes good sense. */ 3439 return REG_ERANGE; 3440 } 3441 elem->type = SB_CHAR; 3442 elem->opr.ch = token->opr.c; 3443 return REG_NOERROR; 3444 } 3445 3446 /* Parse a bracket symbol in the bracket expression. Bracket symbols are 3447 such as [:<character_class>:], [.<collating_element>.], and 3448 [=<equivalent_class>=]. */ 3449 3450 static reg_errcode_t 3451 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3452 re_token_t *token) 3453 { 3454 unsigned char ch, delim = token->opr.c; 3455 int i = 0; 3456 if (re_string_eoi(regexp)) 3457 return REG_EBRACK; 3458 for (;; ++i) 3459 { 3460 if (i >= BRACKET_NAME_BUF_SIZE) 3461 return REG_EBRACK; 3462 if (token->type == OP_OPEN_CHAR_CLASS) 3463 ch = re_string_fetch_byte_case (regexp); 3464 else 3465 ch = re_string_fetch_byte (regexp); 3466 if (re_string_eoi(regexp)) 3467 return REG_EBRACK; 3468 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3469 break; 3470 elem->opr.name[i] = ch; 3471 } 3472 re_string_skip_bytes (regexp, 1); 3473 elem->opr.name[i] = '\0'; 3474 switch (token->type) 3475 { 3476 case OP_OPEN_COLL_ELEM: 3477 elem->type = COLL_SYM; 3478 break; 3479 case OP_OPEN_EQUIV_CLASS: 3480 elem->type = EQUIV_CLASS; 3481 break; 3482 case OP_OPEN_CHAR_CLASS: 3483 elem->type = CHAR_CLASS; 3484 break; 3485 default: 3486 break; 3487 } 3488 return REG_NOERROR; 3489 } 3490 3491 /* Helper function for parse_bracket_exp. 3492 Build the equivalence class which is represented by NAME. 3493 The result are written to MBCSET and SBCSET. 3494 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3495 is a pointer argument since we may update it. */ 3496 3497 static reg_errcode_t 3498 #ifdef RE_ENABLE_I18N 3499 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, 3500 Idx *equiv_class_alloc, const unsigned char *name) 3501 #else /* not RE_ENABLE_I18N */ 3502 build_equiv_class (bitset_t sbcset, const unsigned char *name) 3503 #endif /* not RE_ENABLE_I18N */ 3504 { 3505 #ifdef _LIBC 3506 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3507 if (nrules != 0) 3508 { 3509 const int32_t *table, *indirect; 3510 const unsigned char *weights, *extra, *cp; 3511 unsigned char char_buf[2]; 3512 int32_t idx1, idx2; 3513 unsigned int ch; 3514 size_t len; 3515 /* Calculate the index for equivalence class. */ 3516 cp = name; 3517 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3518 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3519 _NL_COLLATE_WEIGHTMB); 3520 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3521 _NL_COLLATE_EXTRAMB); 3522 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3523 _NL_COLLATE_INDIRECTMB); 3524 idx1 = findidx (table, indirect, extra, &cp, -1); 3525 if (__glibc_unlikely (idx1 == 0 || *cp != '\0')) 3526 /* This isn't a valid character. */ 3527 return REG_ECOLLATE; 3528 3529 /* Build single byte matching table for this equivalence class. */ 3530 len = weights[idx1 & 0xffffff]; 3531 for (ch = 0; ch < SBC_MAX; ++ch) 3532 { 3533 char_buf[0] = ch; 3534 cp = char_buf; 3535 idx2 = findidx (table, indirect, extra, &cp, 1); 3536 /* 3537 idx2 = table[ch]; 3538 */ 3539 if (idx2 == 0) 3540 /* This isn't a valid character. */ 3541 continue; 3542 /* Compare only if the length matches and the collation rule 3543 index is the same. */ 3544 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24) 3545 && memcmp (weights + (idx1 & 0xffffff) + 1, 3546 weights + (idx2 & 0xffffff) + 1, len) == 0) 3547 bitset_set (sbcset, ch); 3548 } 3549 /* Check whether the array has enough space. */ 3550 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes)) 3551 { 3552 /* Not enough, realloc it. */ 3553 /* +1 in case of mbcset->nequiv_classes is 0. */ 3554 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3555 /* Use realloc since the array is NULL if *alloc == 0. */ 3556 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3557 int32_t, 3558 new_equiv_class_alloc); 3559 if (__glibc_unlikely (new_equiv_classes == NULL)) 3560 return REG_ESPACE; 3561 mbcset->equiv_classes = new_equiv_classes; 3562 *equiv_class_alloc = new_equiv_class_alloc; 3563 } 3564 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3565 } 3566 else 3567 #endif /* _LIBC */ 3568 { 3569 if (__glibc_unlikely (strlen ((const char *) name) != 1)) 3570 return REG_ECOLLATE; 3571 bitset_set (sbcset, *name); 3572 } 3573 return REG_NOERROR; 3574 } 3575 3576 /* Helper function for parse_bracket_exp. 3577 Build the character class which is represented by NAME. 3578 The result are written to MBCSET and SBCSET. 3579 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3580 is a pointer argument since we may update it. */ 3581 3582 static reg_errcode_t 3583 #ifdef RE_ENABLE_I18N 3584 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3585 re_charset_t *mbcset, Idx *char_class_alloc, 3586 const char *class_name, reg_syntax_t syntax) 3587 #else /* not RE_ENABLE_I18N */ 3588 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3589 const char *class_name, reg_syntax_t syntax) 3590 #endif /* not RE_ENABLE_I18N */ 3591 { 3592 int i; 3593 const char *name = class_name; 3594 3595 /* In case of REG_ICASE "upper" and "lower" match the both of 3596 upper and lower cases. */ 3597 if ((syntax & RE_ICASE) 3598 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3599 name = "alpha"; 3600 3601 #ifdef RE_ENABLE_I18N 3602 /* Check the space of the arrays. */ 3603 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes)) 3604 { 3605 /* Not enough, realloc it. */ 3606 /* +1 in case of mbcset->nchar_classes is 0. */ 3607 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3608 /* Use realloc since array is NULL if *alloc == 0. */ 3609 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3610 new_char_class_alloc); 3611 if (__glibc_unlikely (new_char_classes == NULL)) 3612 return REG_ESPACE; 3613 mbcset->char_classes = new_char_classes; 3614 *char_class_alloc = new_char_class_alloc; 3615 } 3616 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3617 #endif /* RE_ENABLE_I18N */ 3618 3619 #define BUILD_CHARCLASS_LOOP(ctype_func) \ 3620 do { \ 3621 if (__glibc_unlikely (trans != NULL)) \ 3622 { \ 3623 for (i = 0; i < SBC_MAX; ++i) \ 3624 if (ctype_func (i)) \ 3625 bitset_set (sbcset, trans[i]); \ 3626 } \ 3627 else \ 3628 { \ 3629 for (i = 0; i < SBC_MAX; ++i) \ 3630 if (ctype_func (i)) \ 3631 bitset_set (sbcset, i); \ 3632 } \ 3633 } while (0) 3634 3635 if (strcmp (name, "alnum") == 0) 3636 BUILD_CHARCLASS_LOOP (isalnum); 3637 else if (strcmp (name, "cntrl") == 0) 3638 BUILD_CHARCLASS_LOOP (iscntrl); 3639 else if (strcmp (name, "lower") == 0) 3640 BUILD_CHARCLASS_LOOP (islower); 3641 else if (strcmp (name, "space") == 0) 3642 BUILD_CHARCLASS_LOOP (isspace); 3643 else if (strcmp (name, "alpha") == 0) 3644 BUILD_CHARCLASS_LOOP (isalpha); 3645 else if (strcmp (name, "digit") == 0) 3646 BUILD_CHARCLASS_LOOP (isdigit); 3647 else if (strcmp (name, "print") == 0) 3648 BUILD_CHARCLASS_LOOP (isprint); 3649 else if (strcmp (name, "upper") == 0) 3650 BUILD_CHARCLASS_LOOP (isupper); 3651 else if (strcmp (name, "blank") == 0) 3652 BUILD_CHARCLASS_LOOP (isblank); 3653 else if (strcmp (name, "graph") == 0) 3654 BUILD_CHARCLASS_LOOP (isgraph); 3655 else if (strcmp (name, "punct") == 0) 3656 BUILD_CHARCLASS_LOOP (ispunct); 3657 else if (strcmp (name, "xdigit") == 0) 3658 BUILD_CHARCLASS_LOOP (isxdigit); 3659 else 3660 return REG_ECTYPE; 3661 3662 return REG_NOERROR; 3663 } 3664 3665 static bin_tree_t * 3666 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3667 const char *class_name, 3668 const char *extra, bool non_match, 3669 reg_errcode_t *err) 3670 { 3671 re_bitset_ptr_t sbcset; 3672 #ifdef RE_ENABLE_I18N 3673 re_charset_t *mbcset; 3674 Idx alloc = 0; 3675 #endif /* not RE_ENABLE_I18N */ 3676 reg_errcode_t ret; 3677 re_token_t br_token; 3678 bin_tree_t *tree; 3679 3680 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3681 if (__glibc_unlikely (sbcset == NULL)) 3682 { 3683 *err = REG_ESPACE; 3684 return NULL; 3685 } 3686 #ifdef RE_ENABLE_I18N 3687 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3688 if (__glibc_unlikely (mbcset == NULL)) 3689 { 3690 re_free (sbcset); 3691 *err = REG_ESPACE; 3692 return NULL; 3693 } 3694 mbcset->non_match = non_match; 3695 #endif /* RE_ENABLE_I18N */ 3696 3697 /* We don't care the syntax in this case. */ 3698 ret = build_charclass (trans, sbcset, 3699 #ifdef RE_ENABLE_I18N 3700 mbcset, &alloc, 3701 #endif /* RE_ENABLE_I18N */ 3702 class_name, 0); 3703 3704 if (__glibc_unlikely (ret != REG_NOERROR)) 3705 { 3706 re_free (sbcset); 3707 #ifdef RE_ENABLE_I18N 3708 free_charset (mbcset); 3709 #endif /* RE_ENABLE_I18N */ 3710 *err = ret; 3711 return NULL; 3712 } 3713 /* \w match '_' also. */ 3714 for (; *extra; extra++) 3715 bitset_set (sbcset, *extra); 3716 3717 /* If it is non-matching list. */ 3718 if (non_match) 3719 bitset_not (sbcset); 3720 3721 #ifdef RE_ENABLE_I18N 3722 /* Ensure only single byte characters are set. */ 3723 if (dfa->mb_cur_max > 1) 3724 bitset_mask (sbcset, dfa->sb_char); 3725 #endif 3726 3727 /* Build a tree for simple bracket. */ 3728 #if defined GCC_LINT || defined lint 3729 memset (&br_token, 0, sizeof br_token); 3730 #endif 3731 br_token.type = SIMPLE_BRACKET; 3732 br_token.opr.sbcset = sbcset; 3733 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3734 if (__glibc_unlikely (tree == NULL)) 3735 goto build_word_op_espace; 3736 3737 #ifdef RE_ENABLE_I18N 3738 if (dfa->mb_cur_max > 1) 3739 { 3740 bin_tree_t *mbc_tree; 3741 /* Build a tree for complex bracket. */ 3742 br_token.type = COMPLEX_BRACKET; 3743 br_token.opr.mbcset = mbcset; 3744 dfa->has_mb_node = 1; 3745 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3746 if (__glibc_unlikely (mbc_tree == NULL)) 3747 goto build_word_op_espace; 3748 /* Then join them by ALT node. */ 3749 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3750 if (__glibc_likely (mbc_tree != NULL)) 3751 return tree; 3752 } 3753 else 3754 { 3755 free_charset (mbcset); 3756 return tree; 3757 } 3758 #else /* not RE_ENABLE_I18N */ 3759 return tree; 3760 #endif /* not RE_ENABLE_I18N */ 3761 3762 build_word_op_espace: 3763 re_free (sbcset); 3764 #ifdef RE_ENABLE_I18N 3765 free_charset (mbcset); 3766 #endif /* RE_ENABLE_I18N */ 3767 *err = REG_ESPACE; 3768 return NULL; 3769 } 3770 3771 /* This is intended for the expressions like "a{1,3}". 3772 Fetch a number from 'input', and return the number. 3773 Return -1 if the number field is empty like "{,1}". 3774 Return RE_DUP_MAX + 1 if the number field is too large. 3775 Return -2 if an error occurred. */ 3776 3777 static Idx 3778 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3779 { 3780 Idx num = -1; 3781 unsigned char c; 3782 while (1) 3783 { 3784 fetch_token (token, input, syntax); 3785 c = token->opr.c; 3786 if (__glibc_unlikely (token->type == END_OF_RE)) 3787 return -2; 3788 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3789 break; 3790 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2) 3791 ? -2 3792 : num == -1 3793 ? c - '0' 3794 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0')); 3795 } 3796 return num; 3797 } 3798 3799 #ifdef RE_ENABLE_I18N 3800 static void 3801 free_charset (re_charset_t *cset) 3802 { 3803 re_free (cset->mbchars); 3804 # ifdef _LIBC 3805 re_free (cset->coll_syms); 3806 re_free (cset->equiv_classes); 3807 # endif 3808 re_free (cset->range_starts); 3809 re_free (cset->range_ends); 3810 re_free (cset->char_classes); 3811 re_free (cset); 3812 } 3813 #endif /* RE_ENABLE_I18N */ 3814 3815 /* Functions for binary tree operation. */ 3816 3817 /* Create a tree node. */ 3818 3819 static bin_tree_t * 3820 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3821 re_token_type_t type) 3822 { 3823 re_token_t t; 3824 #if defined GCC_LINT || defined lint 3825 memset (&t, 0, sizeof t); 3826 #endif 3827 t.type = type; 3828 return create_token_tree (dfa, left, right, &t); 3829 } 3830 3831 static bin_tree_t * 3832 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3833 const re_token_t *token) 3834 { 3835 bin_tree_t *tree; 3836 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE)) 3837 { 3838 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3839 3840 if (storage == NULL) 3841 return NULL; 3842 storage->next = dfa->str_tree_storage; 3843 dfa->str_tree_storage = storage; 3844 dfa->str_tree_storage_idx = 0; 3845 } 3846 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3847 3848 tree->parent = NULL; 3849 tree->left = left; 3850 tree->right = right; 3851 tree->token = *token; 3852 tree->token.duplicated = 0; 3853 tree->token.opt_subexp = 0; 3854 tree->first = NULL; 3855 tree->next = NULL; 3856 tree->node_idx = -1; 3857 3858 if (left != NULL) 3859 left->parent = tree; 3860 if (right != NULL) 3861 right->parent = tree; 3862 return tree; 3863 } 3864 3865 /* Mark the tree SRC as an optional subexpression. 3866 To be called from preorder or postorder. */ 3867 3868 static reg_errcode_t 3869 mark_opt_subexp (void *extra, bin_tree_t *node) 3870 { 3871 Idx idx = (uintptr_t) extra; 3872 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3873 node->token.opt_subexp = 1; 3874 3875 return REG_NOERROR; 3876 } 3877 3878 /* Free the allocated memory inside NODE. */ 3879 3880 static void 3881 free_token (re_token_t *node) 3882 { 3883 #ifdef RE_ENABLE_I18N 3884 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3885 free_charset (node->opr.mbcset); 3886 else 3887 #endif /* RE_ENABLE_I18N */ 3888 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3889 re_free (node->opr.sbcset); 3890 } 3891 3892 /* Worker function for tree walking. Free the allocated memory inside NODE 3893 and its children. */ 3894 3895 static reg_errcode_t 3896 free_tree (void *extra, bin_tree_t *node) 3897 { 3898 free_token (&node->token); 3899 return REG_NOERROR; 3900 } 3901 3902 3903 /* Duplicate the node SRC, and return new node. This is a preorder 3904 visit similar to the one implemented by the generic visitor, but 3905 we need more infrastructure to maintain two parallel trees --- so, 3906 it's easier to duplicate. */ 3907 3908 static bin_tree_t * 3909 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3910 { 3911 const bin_tree_t *node; 3912 bin_tree_t *dup_root; 3913 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3914 3915 for (node = root; ; ) 3916 { 3917 /* Create a new tree and link it back to the current parent. */ 3918 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3919 if (*p_new == NULL) 3920 return NULL; 3921 (*p_new)->parent = dup_node; 3922 (*p_new)->token.duplicated = 1; 3923 dup_node = *p_new; 3924 3925 /* Go to the left node, or up and to the right. */ 3926 if (node->left) 3927 { 3928 node = node->left; 3929 p_new = &dup_node->left; 3930 } 3931 else 3932 { 3933 const bin_tree_t *prev = NULL; 3934 while (node->right == prev || node->right == NULL) 3935 { 3936 prev = node; 3937 node = node->parent; 3938 dup_node = dup_node->parent; 3939 if (!node) 3940 return dup_root; 3941 } 3942 node = node->right; 3943 p_new = &dup_node->right; 3944 } 3945 } 3946 } 3947