1 /* regcomp.c 2 */ 3 4 /* 5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee 6 * 7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] 8 */ 9 10 /* This file contains functions for compiling a regular expression. See 11 * also regexec.c which funnily enough, contains functions for executing 12 * a regular expression. 13 * 14 * This file is also copied at build time to ext/re/re_comp.c, where 15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 16 * This causes the main functions to be compiled under new names and with 17 * debugging support added, which makes "use re 'debug'" work. 18 */ 19 20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 21 * confused with the original package (see point 3 below). Thanks, Henry! 22 */ 23 24 /* Additional note: this code is very heavily munged from Henry's version 25 * in places. In some spots I've traded clarity for efficiency, so don't 26 * blame Henry for some of the lack of readability. 27 */ 28 29 /* The names of the functions have been changed from regcomp and 30 * regexec to pregcomp and pregexec in order to avoid conflicts 31 * with the POSIX routines of the same names. 32 */ 33 34 #ifdef PERL_EXT_RE_BUILD 35 #include "re_top.h" 36 #endif 37 38 /* 39 * pregcomp and pregexec -- regsub and regerror are not used in perl 40 * 41 * Copyright (c) 1986 by University of Toronto. 42 * Written by Henry Spencer. Not derived from licensed software. 43 * 44 * Permission is granted to anyone to use this software for any 45 * purpose on any computer system, and to redistribute it freely, 46 * subject to the following restrictions: 47 * 48 * 1. The author is not responsible for the consequences of use of 49 * this software, no matter how awful, even if they arise 50 * from defects in it. 51 * 52 * 2. The origin of this software must not be misrepresented, either 53 * by explicit claim or by omission. 54 * 55 * 3. Altered versions must be plainly marked as such, and must not 56 * be misrepresented as being the original software. 57 * 58 * 59 **** Alterations to Henry's code are... 60 **** 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 63 **** by Larry Wall and others 64 **** 65 **** You may distribute under the terms of either the GNU General Public 66 **** License or the Artistic License, as specified in the README file. 67 68 * 69 * Beware that some of this code is subtly aware of the way operator 70 * precedence is structured in regular expressions. Serious changes in 71 * regular-expression syntax might require a total rethink. 72 */ 73 #include "EXTERN.h" 74 #define PERL_IN_REGCOMP_C 75 #include "perl.h" 76 77 #ifndef PERL_IN_XSUB_RE 78 # include "INTERN.h" 79 #endif 80 81 #define REG_COMP_C 82 #ifdef PERL_IN_XSUB_RE 83 # include "re_comp.h" 84 EXTERN_C const struct regexp_engine my_reg_engine; 85 #else 86 # include "regcomp.h" 87 #endif 88 89 #include "dquote_static.c" 90 #include "charclass_invlists.h" 91 #include "inline_invlist.c" 92 #include "unicode_constants.h" 93 94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \ 95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) 96 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 97 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 98 99 #ifndef STATIC 100 #define STATIC static 101 #endif 102 103 104 struct RExC_state_t { 105 U32 flags; /* RXf_* are we folding, multilining? */ 106 U32 pm_flags; /* PMf_* stuff from the calling PMOP */ 107 char *precomp; /* uncompiled string. */ 108 REGEXP *rx_sv; /* The SV that is the regexp. */ 109 regexp *rx; /* perl core regexp structure */ 110 regexp_internal *rxi; /* internal data for regexp object 111 pprivate field */ 112 char *start; /* Start of input for compile */ 113 char *end; /* End of input for compile */ 114 char *parse; /* Input-scan pointer. */ 115 SSize_t whilem_seen; /* number of WHILEM in this expr */ 116 regnode *emit_start; /* Start of emitted-code area */ 117 regnode *emit_bound; /* First regnode outside of the 118 allocated space */ 119 regnode *emit; /* Code-emit pointer; if = &emit_dummy, 120 implies compiling, so don't emit */ 121 regnode_ssc emit_dummy; /* placeholder for emit to point to; 122 large enough for the largest 123 non-EXACTish node, so can use it as 124 scratch in pass1 */ 125 I32 naughty; /* How bad is this pattern? */ 126 I32 sawback; /* Did we see \1, ...? */ 127 U32 seen; 128 SSize_t size; /* Code size. */ 129 I32 npar; /* Capture buffer count, (OPEN) plus 130 one. ("par" 0 is the whole 131 pattern)*/ 132 I32 nestroot; /* root parens we are in - used by 133 accept */ 134 I32 extralen; 135 I32 seen_zerolen; 136 regnode **open_parens; /* pointers to open parens */ 137 regnode **close_parens; /* pointers to close parens */ 138 regnode *opend; /* END node in program */ 139 I32 utf8; /* whether the pattern is utf8 or not */ 140 I32 orig_utf8; /* whether the pattern was originally in utf8 */ 141 /* XXX use this for future optimisation of case 142 * where pattern must be upgraded to utf8. */ 143 I32 uni_semantics; /* If a d charset modifier should use unicode 144 rules, even if the pattern is not in 145 utf8 */ 146 HV *paren_names; /* Paren names */ 147 148 regnode **recurse; /* Recurse regops */ 149 I32 recurse_count; /* Number of recurse regops */ 150 U8 *study_chunk_recursed; /* bitmap of which parens we have moved 151 through */ 152 U32 study_chunk_recursed_bytes; /* bytes in bitmap */ 153 I32 in_lookbehind; 154 I32 contains_locale; 155 I32 contains_i; 156 I32 override_recoding; 157 I32 in_multi_char_class; 158 struct reg_code_block *code_blocks; /* positions of literal (?{}) 159 within pattern */ 160 int num_code_blocks; /* size of code_blocks[] */ 161 int code_index; /* next code_blocks[] slot */ 162 SSize_t maxlen; /* mininum possible number of chars in string to match */ 163 #ifdef ADD_TO_REGEXEC 164 char *starttry; /* -Dr: where regtry was called. */ 165 #define RExC_starttry (pRExC_state->starttry) 166 #endif 167 SV *runtime_code_qr; /* qr with the runtime code blocks */ 168 #ifdef DEBUGGING 169 const char *lastparse; 170 I32 lastnum; 171 AV *paren_name_list; /* idx -> name */ 172 #define RExC_lastparse (pRExC_state->lastparse) 173 #define RExC_lastnum (pRExC_state->lastnum) 174 #define RExC_paren_name_list (pRExC_state->paren_name_list) 175 #endif 176 }; 177 178 #define RExC_flags (pRExC_state->flags) 179 #define RExC_pm_flags (pRExC_state->pm_flags) 180 #define RExC_precomp (pRExC_state->precomp) 181 #define RExC_rx_sv (pRExC_state->rx_sv) 182 #define RExC_rx (pRExC_state->rx) 183 #define RExC_rxi (pRExC_state->rxi) 184 #define RExC_start (pRExC_state->start) 185 #define RExC_end (pRExC_state->end) 186 #define RExC_parse (pRExC_state->parse) 187 #define RExC_whilem_seen (pRExC_state->whilem_seen) 188 #ifdef RE_TRACK_PATTERN_OFFSETS 189 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the 190 others */ 191 #endif 192 #define RExC_emit (pRExC_state->emit) 193 #define RExC_emit_dummy (pRExC_state->emit_dummy) 194 #define RExC_emit_start (pRExC_state->emit_start) 195 #define RExC_emit_bound (pRExC_state->emit_bound) 196 #define RExC_naughty (pRExC_state->naughty) 197 #define RExC_sawback (pRExC_state->sawback) 198 #define RExC_seen (pRExC_state->seen) 199 #define RExC_size (pRExC_state->size) 200 #define RExC_maxlen (pRExC_state->maxlen) 201 #define RExC_npar (pRExC_state->npar) 202 #define RExC_nestroot (pRExC_state->nestroot) 203 #define RExC_extralen (pRExC_state->extralen) 204 #define RExC_seen_zerolen (pRExC_state->seen_zerolen) 205 #define RExC_utf8 (pRExC_state->utf8) 206 #define RExC_uni_semantics (pRExC_state->uni_semantics) 207 #define RExC_orig_utf8 (pRExC_state->orig_utf8) 208 #define RExC_open_parens (pRExC_state->open_parens) 209 #define RExC_close_parens (pRExC_state->close_parens) 210 #define RExC_opend (pRExC_state->opend) 211 #define RExC_paren_names (pRExC_state->paren_names) 212 #define RExC_recurse (pRExC_state->recurse) 213 #define RExC_recurse_count (pRExC_state->recurse_count) 214 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) 215 #define RExC_study_chunk_recursed_bytes \ 216 (pRExC_state->study_chunk_recursed_bytes) 217 #define RExC_in_lookbehind (pRExC_state->in_lookbehind) 218 #define RExC_contains_locale (pRExC_state->contains_locale) 219 #define RExC_contains_i (pRExC_state->contains_i) 220 #define RExC_override_recoding (pRExC_state->override_recoding) 221 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) 222 223 224 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') 225 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ 226 ((*s) == '{' && regcurly(s, FALSE))) 227 228 /* 229 * Flags to be passed up and down. 230 */ 231 #define WORST 0 /* Worst case. */ 232 #define HASWIDTH 0x01 /* Known to match non-null strings. */ 233 234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single 235 * character. (There needs to be a case: in the switch statement in regexec.c 236 * for any node marked SIMPLE.) Note that this is not the same thing as 237 * REGNODE_SIMPLE */ 238 #define SIMPLE 0x02 239 #define SPSTART 0x04 /* Starts with * or + */ 240 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ 241 #define TRYAGAIN 0x10 /* Weeded out a declaration. */ 242 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */ 243 244 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) 245 246 /* whether trie related optimizations are enabled */ 247 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 248 #define TRIE_STUDY_OPT 249 #define FULL_TRIE_STUDY 250 #define TRIE_STCLASS 251 #endif 252 253 254 255 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] 256 #define PBITVAL(paren) (1 << ((paren) & 7)) 257 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) 258 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) 259 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) 260 261 #define REQUIRE_UTF8 STMT_START { \ 262 if (!UTF) { \ 263 *flagp = RESTART_UTF8; \ 264 return NULL; \ 265 } \ 266 } STMT_END 267 268 /* This converts the named class defined in regcomp.h to its equivalent class 269 * number defined in handy.h. */ 270 #define namedclass_to_classnum(class) ((int) ((class) / 2)) 271 #define classnum_to_namedclass(classnum) ((classnum) * 2) 272 273 #define _invlist_union_complement_2nd(a, b, output) \ 274 _invlist_union_maybe_complement_2nd(a, b, TRUE, output) 275 #define _invlist_intersection_complement_2nd(a, b, output) \ 276 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) 277 278 /* About scan_data_t. 279 280 During optimisation we recurse through the regexp program performing 281 various inplace (keyhole style) optimisations. In addition study_chunk 282 and scan_commit populate this data structure with information about 283 what strings MUST appear in the pattern. We look for the longest 284 string that must appear at a fixed location, and we look for the 285 longest string that may appear at a floating location. So for instance 286 in the pattern: 287 288 /FOO[xX]A.*B[xX]BAR/ 289 290 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating 291 strings (because they follow a .* construct). study_chunk will identify 292 both FOO and BAR as being the longest fixed and floating strings respectively. 293 294 The strings can be composites, for instance 295 296 /(f)(o)(o)/ 297 298 will result in a composite fixed substring 'foo'. 299 300 For each string some basic information is maintained: 301 302 - offset or min_offset 303 This is the position the string must appear at, or not before. 304 It also implicitly (when combined with minlenp) tells us how many 305 characters must match before the string we are searching for. 306 Likewise when combined with minlenp and the length of the string it 307 tells us how many characters must appear after the string we have 308 found. 309 310 - max_offset 311 Only used for floating strings. This is the rightmost point that 312 the string can appear at. If set to SSize_t_MAX it indicates that the 313 string can occur infinitely far to the right. 314 315 - minlenp 316 A pointer to the minimum number of characters of the pattern that the 317 string was found inside. This is important as in the case of positive 318 lookahead or positive lookbehind we can have multiple patterns 319 involved. Consider 320 321 /(?=FOO).*F/ 322 323 The minimum length of the pattern overall is 3, the minimum length 324 of the lookahead part is 3, but the minimum length of the part that 325 will actually match is 1. So 'FOO's minimum length is 3, but the 326 minimum length for the F is 1. This is important as the minimum length 327 is used to determine offsets in front of and behind the string being 328 looked for. Since strings can be composites this is the length of the 329 pattern at the time it was committed with a scan_commit. Note that 330 the length is calculated by study_chunk, so that the minimum lengths 331 are not known until the full pattern has been compiled, thus the 332 pointer to the value. 333 334 - lookbehind 335 336 In the case of lookbehind the string being searched for can be 337 offset past the start point of the final matching string. 338 If this value was just blithely removed from the min_offset it would 339 invalidate some of the calculations for how many chars must match 340 before or after (as they are derived from min_offset and minlen and 341 the length of the string being searched for). 342 When the final pattern is compiled and the data is moved from the 343 scan_data_t structure into the regexp structure the information 344 about lookbehind is factored in, with the information that would 345 have been lost precalculated in the end_shift field for the 346 associated string. 347 348 The fields pos_min and pos_delta are used to store the minimum offset 349 and the delta to the maximum offset at the current point in the pattern. 350 351 */ 352 353 typedef struct scan_data_t { 354 /*I32 len_min; unused */ 355 /*I32 len_delta; unused */ 356 SSize_t pos_min; 357 SSize_t pos_delta; 358 SV *last_found; 359 SSize_t last_end; /* min value, <0 unless valid. */ 360 SSize_t last_start_min; 361 SSize_t last_start_max; 362 SV **longest; /* Either &l_fixed, or &l_float. */ 363 SV *longest_fixed; /* longest fixed string found in pattern */ 364 SSize_t offset_fixed; /* offset where it starts */ 365 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ 366 I32 lookbehind_fixed; /* is the position of the string modfied by LB */ 367 SV *longest_float; /* longest floating string found in pattern */ 368 SSize_t offset_float_min; /* earliest point in string it can appear */ 369 SSize_t offset_float_max; /* latest point in string it can appear */ 370 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ 371 SSize_t lookbehind_float; /* is the pos of the string modified by LB */ 372 I32 flags; 373 I32 whilem_c; 374 SSize_t *last_closep; 375 regnode_ssc *start_class; 376 } scan_data_t; 377 378 /* The below is perhaps overboard, but this allows us to save a test at the 379 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' 380 * and 'a' differ by a single bit; the same with the upper and lower case of 381 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; 382 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and 383 * then inverts it to form a mask, with just a single 0, in the bit position 384 * where the upper- and lowercase differ. XXX There are about 40 other 385 * instances in the Perl core where this micro-optimization could be used. 386 * Should decide if maintenance cost is worse, before changing those 387 * 388 * Returns a boolean as to whether or not 'v' is either a lowercase or 389 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a 390 * compile-time constant, the generated code is better than some optimizing 391 * compilers figure out, amounting to a mask and test. The results are 392 * meaningless if 'c' is not one of [A-Za-z] */ 393 #define isARG2_lower_or_UPPER_ARG1(c, v) \ 394 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) 395 396 /* 397 * Forward declarations for pregcomp()'s friends. 398 */ 399 400 static const scan_data_t zero_scan_data = 401 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0}; 402 403 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) 404 #define SF_BEFORE_SEOL 0x0001 405 #define SF_BEFORE_MEOL 0x0002 406 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) 407 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) 408 409 #define SF_FIX_SHIFT_EOL (+2) 410 #define SF_FL_SHIFT_EOL (+4) 411 412 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) 413 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) 414 415 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) 416 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ 417 #define SF_IS_INF 0x0040 418 #define SF_HAS_PAR 0x0080 419 #define SF_IN_PAR 0x0100 420 #define SF_HAS_EVAL 0x0200 421 #define SCF_DO_SUBSTR 0x0400 422 #define SCF_DO_STCLASS_AND 0x0800 423 #define SCF_DO_STCLASS_OR 0x1000 424 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) 425 #define SCF_WHILEM_VISITED_POS 0x2000 426 427 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ 428 #define SCF_SEEN_ACCEPT 0x8000 429 #define SCF_TRIE_DOING_RESTUDY 0x10000 430 431 #define UTF cBOOL(RExC_utf8) 432 433 /* The enums for all these are ordered so things work out correctly */ 434 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) 435 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ 436 == REGEX_DEPENDS_CHARSET) 437 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) 438 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ 439 >= REGEX_UNICODE_CHARSET) 440 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ 441 == REGEX_ASCII_RESTRICTED_CHARSET) 442 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ 443 >= REGEX_ASCII_RESTRICTED_CHARSET) 444 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ 445 == REGEX_ASCII_MORE_RESTRICTED_CHARSET) 446 447 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) 448 449 /* For programs that want to be strictly Unicode compatible by dying if any 450 * attempt is made to match a non-Unicode code point against a Unicode 451 * property. */ 452 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) 453 454 #define OOB_NAMEDCLASS -1 455 456 /* There is no code point that is out-of-bounds, so this is problematic. But 457 * its only current use is to initialize a variable that is always set before 458 * looked at. */ 459 #define OOB_UNICODE 0xDEADBEEF 460 461 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) 462 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) 463 464 465 /* length of regex to show in messages that don't mark a position within */ 466 #define RegexLengthToShowInErrorMessages 127 467 468 /* 469 * If MARKER[12] are adjusted, be sure to adjust the constants at the top 470 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in 471 * op/pragma/warn/regcomp. 472 */ 473 #define MARKER1 "<-- HERE" /* marker as it appears in the description */ 474 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ 475 476 #define REPORT_LOCATION " in regex; marked by " MARKER1 \ 477 " in m/%"UTF8f MARKER2 "%"UTF8f"/" 478 479 #define REPORT_LOCATION_ARGS(offset) \ 480 UTF8fARG(UTF, offset, RExC_precomp), \ 481 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) 482 483 /* 484 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 485 * arg. Show regex, up to a maximum length. If it's too long, chop and add 486 * "...". 487 */ 488 #define _FAIL(code) STMT_START { \ 489 const char *ellipses = ""; \ 490 IV len = RExC_end - RExC_precomp; \ 491 \ 492 if (!SIZE_ONLY) \ 493 SAVEFREESV(RExC_rx_sv); \ 494 if (len > RegexLengthToShowInErrorMessages) { \ 495 /* chop 10 shorter than the max, to ensure meaning of "..." */ \ 496 len = RegexLengthToShowInErrorMessages - 10; \ 497 ellipses = "..."; \ 498 } \ 499 code; \ 500 } STMT_END 501 502 #define FAIL(msg) _FAIL( \ 503 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ 504 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) 505 506 #define FAIL2(msg,arg) _FAIL( \ 507 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ 508 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) 509 510 /* 511 * Simple_vFAIL -- like FAIL, but marks the current location in the scan 512 */ 513 #define Simple_vFAIL(m) STMT_START { \ 514 const IV offset = RExC_parse - RExC_precomp; \ 515 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ 516 m, REPORT_LOCATION_ARGS(offset)); \ 517 } STMT_END 518 519 /* 520 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() 521 */ 522 #define vFAIL(m) STMT_START { \ 523 if (!SIZE_ONLY) \ 524 SAVEFREESV(RExC_rx_sv); \ 525 Simple_vFAIL(m); \ 526 } STMT_END 527 528 /* 529 * Like Simple_vFAIL(), but accepts two arguments. 530 */ 531 #define Simple_vFAIL2(m,a1) STMT_START { \ 532 const IV offset = RExC_parse - RExC_precomp; \ 533 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ 534 REPORT_LOCATION_ARGS(offset)); \ 535 } STMT_END 536 537 /* 538 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). 539 */ 540 #define vFAIL2(m,a1) STMT_START { \ 541 if (!SIZE_ONLY) \ 542 SAVEFREESV(RExC_rx_sv); \ 543 Simple_vFAIL2(m, a1); \ 544 } STMT_END 545 546 547 /* 548 * Like Simple_vFAIL(), but accepts three arguments. 549 */ 550 #define Simple_vFAIL3(m, a1, a2) STMT_START { \ 551 const IV offset = RExC_parse - RExC_precomp; \ 552 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ 553 REPORT_LOCATION_ARGS(offset)); \ 554 } STMT_END 555 556 /* 557 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). 558 */ 559 #define vFAIL3(m,a1,a2) STMT_START { \ 560 if (!SIZE_ONLY) \ 561 SAVEFREESV(RExC_rx_sv); \ 562 Simple_vFAIL3(m, a1, a2); \ 563 } STMT_END 564 565 /* 566 * Like Simple_vFAIL(), but accepts four arguments. 567 */ 568 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ 569 const IV offset = RExC_parse - RExC_precomp; \ 570 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ 571 REPORT_LOCATION_ARGS(offset)); \ 572 } STMT_END 573 574 #define vFAIL4(m,a1,a2,a3) STMT_START { \ 575 if (!SIZE_ONLY) \ 576 SAVEFREESV(RExC_rx_sv); \ 577 Simple_vFAIL4(m, a1, a2, a3); \ 578 } STMT_END 579 580 /* A specialized version of vFAIL2 that works with UTF8f */ 581 #define vFAIL2utf8f(m, a1) STMT_START { \ 582 const IV offset = RExC_parse - RExC_precomp; \ 583 if (!SIZE_ONLY) \ 584 SAVEFREESV(RExC_rx_sv); \ 585 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ 586 REPORT_LOCATION_ARGS(offset)); \ 587 } STMT_END 588 589 590 /* m is not necessarily a "literal string", in this macro */ 591 #define reg_warn_non_literal_string(loc, m) STMT_START { \ 592 const IV offset = loc - RExC_precomp; \ 593 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ 594 m, REPORT_LOCATION_ARGS(offset)); \ 595 } STMT_END 596 597 #define ckWARNreg(loc,m) STMT_START { \ 598 const IV offset = loc - RExC_precomp; \ 599 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 600 REPORT_LOCATION_ARGS(offset)); \ 601 } STMT_END 602 603 #define vWARN_dep(loc, m) STMT_START { \ 604 const IV offset = loc - RExC_precomp; \ 605 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ 606 REPORT_LOCATION_ARGS(offset)); \ 607 } STMT_END 608 609 #define ckWARNdep(loc,m) STMT_START { \ 610 const IV offset = loc - RExC_precomp; \ 611 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ 612 m REPORT_LOCATION, \ 613 REPORT_LOCATION_ARGS(offset)); \ 614 } STMT_END 615 616 #define ckWARNregdep(loc,m) STMT_START { \ 617 const IV offset = loc - RExC_precomp; \ 618 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ 619 m REPORT_LOCATION, \ 620 REPORT_LOCATION_ARGS(offset)); \ 621 } STMT_END 622 623 #define ckWARN2reg_d(loc,m, a1) STMT_START { \ 624 const IV offset = loc - RExC_precomp; \ 625 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ 626 m REPORT_LOCATION, \ 627 a1, REPORT_LOCATION_ARGS(offset)); \ 628 } STMT_END 629 630 #define ckWARN2reg(loc, m, a1) STMT_START { \ 631 const IV offset = loc - RExC_precomp; \ 632 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 633 a1, REPORT_LOCATION_ARGS(offset)); \ 634 } STMT_END 635 636 #define vWARN3(loc, m, a1, a2) STMT_START { \ 637 const IV offset = loc - RExC_precomp; \ 638 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 639 a1, a2, REPORT_LOCATION_ARGS(offset)); \ 640 } STMT_END 641 642 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \ 643 const IV offset = loc - RExC_precomp; \ 644 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 645 a1, a2, REPORT_LOCATION_ARGS(offset)); \ 646 } STMT_END 647 648 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ 649 const IV offset = loc - RExC_precomp; \ 650 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 651 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ 652 } STMT_END 653 654 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ 655 const IV offset = loc - RExC_precomp; \ 656 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 657 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ 658 } STMT_END 659 660 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ 661 const IV offset = loc - RExC_precomp; \ 662 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 663 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ 664 } STMT_END 665 666 667 /* Allow for side effects in s */ 668 #define REGC(c,s) STMT_START { \ 669 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ 670 } STMT_END 671 672 /* Macros for recording node offsets. 20001227 mjd@plover.com 673 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in 674 * element 2*n-1 of the array. Element #2n holds the byte length node #n. 675 * Element 0 holds the number n. 676 * Position is 1 indexed. 677 */ 678 #ifndef RE_TRACK_PATTERN_OFFSETS 679 #define Set_Node_Offset_To_R(node,byte) 680 #define Set_Node_Offset(node,byte) 681 #define Set_Cur_Node_Offset 682 #define Set_Node_Length_To_R(node,len) 683 #define Set_Node_Length(node,len) 684 #define Set_Node_Cur_Length(node,start) 685 #define Node_Offset(n) 686 #define Node_Length(n) 687 #define Set_Node_Offset_Length(node,offset,len) 688 #define ProgLen(ri) ri->u.proglen 689 #define SetProgLen(ri,x) ri->u.proglen = x 690 #else 691 #define ProgLen(ri) ri->u.offsets[0] 692 #define SetProgLen(ri,x) ri->u.offsets[0] = x 693 #define Set_Node_Offset_To_R(node,byte) STMT_START { \ 694 if (! SIZE_ONLY) { \ 695 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ 696 __LINE__, (int)(node), (int)(byte))); \ 697 if((node) < 0) { \ 698 Perl_croak(aTHX_ "value of node is %d in Offset macro", \ 699 (int)(node)); \ 700 } else { \ 701 RExC_offsets[2*(node)-1] = (byte); \ 702 } \ 703 } \ 704 } STMT_END 705 706 #define Set_Node_Offset(node,byte) \ 707 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) 708 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) 709 710 #define Set_Node_Length_To_R(node,len) STMT_START { \ 711 if (! SIZE_ONLY) { \ 712 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ 713 __LINE__, (int)(node), (int)(len))); \ 714 if((node) < 0) { \ 715 Perl_croak(aTHX_ "value of node is %d in Length macro", \ 716 (int)(node)); \ 717 } else { \ 718 RExC_offsets[2*(node)] = (len); \ 719 } \ 720 } \ 721 } STMT_END 722 723 #define Set_Node_Length(node,len) \ 724 Set_Node_Length_To_R((node)-RExC_emit_start, len) 725 #define Set_Node_Cur_Length(node, start) \ 726 Set_Node_Length(node, RExC_parse - start) 727 728 /* Get offsets and lengths */ 729 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) 730 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) 731 732 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \ 733 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ 734 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ 735 } STMT_END 736 #endif 737 738 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 739 #define EXPERIMENTAL_INPLACESCAN 740 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ 741 742 #define DEBUG_RExC_seen() \ 743 DEBUG_OPTIMISE_MORE_r({ \ 744 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ 745 \ 746 if (RExC_seen & REG_ZERO_LEN_SEEN) \ 747 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ 748 \ 749 if (RExC_seen & REG_LOOKBEHIND_SEEN) \ 750 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ 751 \ 752 if (RExC_seen & REG_GPOS_SEEN) \ 753 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ 754 \ 755 if (RExC_seen & REG_CANY_SEEN) \ 756 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ 757 \ 758 if (RExC_seen & REG_RECURSE_SEEN) \ 759 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ 760 \ 761 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ 762 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ 763 \ 764 if (RExC_seen & REG_VERBARG_SEEN) \ 765 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ 766 \ 767 if (RExC_seen & REG_CUTGROUP_SEEN) \ 768 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ 769 \ 770 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ 771 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ 772 \ 773 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ 774 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ 775 \ 776 if (RExC_seen & REG_GOSTART_SEEN) \ 777 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ 778 \ 779 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ 780 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ 781 \ 782 PerlIO_printf(Perl_debug_log,"\n"); \ 783 }); 784 785 #define DEBUG_STUDYDATA(str,data,depth) \ 786 DEBUG_OPTIMISE_MORE_r(if(data){ \ 787 PerlIO_printf(Perl_debug_log, \ 788 "%*s" str "Pos:%"IVdf"/%"IVdf \ 789 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ 790 (int)(depth)*2, "", \ 791 (IV)((data)->pos_min), \ 792 (IV)((data)->pos_delta), \ 793 (UV)((data)->flags), \ 794 (IV)((data)->whilem_c), \ 795 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ 796 is_inf ? "INF " : "" \ 797 ); \ 798 if ((data)->last_found) \ 799 PerlIO_printf(Perl_debug_log, \ 800 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ 801 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ 802 SvPVX_const((data)->last_found), \ 803 (IV)((data)->last_end), \ 804 (IV)((data)->last_start_min), \ 805 (IV)((data)->last_start_max), \ 806 ((data)->longest && \ 807 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \ 808 SvPVX_const((data)->longest_fixed), \ 809 (IV)((data)->offset_fixed), \ 810 ((data)->longest && \ 811 (data)->longest==&((data)->longest_float)) ? "*" : "", \ 812 SvPVX_const((data)->longest_float), \ 813 (IV)((data)->offset_float_min), \ 814 (IV)((data)->offset_float_max) \ 815 ); \ 816 PerlIO_printf(Perl_debug_log,"\n"); \ 817 }); 818 819 /* Mark that we cannot extend a found fixed substring at this point. 820 Update the longest found anchored substring and the longest found 821 floating substrings if needed. */ 822 823 STATIC void 824 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, 825 SSize_t *minlenp, int is_inf) 826 { 827 const STRLEN l = CHR_SVLEN(data->last_found); 828 const STRLEN old_l = CHR_SVLEN(*data->longest); 829 GET_RE_DEBUG_FLAGS_DECL; 830 831 PERL_ARGS_ASSERT_SCAN_COMMIT; 832 833 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { 834 SvSetMagicSV(*data->longest, data->last_found); 835 if (*data->longest == data->longest_fixed) { 836 data->offset_fixed = l ? data->last_start_min : data->pos_min; 837 if (data->flags & SF_BEFORE_EOL) 838 data->flags 839 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); 840 else 841 data->flags &= ~SF_FIX_BEFORE_EOL; 842 data->minlen_fixed=minlenp; 843 data->lookbehind_fixed=0; 844 } 845 else { /* *data->longest == data->longest_float */ 846 data->offset_float_min = l ? data->last_start_min : data->pos_min; 847 data->offset_float_max = (l 848 ? data->last_start_max 849 : (data->pos_delta == SSize_t_MAX 850 ? SSize_t_MAX 851 : data->pos_min + data->pos_delta)); 852 if (is_inf 853 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) 854 data->offset_float_max = SSize_t_MAX; 855 if (data->flags & SF_BEFORE_EOL) 856 data->flags 857 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); 858 else 859 data->flags &= ~SF_FL_BEFORE_EOL; 860 data->minlen_float=minlenp; 861 data->lookbehind_float=0; 862 } 863 } 864 SvCUR_set(data->last_found, 0); 865 { 866 SV * const sv = data->last_found; 867 if (SvUTF8(sv) && SvMAGICAL(sv)) { 868 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); 869 if (mg) 870 mg->mg_len = 0; 871 } 872 } 873 data->last_end = -1; 874 data->flags &= ~SF_BEFORE_EOL; 875 DEBUG_STUDYDATA("commit: ",data,0); 876 } 877 878 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion 879 * list that describes which code points it matches */ 880 881 STATIC void 882 S_ssc_anything(pTHX_ regnode_ssc *ssc) 883 { 884 /* Set the SSC 'ssc' to match an empty string or any code point */ 885 886 PERL_ARGS_ASSERT_SSC_ANYTHING; 887 888 assert(is_ANYOF_SYNTHETIC(ssc)); 889 890 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ 891 _append_range_to_invlist(ssc->invlist, 0, UV_MAX); 892 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ 893 } 894 895 STATIC int 896 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc) 897 { 898 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code 899 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys 900 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted 901 * in any way, so there's no point in using it */ 902 903 UV start, end; 904 bool ret; 905 906 PERL_ARGS_ASSERT_SSC_IS_ANYTHING; 907 908 assert(is_ANYOF_SYNTHETIC(ssc)); 909 910 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { 911 return FALSE; 912 } 913 914 /* See if the list consists solely of the range 0 - Infinity */ 915 invlist_iterinit(ssc->invlist); 916 ret = invlist_iternext(ssc->invlist, &start, &end) 917 && start == 0 918 && end == UV_MAX; 919 920 invlist_iterfinish(ssc->invlist); 921 922 if (ret) { 923 return TRUE; 924 } 925 926 /* If e.g., both \w and \W are set, matches everything */ 927 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 928 int i; 929 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { 930 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { 931 return TRUE; 932 } 933 } 934 } 935 936 return FALSE; 937 } 938 939 STATIC void 940 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) 941 { 942 /* Initializes the SSC 'ssc'. This includes setting it to match an empty 943 * string, any code point, or any posix class under locale */ 944 945 PERL_ARGS_ASSERT_SSC_INIT; 946 947 Zero(ssc, 1, regnode_ssc); 948 set_ANYOF_SYNTHETIC(ssc); 949 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); 950 ssc_anything(ssc); 951 952 /* If any portion of the regex is to operate under locale rules, 953 * initialization includes it. The reason this isn't done for all regexes 954 * is that the optimizer was written under the assumption that locale was 955 * all-or-nothing. Given the complexity and lack of documentation in the 956 * optimizer, and that there are inadequate test cases for locale, many 957 * parts of it may not work properly, it is safest to avoid locale unless 958 * necessary. */ 959 if (RExC_contains_locale) { 960 ANYOF_POSIXL_SETALL(ssc); 961 } 962 else { 963 ANYOF_POSIXL_ZERO(ssc); 964 } 965 } 966 967 STATIC int 968 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, 969 const regnode_ssc *ssc) 970 { 971 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only 972 * to the list of code points matched, and locale posix classes; hence does 973 * not check its flags) */ 974 975 UV start, end; 976 bool ret; 977 978 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; 979 980 assert(is_ANYOF_SYNTHETIC(ssc)); 981 982 invlist_iterinit(ssc->invlist); 983 ret = invlist_iternext(ssc->invlist, &start, &end) 984 && start == 0 985 && end == UV_MAX; 986 987 invlist_iterfinish(ssc->invlist); 988 989 if (! ret) { 990 return FALSE; 991 } 992 993 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { 994 return FALSE; 995 } 996 997 return TRUE; 998 } 999 1000 STATIC SV* 1001 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, 1002 const regnode_charclass* const node) 1003 { 1004 /* Returns a mortal inversion list defining which code points are matched 1005 * by 'node', which is of type ANYOF. Handles complementing the result if 1006 * appropriate. If some code points aren't knowable at this time, the 1007 * returned list must, and will, contain every code point that is a 1008 * possibility. */ 1009 1010 SV* invlist = sv_2mortal(_new_invlist(0)); 1011 SV* only_utf8_locale_invlist = NULL; 1012 unsigned int i; 1013 const U32 n = ARG(node); 1014 bool new_node_has_latin1 = FALSE; 1015 1016 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; 1017 1018 /* Look at the data structure created by S_set_ANYOF_arg() */ 1019 if (n != ANYOF_NONBITMAP_EMPTY) { 1020 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); 1021 AV * const av = MUTABLE_AV(SvRV(rv)); 1022 SV **const ary = AvARRAY(av); 1023 assert(RExC_rxi->data->what[n] == 's'); 1024 1025 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ 1026 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]))); 1027 } 1028 else if (ary[0] && ary[0] != &PL_sv_undef) { 1029 1030 /* Here, no compile-time swash, and there are things that won't be 1031 * known until runtime -- we have to assume it could be anything */ 1032 return _add_range_to_invlist(invlist, 0, UV_MAX); 1033 } 1034 else if (ary[3] && ary[3] != &PL_sv_undef) { 1035 1036 /* Here no compile-time swash, and no run-time only data. Use the 1037 * node's inversion list */ 1038 invlist = sv_2mortal(invlist_clone(ary[3])); 1039 } 1040 1041 /* Get the code points valid only under UTF-8 locales */ 1042 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) 1043 && ary[2] && ary[2] != &PL_sv_undef) 1044 { 1045 only_utf8_locale_invlist = ary[2]; 1046 } 1047 } 1048 1049 /* An ANYOF node contains a bitmap for the first 256 code points, and an 1050 * inversion list for the others, but if there are code points that should 1051 * match only conditionally on the target string being UTF-8, those are 1052 * placed in the inversion list, and not the bitmap. Since there are 1053 * circumstances under which they could match, they are included in the 1054 * SSC. But if the ANYOF node is to be inverted, we have to exclude them 1055 * here, so that when we invert below, the end result actually does include 1056 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here 1057 * before we add the unconditionally matched code points */ 1058 if (ANYOF_FLAGS(node) & ANYOF_INVERT) { 1059 _invlist_intersection_complement_2nd(invlist, 1060 PL_UpperLatin1, 1061 &invlist); 1062 } 1063 1064 /* Add in the points from the bit map */ 1065 for (i = 0; i < 256; i++) { 1066 if (ANYOF_BITMAP_TEST(node, i)) { 1067 invlist = add_cp_to_invlist(invlist, i); 1068 new_node_has_latin1 = TRUE; 1069 } 1070 } 1071 1072 /* If this can match all upper Latin1 code points, have to add them 1073 * as well */ 1074 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { 1075 _invlist_union(invlist, PL_UpperLatin1, &invlist); 1076 } 1077 1078 /* Similarly for these */ 1079 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { 1080 invlist = _add_range_to_invlist(invlist, 256, UV_MAX); 1081 } 1082 1083 if (ANYOF_FLAGS(node) & ANYOF_INVERT) { 1084 _invlist_invert(invlist); 1085 } 1086 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { 1087 1088 /* Under /li, any 0-255 could fold to any other 0-255, depending on the 1089 * locale. We can skip this if there are no 0-255 at all. */ 1090 _invlist_union(invlist, PL_Latin1, &invlist); 1091 } 1092 1093 /* Similarly add the UTF-8 locale possible matches. These have to be 1094 * deferred until after the non-UTF-8 locale ones are taken care of just 1095 * above, or it leads to wrong results under ANYOF_INVERT */ 1096 if (only_utf8_locale_invlist) { 1097 _invlist_union_maybe_complement_2nd(invlist, 1098 only_utf8_locale_invlist, 1099 ANYOF_FLAGS(node) & ANYOF_INVERT, 1100 &invlist); 1101 } 1102 1103 return invlist; 1104 } 1105 1106 /* These two functions currently do the exact same thing */ 1107 #define ssc_init_zero ssc_init 1108 1109 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) 1110 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) 1111 1112 /* 'AND' a given class with another one. Can create false positives. 'ssc' 1113 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if 1114 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ 1115 1116 STATIC void 1117 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, 1118 const regnode_charclass *and_with) 1119 { 1120 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either 1121 * another SSC or a regular ANYOF class. Can create false positives. */ 1122 1123 SV* anded_cp_list; 1124 U8 anded_flags; 1125 1126 PERL_ARGS_ASSERT_SSC_AND; 1127 1128 assert(is_ANYOF_SYNTHETIC(ssc)); 1129 1130 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract 1131 * the code point inversion list and just the relevant flags */ 1132 if (is_ANYOF_SYNTHETIC(and_with)) { 1133 anded_cp_list = ((regnode_ssc *)and_with)->invlist; 1134 anded_flags = ANYOF_FLAGS(and_with); 1135 1136 /* XXX This is a kludge around what appears to be deficiencies in the 1137 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, 1138 * there are paths through the optimizer where it doesn't get weeded 1139 * out when it should. And if we don't make some extra provision for 1140 * it like the code just below, it doesn't get added when it should. 1141 * This solution is to add it only when AND'ing, which is here, and 1142 * only when what is being AND'ed is the pristine, original node 1143 * matching anything. Thus it is like adding it to ssc_anything() but 1144 * only when the result is to be AND'ed. Probably the same solution 1145 * could be adopted for the same problem we have with /l matching, 1146 * which is solved differently in S_ssc_init(), and that would lead to 1147 * fewer false positives than that solution has. But if this solution 1148 * creates bugs, the consequences are only that a warning isn't raised 1149 * that should be; while the consequences for having /l bugs is 1150 * incorrect matches */ 1151 if (ssc_is_anything((regnode_ssc *)and_with)) { 1152 anded_flags |= ANYOF_WARN_SUPER; 1153 } 1154 } 1155 else { 1156 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); 1157 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; 1158 } 1159 1160 ANYOF_FLAGS(ssc) &= anded_flags; 1161 1162 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. 1163 * C2 is the list of code points in 'and-with'; P2, its posix classes. 1164 * 'and_with' may be inverted. When not inverted, we have the situation of 1165 * computing: 1166 * (C1 | P1) & (C2 | P2) 1167 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) 1168 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) 1169 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) 1170 * <= ((C1 & C2) | P1 | P2) 1171 * Alternatively, the last few steps could be: 1172 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) 1173 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) 1174 * <= (C1 | C2 | (P1 & P2)) 1175 * We favor the second approach if either P1 or P2 is non-empty. This is 1176 * because these components are a barrier to doing optimizations, as what 1177 * they match cannot be known until the moment of matching as they are 1178 * dependent on the current locale, 'AND"ing them likely will reduce or 1179 * eliminate them. 1180 * But we can do better if we know that C1,P1 are in their initial state (a 1181 * frequent occurrence), each matching everything: 1182 * (<everything>) & (C2 | P2) = C2 | P2 1183 * Similarly, if C2,P2 are in their initial state (again a frequent 1184 * occurrence), the result is a no-op 1185 * (C1 | P1) & (<everything>) = C1 | P1 1186 * 1187 * Inverted, we have 1188 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) 1189 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) 1190 * <= (C1 & ~C2) | (P1 & ~P2) 1191 * */ 1192 1193 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) 1194 && ! is_ANYOF_SYNTHETIC(and_with)) 1195 { 1196 unsigned int i; 1197 1198 ssc_intersection(ssc, 1199 anded_cp_list, 1200 FALSE /* Has already been inverted */ 1201 ); 1202 1203 /* If either P1 or P2 is empty, the intersection will be also; can skip 1204 * the loop */ 1205 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { 1206 ANYOF_POSIXL_ZERO(ssc); 1207 } 1208 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 1209 1210 /* Note that the Posix class component P from 'and_with' actually 1211 * looks like: 1212 * P = Pa | Pb | ... | Pn 1213 * where each component is one posix class, such as in [\w\s]. 1214 * Thus 1215 * ~P = ~(Pa | Pb | ... | Pn) 1216 * = ~Pa & ~Pb & ... & ~Pn 1217 * <= ~Pa | ~Pb | ... | ~Pn 1218 * The last is something we can easily calculate, but unfortunately 1219 * is likely to have many false positives. We could do better 1220 * in some (but certainly not all) instances if two classes in 1221 * P have known relationships. For example 1222 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: 1223 * So 1224 * :lower: & :print: = :lower: 1225 * And similarly for classes that must be disjoint. For example, 1226 * since \s and \w can have no elements in common based on rules in 1227 * the POSIX standard, 1228 * \w & ^\S = nothing 1229 * Unfortunately, some vendor locales do not meet the Posix 1230 * standard, in particular almost everything by Microsoft. 1231 * The loop below just changes e.g., \w into \W and vice versa */ 1232 1233 regnode_charclass_posixl temp; 1234 int add = 1; /* To calculate the index of the complement */ 1235 1236 ANYOF_POSIXL_ZERO(&temp); 1237 for (i = 0; i < ANYOF_MAX; i++) { 1238 assert(i % 2 != 0 1239 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) 1240 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); 1241 1242 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { 1243 ANYOF_POSIXL_SET(&temp, i + add); 1244 } 1245 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ 1246 } 1247 ANYOF_POSIXL_AND(&temp, ssc); 1248 1249 } /* else ssc already has no posixes */ 1250 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC 1251 in its initial state */ 1252 else if (! is_ANYOF_SYNTHETIC(and_with) 1253 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) 1254 { 1255 /* But if 'ssc' is in its initial state, the result is just 'and_with'; 1256 * copy it over 'ssc' */ 1257 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { 1258 if (is_ANYOF_SYNTHETIC(and_with)) { 1259 StructCopy(and_with, ssc, regnode_ssc); 1260 } 1261 else { 1262 ssc->invlist = anded_cp_list; 1263 ANYOF_POSIXL_ZERO(ssc); 1264 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { 1265 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); 1266 } 1267 } 1268 } 1269 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) 1270 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) 1271 { 1272 /* One or the other of P1, P2 is non-empty. */ 1273 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { 1274 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); 1275 } 1276 ssc_union(ssc, anded_cp_list, FALSE); 1277 } 1278 else { /* P1 = P2 = empty */ 1279 ssc_intersection(ssc, anded_cp_list, FALSE); 1280 } 1281 } 1282 } 1283 1284 STATIC void 1285 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, 1286 const regnode_charclass *or_with) 1287 { 1288 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either 1289 * another SSC or a regular ANYOF class. Can create false positives if 1290 * 'or_with' is to be inverted. */ 1291 1292 SV* ored_cp_list; 1293 U8 ored_flags; 1294 1295 PERL_ARGS_ASSERT_SSC_OR; 1296 1297 assert(is_ANYOF_SYNTHETIC(ssc)); 1298 1299 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract 1300 * the code point inversion list and just the relevant flags */ 1301 if (is_ANYOF_SYNTHETIC(or_with)) { 1302 ored_cp_list = ((regnode_ssc*) or_with)->invlist; 1303 ored_flags = ANYOF_FLAGS(or_with); 1304 } 1305 else { 1306 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); 1307 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; 1308 } 1309 1310 ANYOF_FLAGS(ssc) |= ored_flags; 1311 1312 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. 1313 * C2 is the list of code points in 'or-with'; P2, its posix classes. 1314 * 'or_with' may be inverted. When not inverted, we have the simple 1315 * situation of computing: 1316 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) 1317 * If P1|P2 yields a situation with both a class and its complement are 1318 * set, like having both \w and \W, this matches all code points, and we 1319 * can delete these from the P component of the ssc going forward. XXX We 1320 * might be able to delete all the P components, but I (khw) am not certain 1321 * about this, and it is better to be safe. 1322 * 1323 * Inverted, we have 1324 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) 1325 * <= (C1 | P1) | ~C2 1326 * <= (C1 | ~C2) | P1 1327 * (which results in actually simpler code than the non-inverted case) 1328 * */ 1329 1330 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) 1331 && ! is_ANYOF_SYNTHETIC(or_with)) 1332 { 1333 /* We ignore P2, leaving P1 going forward */ 1334 } /* else Not inverted */ 1335 else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { 1336 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); 1337 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 1338 unsigned int i; 1339 for (i = 0; i < ANYOF_MAX; i += 2) { 1340 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) 1341 { 1342 ssc_match_all_cp(ssc); 1343 ANYOF_POSIXL_CLEAR(ssc, i); 1344 ANYOF_POSIXL_CLEAR(ssc, i+1); 1345 } 1346 } 1347 } 1348 } 1349 1350 ssc_union(ssc, 1351 ored_cp_list, 1352 FALSE /* Already has been inverted */ 1353 ); 1354 } 1355 1356 PERL_STATIC_INLINE void 1357 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) 1358 { 1359 PERL_ARGS_ASSERT_SSC_UNION; 1360 1361 assert(is_ANYOF_SYNTHETIC(ssc)); 1362 1363 _invlist_union_maybe_complement_2nd(ssc->invlist, 1364 invlist, 1365 invert2nd, 1366 &ssc->invlist); 1367 } 1368 1369 PERL_STATIC_INLINE void 1370 S_ssc_intersection(pTHX_ regnode_ssc *ssc, 1371 SV* const invlist, 1372 const bool invert2nd) 1373 { 1374 PERL_ARGS_ASSERT_SSC_INTERSECTION; 1375 1376 assert(is_ANYOF_SYNTHETIC(ssc)); 1377 1378 _invlist_intersection_maybe_complement_2nd(ssc->invlist, 1379 invlist, 1380 invert2nd, 1381 &ssc->invlist); 1382 } 1383 1384 PERL_STATIC_INLINE void 1385 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) 1386 { 1387 PERL_ARGS_ASSERT_SSC_ADD_RANGE; 1388 1389 assert(is_ANYOF_SYNTHETIC(ssc)); 1390 1391 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); 1392 } 1393 1394 PERL_STATIC_INLINE void 1395 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) 1396 { 1397 /* AND just the single code point 'cp' into the SSC 'ssc' */ 1398 1399 SV* cp_list = _new_invlist(2); 1400 1401 PERL_ARGS_ASSERT_SSC_CP_AND; 1402 1403 assert(is_ANYOF_SYNTHETIC(ssc)); 1404 1405 cp_list = add_cp_to_invlist(cp_list, cp); 1406 ssc_intersection(ssc, cp_list, 1407 FALSE /* Not inverted */ 1408 ); 1409 SvREFCNT_dec_NN(cp_list); 1410 } 1411 1412 PERL_STATIC_INLINE void 1413 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc) 1414 { 1415 /* Set the SSC 'ssc' to not match any locale things */ 1416 1417 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; 1418 1419 assert(is_ANYOF_SYNTHETIC(ssc)); 1420 1421 ANYOF_POSIXL_ZERO(ssc); 1422 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; 1423 } 1424 1425 STATIC void 1426 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) 1427 { 1428 /* The inversion list in the SSC is marked mortal; now we need a more 1429 * permanent copy, which is stored the same way that is done in a regular 1430 * ANYOF node, with the first 256 code points in a bit map */ 1431 1432 SV* invlist = invlist_clone(ssc->invlist); 1433 1434 PERL_ARGS_ASSERT_SSC_FINALIZE; 1435 1436 assert(is_ANYOF_SYNTHETIC(ssc)); 1437 1438 /* The code in this file assumes that all but these flags aren't relevant 1439 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the 1440 * time we reach here */ 1441 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); 1442 1443 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); 1444 1445 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, 1446 NULL, NULL, NULL, FALSE); 1447 1448 /* Make sure is clone-safe */ 1449 ssc->invlist = NULL; 1450 1451 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 1452 ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; 1453 } 1454 1455 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); 1456 } 1457 1458 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] 1459 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) 1460 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) 1461 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ 1462 ? (TRIE_LIST_CUR( idx ) - 1) \ 1463 : 0 ) 1464 1465 1466 #ifdef DEBUGGING 1467 /* 1468 dump_trie(trie,widecharmap,revcharmap) 1469 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) 1470 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) 1471 1472 These routines dump out a trie in a somewhat readable format. 1473 The _interim_ variants are used for debugging the interim 1474 tables that are used to generate the final compressed 1475 representation which is what dump_trie expects. 1476 1477 Part of the reason for their existence is to provide a form 1478 of documentation as to how the different representations function. 1479 1480 */ 1481 1482 /* 1483 Dumps the final compressed table form of the trie to Perl_debug_log. 1484 Used for debugging make_trie(). 1485 */ 1486 1487 STATIC void 1488 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, 1489 AV *revcharmap, U32 depth) 1490 { 1491 U32 state; 1492 SV *sv=sv_newmortal(); 1493 int colwidth= widecharmap ? 6 : 4; 1494 U16 word; 1495 GET_RE_DEBUG_FLAGS_DECL; 1496 1497 PERL_ARGS_ASSERT_DUMP_TRIE; 1498 1499 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", 1500 (int)depth * 2 + 2,"", 1501 "Match","Base","Ofs" ); 1502 1503 for( state = 0 ; state < trie->uniquecharcount ; state++ ) { 1504 SV ** const tmp = av_fetch( revcharmap, state, 0); 1505 if ( tmp ) { 1506 PerlIO_printf( Perl_debug_log, "%*s", 1507 colwidth, 1508 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 1509 PL_colors[0], PL_colors[1], 1510 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 1511 PERL_PV_ESCAPE_FIRSTCHAR 1512 ) 1513 ); 1514 } 1515 } 1516 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------", 1517 (int)depth * 2 + 2,""); 1518 1519 for( state = 0 ; state < trie->uniquecharcount ; state++ ) 1520 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); 1521 PerlIO_printf( Perl_debug_log, "\n"); 1522 1523 for( state = 1 ; state < trie->statecount ; state++ ) { 1524 const U32 base = trie->states[ state ].trans.base; 1525 1526 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", 1527 (int)depth * 2 + 2,"", (UV)state); 1528 1529 if ( trie->states[ state ].wordnum ) { 1530 PerlIO_printf( Perl_debug_log, " W%4X", 1531 trie->states[ state ].wordnum ); 1532 } else { 1533 PerlIO_printf( Perl_debug_log, "%6s", "" ); 1534 } 1535 1536 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base ); 1537 1538 if ( base ) { 1539 U32 ofs = 0; 1540 1541 while( ( base + ofs < trie->uniquecharcount ) || 1542 ( base + ofs - trie->uniquecharcount < trie->lasttrans 1543 && trie->trans[ base + ofs - trie->uniquecharcount ].check 1544 != state)) 1545 ofs++; 1546 1547 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); 1548 1549 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 1550 if ( ( base + ofs >= trie->uniquecharcount ) 1551 && ( base + ofs - trie->uniquecharcount 1552 < trie->lasttrans ) 1553 && trie->trans[ base + ofs 1554 - trie->uniquecharcount ].check == state ) 1555 { 1556 PerlIO_printf( Perl_debug_log, "%*"UVXf, 1557 colwidth, 1558 (UV)trie->trans[ base + ofs 1559 - trie->uniquecharcount ].next ); 1560 } else { 1561 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); 1562 } 1563 } 1564 1565 PerlIO_printf( Perl_debug_log, "]"); 1566 1567 } 1568 PerlIO_printf( Perl_debug_log, "\n" ); 1569 } 1570 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", 1571 (int)depth*2, ""); 1572 for (word=1; word <= trie->wordcount; word++) { 1573 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", 1574 (int)word, (int)(trie->wordinfo[word].prev), 1575 (int)(trie->wordinfo[word].len)); 1576 } 1577 PerlIO_printf(Perl_debug_log, "\n" ); 1578 } 1579 /* 1580 Dumps a fully constructed but uncompressed trie in list form. 1581 List tries normally only are used for construction when the number of 1582 possible chars (trie->uniquecharcount) is very high. 1583 Used for debugging make_trie(). 1584 */ 1585 STATIC void 1586 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, 1587 HV *widecharmap, AV *revcharmap, U32 next_alloc, 1588 U32 depth) 1589 { 1590 U32 state; 1591 SV *sv=sv_newmortal(); 1592 int colwidth= widecharmap ? 6 : 4; 1593 GET_RE_DEBUG_FLAGS_DECL; 1594 1595 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; 1596 1597 /* print out the table precompression. */ 1598 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", 1599 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", 1600 "------:-----+-----------------\n" ); 1601 1602 for( state=1 ; state < next_alloc ; state ++ ) { 1603 U16 charid; 1604 1605 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", 1606 (int)depth * 2 + 2,"", (UV)state ); 1607 if ( ! trie->states[ state ].wordnum ) { 1608 PerlIO_printf( Perl_debug_log, "%5s| ",""); 1609 } else { 1610 PerlIO_printf( Perl_debug_log, "W%4x| ", 1611 trie->states[ state ].wordnum 1612 ); 1613 } 1614 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { 1615 SV ** const tmp = av_fetch( revcharmap, 1616 TRIE_LIST_ITEM(state,charid).forid, 0); 1617 if ( tmp ) { 1618 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", 1619 colwidth, 1620 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 1621 colwidth, 1622 PL_colors[0], PL_colors[1], 1623 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) 1624 | PERL_PV_ESCAPE_FIRSTCHAR 1625 ) , 1626 TRIE_LIST_ITEM(state,charid).forid, 1627 (UV)TRIE_LIST_ITEM(state,charid).newstate 1628 ); 1629 if (!(charid % 10)) 1630 PerlIO_printf(Perl_debug_log, "\n%*s| ", 1631 (int)((depth * 2) + 14), ""); 1632 } 1633 } 1634 PerlIO_printf( Perl_debug_log, "\n"); 1635 } 1636 } 1637 1638 /* 1639 Dumps a fully constructed but uncompressed trie in table form. 1640 This is the normal DFA style state transition table, with a few 1641 twists to facilitate compression later. 1642 Used for debugging make_trie(). 1643 */ 1644 STATIC void 1645 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, 1646 HV *widecharmap, AV *revcharmap, U32 next_alloc, 1647 U32 depth) 1648 { 1649 U32 state; 1650 U16 charid; 1651 SV *sv=sv_newmortal(); 1652 int colwidth= widecharmap ? 6 : 4; 1653 GET_RE_DEBUG_FLAGS_DECL; 1654 1655 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; 1656 1657 /* 1658 print out the table precompression so that we can do a visual check 1659 that they are identical. 1660 */ 1661 1662 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); 1663 1664 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 1665 SV ** const tmp = av_fetch( revcharmap, charid, 0); 1666 if ( tmp ) { 1667 PerlIO_printf( Perl_debug_log, "%*s", 1668 colwidth, 1669 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 1670 PL_colors[0], PL_colors[1], 1671 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 1672 PERL_PV_ESCAPE_FIRSTCHAR 1673 ) 1674 ); 1675 } 1676 } 1677 1678 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" ); 1679 1680 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { 1681 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------"); 1682 } 1683 1684 PerlIO_printf( Perl_debug_log, "\n" ); 1685 1686 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { 1687 1688 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", 1689 (int)depth * 2 + 2,"", 1690 (UV)TRIE_NODENUM( state ) ); 1691 1692 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 1693 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); 1694 if (v) 1695 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v ); 1696 else 1697 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); 1698 } 1699 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { 1700 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", 1701 (UV)trie->trans[ state ].check ); 1702 } else { 1703 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", 1704 (UV)trie->trans[ state ].check, 1705 trie->states[ TRIE_NODENUM( state ) ].wordnum ); 1706 } 1707 } 1708 } 1709 1710 #endif 1711 1712 1713 /* make_trie(startbranch,first,last,tail,word_count,flags,depth) 1714 startbranch: the first branch in the whole branch sequence 1715 first : start branch of sequence of branch-exact nodes. 1716 May be the same as startbranch 1717 last : Thing following the last branch. 1718 May be the same as tail. 1719 tail : item following the branch sequence 1720 count : words in the sequence 1721 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ 1722 depth : indent depth 1723 1724 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. 1725 1726 A trie is an N'ary tree where the branches are determined by digital 1727 decomposition of the key. IE, at the root node you look up the 1st character and 1728 follow that branch repeat until you find the end of the branches. Nodes can be 1729 marked as "accepting" meaning they represent a complete word. Eg: 1730 1731 /he|she|his|hers/ 1732 1733 would convert into the following structure. Numbers represent states, letters 1734 following numbers represent valid transitions on the letter from that state, if 1735 the number is in square brackets it represents an accepting state, otherwise it 1736 will be in parenthesis. 1737 1738 +-h->+-e->[3]-+-r->(8)-+-s->[9] 1739 | | 1740 | (2) 1741 | | 1742 (1) +-i->(6)-+-s->[7] 1743 | 1744 +-s->(3)-+-h->(4)-+-e->[5] 1745 1746 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) 1747 1748 This shows that when matching against the string 'hers' we will begin at state 1 1749 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, 1750 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which 1751 is also accepting. Thus we know that we can match both 'he' and 'hers' with a 1752 single traverse. We store a mapping from accepting to state to which word was 1753 matched, and then when we have multiple possibilities we try to complete the 1754 rest of the regex in the order in which they occured in the alternation. 1755 1756 The only prior NFA like behaviour that would be changed by the TRIE support is 1757 the silent ignoring of duplicate alternations which are of the form: 1758 1759 / (DUPE|DUPE) X? (?{ ... }) Y /x 1760 1761 Thus EVAL blocks following a trie may be called a different number of times with 1762 and without the optimisation. With the optimisations dupes will be silently 1763 ignored. This inconsistent behaviour of EVAL type nodes is well established as 1764 the following demonstrates: 1765 1766 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ 1767 1768 which prints out 'word' three times, but 1769 1770 'words'=~/(word|word|word)(?{ print $1 })S/ 1771 1772 which doesnt print it out at all. This is due to other optimisations kicking in. 1773 1774 Example of what happens on a structural level: 1775 1776 The regexp /(ac|ad|ab)+/ will produce the following debug output: 1777 1778 1: CURLYM[1] {1,32767}(18) 1779 5: BRANCH(8) 1780 6: EXACT <ac>(16) 1781 8: BRANCH(11) 1782 9: EXACT <ad>(16) 1783 11: BRANCH(14) 1784 12: EXACT <ab>(16) 1785 16: SUCCEED(0) 1786 17: NOTHING(18) 1787 18: END(0) 1788 1789 This would be optimizable with startbranch=5, first=5, last=16, tail=16 1790 and should turn into: 1791 1792 1: CURLYM[1] {1,32767}(18) 1793 5: TRIE(16) 1794 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] 1795 <ac> 1796 <ad> 1797 <ab> 1798 16: SUCCEED(0) 1799 17: NOTHING(18) 1800 18: END(0) 1801 1802 Cases where tail != last would be like /(?foo|bar)baz/: 1803 1804 1: BRANCH(4) 1805 2: EXACT <foo>(8) 1806 4: BRANCH(7) 1807 5: EXACT <bar>(8) 1808 7: TAIL(8) 1809 8: EXACT <baz>(10) 1810 10: END(0) 1811 1812 which would be optimizable with startbranch=1, first=1, last=7, tail=8 1813 and would end up looking like: 1814 1815 1: TRIE(8) 1816 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] 1817 <foo> 1818 <bar> 1819 7: TAIL(8) 1820 8: EXACT <baz>(10) 1821 10: END(0) 1822 1823 d = uvchr_to_utf8_flags(d, uv, 0); 1824 1825 is the recommended Unicode-aware way of saying 1826 1827 *(d++) = uv; 1828 */ 1829 1830 #define TRIE_STORE_REVCHAR(val) \ 1831 STMT_START { \ 1832 if (UTF) { \ 1833 SV *zlopp = newSV(7); /* XXX: optimize me */ \ 1834 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ 1835 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ 1836 SvCUR_set(zlopp, kapow - flrbbbbb); \ 1837 SvPOK_on(zlopp); \ 1838 SvUTF8_on(zlopp); \ 1839 av_push(revcharmap, zlopp); \ 1840 } else { \ 1841 char ooooff = (char)val; \ 1842 av_push(revcharmap, newSVpvn(&ooooff, 1)); \ 1843 } \ 1844 } STMT_END 1845 1846 /* This gets the next character from the input, folding it if not already 1847 * folded. */ 1848 #define TRIE_READ_CHAR STMT_START { \ 1849 wordlen++; \ 1850 if ( UTF ) { \ 1851 /* if it is UTF then it is either already folded, or does not need \ 1852 * folding */ \ 1853 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ 1854 } \ 1855 else if (folder == PL_fold_latin1) { \ 1856 /* This folder implies Unicode rules, which in the range expressible \ 1857 * by not UTF is the lower case, with the two exceptions, one of \ 1858 * which should have been taken care of before calling this */ \ 1859 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ 1860 uvc = toLOWER_L1(*uc); \ 1861 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ 1862 len = 1; \ 1863 } else { \ 1864 /* raw data, will be folded later if needed */ \ 1865 uvc = (U32)*uc; \ 1866 len = 1; \ 1867 } \ 1868 } STMT_END 1869 1870 1871 1872 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ 1873 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ 1874 U32 ging = TRIE_LIST_LEN( state ) *= 2; \ 1875 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ 1876 } \ 1877 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ 1878 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ 1879 TRIE_LIST_CUR( state )++; \ 1880 } STMT_END 1881 1882 #define TRIE_LIST_NEW(state) STMT_START { \ 1883 Newxz( trie->states[ state ].trans.list, \ 1884 4, reg_trie_trans_le ); \ 1885 TRIE_LIST_CUR( state ) = 1; \ 1886 TRIE_LIST_LEN( state ) = 4; \ 1887 } STMT_END 1888 1889 #define TRIE_HANDLE_WORD(state) STMT_START { \ 1890 U16 dupe= trie->states[ state ].wordnum; \ 1891 regnode * const noper_next = regnext( noper ); \ 1892 \ 1893 DEBUG_r({ \ 1894 /* store the word for dumping */ \ 1895 SV* tmp; \ 1896 if (OP(noper) != NOTHING) \ 1897 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ 1898 else \ 1899 tmp = newSVpvn_utf8( "", 0, UTF ); \ 1900 av_push( trie_words, tmp ); \ 1901 }); \ 1902 \ 1903 curword++; \ 1904 trie->wordinfo[curword].prev = 0; \ 1905 trie->wordinfo[curword].len = wordlen; \ 1906 trie->wordinfo[curword].accept = state; \ 1907 \ 1908 if ( noper_next < tail ) { \ 1909 if (!trie->jump) \ 1910 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ 1911 sizeof(U16) ); \ 1912 trie->jump[curword] = (U16)(noper_next - convert); \ 1913 if (!jumper) \ 1914 jumper = noper_next; \ 1915 if (!nextbranch) \ 1916 nextbranch= regnext(cur); \ 1917 } \ 1918 \ 1919 if ( dupe ) { \ 1920 /* It's a dupe. Pre-insert into the wordinfo[].prev */\ 1921 /* chain, so that when the bits of chain are later */\ 1922 /* linked together, the dups appear in the chain */\ 1923 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ 1924 trie->wordinfo[dupe].prev = curword; \ 1925 } else { \ 1926 /* we haven't inserted this word yet. */ \ 1927 trie->states[ state ].wordnum = curword; \ 1928 } \ 1929 } STMT_END 1930 1931 1932 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ 1933 ( ( base + charid >= ucharcount \ 1934 && base + charid < ubound \ 1935 && state == trie->trans[ base - ucharcount + charid ].check \ 1936 && trie->trans[ base - ucharcount + charid ].next ) \ 1937 ? trie->trans[ base - ucharcount + charid ].next \ 1938 : ( state==1 ? special : 0 ) \ 1939 ) 1940 1941 #define MADE_TRIE 1 1942 #define MADE_JUMP_TRIE 2 1943 #define MADE_EXACT_TRIE 4 1944 1945 STATIC I32 1946 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, 1947 regnode *first, regnode *last, regnode *tail, 1948 U32 word_count, U32 flags, U32 depth) 1949 { 1950 dVAR; 1951 /* first pass, loop through and scan words */ 1952 reg_trie_data *trie; 1953 HV *widecharmap = NULL; 1954 AV *revcharmap = newAV(); 1955 regnode *cur; 1956 STRLEN len = 0; 1957 UV uvc = 0; 1958 U16 curword = 0; 1959 U32 next_alloc = 0; 1960 regnode *jumper = NULL; 1961 regnode *nextbranch = NULL; 1962 regnode *convert = NULL; 1963 U32 *prev_states; /* temp array mapping each state to previous one */ 1964 /* we just use folder as a flag in utf8 */ 1965 const U8 * folder = NULL; 1966 1967 #ifdef DEBUGGING 1968 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu")); 1969 AV *trie_words = NULL; 1970 /* along with revcharmap, this only used during construction but both are 1971 * useful during debugging so we store them in the struct when debugging. 1972 */ 1973 #else 1974 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); 1975 STRLEN trie_charcount=0; 1976 #endif 1977 SV *re_trie_maxbuff; 1978 GET_RE_DEBUG_FLAGS_DECL; 1979 1980 PERL_ARGS_ASSERT_MAKE_TRIE; 1981 #ifndef DEBUGGING 1982 PERL_UNUSED_ARG(depth); 1983 #endif 1984 1985 switch (flags) { 1986 case EXACT: break; 1987 case EXACTFA: 1988 case EXACTFU_SS: 1989 case EXACTFU: folder = PL_fold_latin1; break; 1990 case EXACTF: folder = PL_fold; break; 1991 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); 1992 } 1993 1994 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); 1995 trie->refcount = 1; 1996 trie->startstate = 1; 1997 trie->wordcount = word_count; 1998 RExC_rxi->data->data[ data_slot ] = (void*)trie; 1999 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); 2000 if (flags == EXACT) 2001 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); 2002 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( 2003 trie->wordcount+1, sizeof(reg_trie_wordinfo)); 2004 2005 DEBUG_r({ 2006 trie_words = newAV(); 2007 }); 2008 2009 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); 2010 if (!SvIOK(re_trie_maxbuff)) { 2011 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 2012 } 2013 DEBUG_TRIE_COMPILE_r({ 2014 PerlIO_printf( Perl_debug_log, 2015 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", 2016 (int)depth * 2 + 2, "", 2017 REG_NODE_NUM(startbranch),REG_NODE_NUM(first), 2018 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); 2019 }); 2020 2021 /* Find the node we are going to overwrite */ 2022 if ( first == startbranch && OP( last ) != BRANCH ) { 2023 /* whole branch chain */ 2024 convert = first; 2025 } else { 2026 /* branch sub-chain */ 2027 convert = NEXTOPER( first ); 2028 } 2029 2030 /* -- First loop and Setup -- 2031 2032 We first traverse the branches and scan each word to determine if it 2033 contains widechars, and how many unique chars there are, this is 2034 important as we have to build a table with at least as many columns as we 2035 have unique chars. 2036 2037 We use an array of integers to represent the character codes 0..255 2038 (trie->charmap) and we use a an HV* to store Unicode characters. We use 2039 the native representation of the character value as the key and IV's for 2040 the coded index. 2041 2042 *TODO* If we keep track of how many times each character is used we can 2043 remap the columns so that the table compression later on is more 2044 efficient in terms of memory by ensuring the most common value is in the 2045 middle and the least common are on the outside. IMO this would be better 2046 than a most to least common mapping as theres a decent chance the most 2047 common letter will share a node with the least common, meaning the node 2048 will not be compressible. With a middle is most common approach the worst 2049 case is when we have the least common nodes twice. 2050 2051 */ 2052 2053 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 2054 regnode *noper = NEXTOPER( cur ); 2055 const U8 *uc = (U8*)STRING( noper ); 2056 const U8 *e = uc + STR_LEN( noper ); 2057 int foldlen = 0; 2058 U32 wordlen = 0; /* required init */ 2059 STRLEN minchars = 0; 2060 STRLEN maxchars = 0; 2061 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the 2062 bitmap?*/ 2063 2064 if (OP(noper) == NOTHING) { 2065 regnode *noper_next= regnext(noper); 2066 if (noper_next != tail && OP(noper_next) == flags) { 2067 noper = noper_next; 2068 uc= (U8*)STRING(noper); 2069 e= uc + STR_LEN(noper); 2070 trie->minlen= STR_LEN(noper); 2071 } else { 2072 trie->minlen= 0; 2073 continue; 2074 } 2075 } 2076 2077 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ 2078 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte 2079 regardless of encoding */ 2080 if (OP( noper ) == EXACTFU_SS) { 2081 /* false positives are ok, so just set this */ 2082 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); 2083 } 2084 } 2085 for ( ; uc < e ; uc += len ) { /* Look at each char in the current 2086 branch */ 2087 TRIE_CHARCOUNT(trie)++; 2088 TRIE_READ_CHAR; 2089 2090 /* TRIE_READ_CHAR returns the current character, or its fold if /i 2091 * is in effect. Under /i, this character can match itself, or 2092 * anything that folds to it. If not under /i, it can match just 2093 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN 2094 * all fold to k, and all are single characters. But some folds 2095 * expand to more than one character, so for example LATIN SMALL 2096 * LIGATURE FFI folds to the three character sequence 'ffi'. If 2097 * the string beginning at 'uc' is 'ffi', it could be matched by 2098 * three characters, or just by the one ligature character. (It 2099 * could also be matched by two characters: LATIN SMALL LIGATURE FF 2100 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). 2101 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also 2102 * match.) The trie needs to know the minimum and maximum number 2103 * of characters that could match so that it can use size alone to 2104 * quickly reject many match attempts. The max is simple: it is 2105 * the number of folded characters in this branch (since a fold is 2106 * never shorter than what folds to it. */ 2107 2108 maxchars++; 2109 2110 /* And the min is equal to the max if not under /i (indicated by 2111 * 'folder' being NULL), or there are no multi-character folds. If 2112 * there is a multi-character fold, the min is incremented just 2113 * once, for the character that folds to the sequence. Each 2114 * character in the sequence needs to be added to the list below of 2115 * characters in the trie, but we count only the first towards the 2116 * min number of characters needed. This is done through the 2117 * variable 'foldlen', which is returned by the macros that look 2118 * for these sequences as the number of bytes the sequence 2119 * occupies. Each time through the loop, we decrement 'foldlen' by 2120 * how many bytes the current char occupies. Only when it reaches 2121 * 0 do we increment 'minchars' or look for another multi-character 2122 * sequence. */ 2123 if (folder == NULL) { 2124 minchars++; 2125 } 2126 else if (foldlen > 0) { 2127 foldlen -= (UTF) ? UTF8SKIP(uc) : 1; 2128 } 2129 else { 2130 minchars++; 2131 2132 /* See if *uc is the beginning of a multi-character fold. If 2133 * so, we decrement the length remaining to look at, to account 2134 * for the current character this iteration. (We can use 'uc' 2135 * instead of the fold returned by TRIE_READ_CHAR because for 2136 * non-UTF, the latin1_safe macro is smart enough to account 2137 * for all the unfolded characters, and because for UTF, the 2138 * string will already have been folded earlier in the 2139 * compilation process */ 2140 if (UTF) { 2141 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { 2142 foldlen -= UTF8SKIP(uc); 2143 } 2144 } 2145 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { 2146 foldlen--; 2147 } 2148 } 2149 2150 /* The current character (and any potential folds) should be added 2151 * to the possible matching characters for this position in this 2152 * branch */ 2153 if ( uvc < 256 ) { 2154 if ( folder ) { 2155 U8 folded= folder[ (U8) uvc ]; 2156 if ( !trie->charmap[ folded ] ) { 2157 trie->charmap[ folded ]=( ++trie->uniquecharcount ); 2158 TRIE_STORE_REVCHAR( folded ); 2159 } 2160 } 2161 if ( !trie->charmap[ uvc ] ) { 2162 trie->charmap[ uvc ]=( ++trie->uniquecharcount ); 2163 TRIE_STORE_REVCHAR( uvc ); 2164 } 2165 if ( set_bit ) { 2166 /* store the codepoint in the bitmap, and its folded 2167 * equivalent. */ 2168 TRIE_BITMAP_SET(trie, uvc); 2169 2170 /* store the folded codepoint */ 2171 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); 2172 2173 if ( !UTF ) { 2174 /* store first byte of utf8 representation of 2175 variant codepoints */ 2176 if (! UVCHR_IS_INVARIANT(uvc)) { 2177 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); 2178 } 2179 } 2180 set_bit = 0; /* We've done our bit :-) */ 2181 } 2182 } else { 2183 2184 /* XXX We could come up with the list of code points that fold 2185 * to this using PL_utf8_foldclosures, except not for 2186 * multi-char folds, as there may be multiple combinations 2187 * there that could work, which needs to wait until runtime to 2188 * resolve (The comment about LIGATURE FFI above is such an 2189 * example */ 2190 2191 SV** svpp; 2192 if ( !widecharmap ) 2193 widecharmap = newHV(); 2194 2195 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); 2196 2197 if ( !svpp ) 2198 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc ); 2199 2200 if ( !SvTRUE( *svpp ) ) { 2201 sv_setiv( *svpp, ++trie->uniquecharcount ); 2202 TRIE_STORE_REVCHAR(uvc); 2203 } 2204 } 2205 } /* end loop through characters in this branch of the trie */ 2206 2207 /* We take the min and max for this branch and combine to find the min 2208 * and max for all branches processed so far */ 2209 if( cur == first ) { 2210 trie->minlen = minchars; 2211 trie->maxlen = maxchars; 2212 } else if (minchars < trie->minlen) { 2213 trie->minlen = minchars; 2214 } else if (maxchars > trie->maxlen) { 2215 trie->maxlen = maxchars; 2216 } 2217 } /* end first pass */ 2218 DEBUG_TRIE_COMPILE_r( 2219 PerlIO_printf( Perl_debug_log, 2220 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", 2221 (int)depth * 2 + 2,"", 2222 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, 2223 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, 2224 (int)trie->minlen, (int)trie->maxlen ) 2225 ); 2226 2227 /* 2228 We now know what we are dealing with in terms of unique chars and 2229 string sizes so we can calculate how much memory a naive 2230 representation using a flat table will take. If it's over a reasonable 2231 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory 2232 conservative but potentially much slower representation using an array 2233 of lists. 2234 2235 At the end we convert both representations into the same compressed 2236 form that will be used in regexec.c for matching with. The latter 2237 is a form that cannot be used to construct with but has memory 2238 properties similar to the list form and access properties similar 2239 to the table form making it both suitable for fast searches and 2240 small enough that its feasable to store for the duration of a program. 2241 2242 See the comment in the code where the compressed table is produced 2243 inplace from the flat tabe representation for an explanation of how 2244 the compression works. 2245 2246 */ 2247 2248 2249 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); 2250 prev_states[1] = 0; 2251 2252 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) 2253 > SvIV(re_trie_maxbuff) ) 2254 { 2255 /* 2256 Second Pass -- Array Of Lists Representation 2257 2258 Each state will be represented by a list of charid:state records 2259 (reg_trie_trans_le) the first such element holds the CUR and LEN 2260 points of the allocated array. (See defines above). 2261 2262 We build the initial structure using the lists, and then convert 2263 it into the compressed table form which allows faster lookups 2264 (but cant be modified once converted). 2265 */ 2266 2267 STRLEN transcount = 1; 2268 2269 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, 2270 "%*sCompiling trie using list compiler\n", 2271 (int)depth * 2 + 2, "")); 2272 2273 trie->states = (reg_trie_state *) 2274 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 2275 sizeof(reg_trie_state) ); 2276 TRIE_LIST_NEW(1); 2277 next_alloc = 2; 2278 2279 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 2280 2281 regnode *noper = NEXTOPER( cur ); 2282 U8 *uc = (U8*)STRING( noper ); 2283 const U8 *e = uc + STR_LEN( noper ); 2284 U32 state = 1; /* required init */ 2285 U16 charid = 0; /* sanity init */ 2286 U32 wordlen = 0; /* required init */ 2287 2288 if (OP(noper) == NOTHING) { 2289 regnode *noper_next= regnext(noper); 2290 if (noper_next != tail && OP(noper_next) == flags) { 2291 noper = noper_next; 2292 uc= (U8*)STRING(noper); 2293 e= uc + STR_LEN(noper); 2294 } 2295 } 2296 2297 if (OP(noper) != NOTHING) { 2298 for ( ; uc < e ; uc += len ) { 2299 2300 TRIE_READ_CHAR; 2301 2302 if ( uvc < 256 ) { 2303 charid = trie->charmap[ uvc ]; 2304 } else { 2305 SV** const svpp = hv_fetch( widecharmap, 2306 (char*)&uvc, 2307 sizeof( UV ), 2308 0); 2309 if ( !svpp ) { 2310 charid = 0; 2311 } else { 2312 charid=(U16)SvIV( *svpp ); 2313 } 2314 } 2315 /* charid is now 0 if we dont know the char read, or 2316 * nonzero if we do */ 2317 if ( charid ) { 2318 2319 U16 check; 2320 U32 newstate = 0; 2321 2322 charid--; 2323 if ( !trie->states[ state ].trans.list ) { 2324 TRIE_LIST_NEW( state ); 2325 } 2326 for ( check = 1; 2327 check <= TRIE_LIST_USED( state ); 2328 check++ ) 2329 { 2330 if ( TRIE_LIST_ITEM( state, check ).forid 2331 == charid ) 2332 { 2333 newstate = TRIE_LIST_ITEM( state, check ).newstate; 2334 break; 2335 } 2336 } 2337 if ( ! newstate ) { 2338 newstate = next_alloc++; 2339 prev_states[newstate] = state; 2340 TRIE_LIST_PUSH( state, charid, newstate ); 2341 transcount++; 2342 } 2343 state = newstate; 2344 } else { 2345 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); 2346 } 2347 } 2348 } 2349 TRIE_HANDLE_WORD(state); 2350 2351 } /* end second pass */ 2352 2353 /* next alloc is the NEXT state to be allocated */ 2354 trie->statecount = next_alloc; 2355 trie->states = (reg_trie_state *) 2356 PerlMemShared_realloc( trie->states, 2357 next_alloc 2358 * sizeof(reg_trie_state) ); 2359 2360 /* and now dump it out before we compress it */ 2361 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, 2362 revcharmap, next_alloc, 2363 depth+1) 2364 ); 2365 2366 trie->trans = (reg_trie_trans *) 2367 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); 2368 { 2369 U32 state; 2370 U32 tp = 0; 2371 U32 zp = 0; 2372 2373 2374 for( state=1 ; state < next_alloc ; state ++ ) { 2375 U32 base=0; 2376 2377 /* 2378 DEBUG_TRIE_COMPILE_MORE_r( 2379 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp) 2380 ); 2381 */ 2382 2383 if (trie->states[state].trans.list) { 2384 U16 minid=TRIE_LIST_ITEM( state, 1).forid; 2385 U16 maxid=minid; 2386 U16 idx; 2387 2388 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 2389 const U16 forid = TRIE_LIST_ITEM( state, idx).forid; 2390 if ( forid < minid ) { 2391 minid=forid; 2392 } else if ( forid > maxid ) { 2393 maxid=forid; 2394 } 2395 } 2396 if ( transcount < tp + maxid - minid + 1) { 2397 transcount *= 2; 2398 trie->trans = (reg_trie_trans *) 2399 PerlMemShared_realloc( trie->trans, 2400 transcount 2401 * sizeof(reg_trie_trans) ); 2402 Zero( trie->trans + (transcount / 2), 2403 transcount / 2, 2404 reg_trie_trans ); 2405 } 2406 base = trie->uniquecharcount + tp - minid; 2407 if ( maxid == minid ) { 2408 U32 set = 0; 2409 for ( ; zp < tp ; zp++ ) { 2410 if ( ! trie->trans[ zp ].next ) { 2411 base = trie->uniquecharcount + zp - minid; 2412 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 2413 1).newstate; 2414 trie->trans[ zp ].check = state; 2415 set = 1; 2416 break; 2417 } 2418 } 2419 if ( !set ) { 2420 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 2421 1).newstate; 2422 trie->trans[ tp ].check = state; 2423 tp++; 2424 zp = tp; 2425 } 2426 } else { 2427 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 2428 const U32 tid = base 2429 - trie->uniquecharcount 2430 + TRIE_LIST_ITEM( state, idx ).forid; 2431 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, 2432 idx ).newstate; 2433 trie->trans[ tid ].check = state; 2434 } 2435 tp += ( maxid - minid + 1 ); 2436 } 2437 Safefree(trie->states[ state ].trans.list); 2438 } 2439 /* 2440 DEBUG_TRIE_COMPILE_MORE_r( 2441 PerlIO_printf( Perl_debug_log, " base: %d\n",base); 2442 ); 2443 */ 2444 trie->states[ state ].trans.base=base; 2445 } 2446 trie->lasttrans = tp + 1; 2447 } 2448 } else { 2449 /* 2450 Second Pass -- Flat Table Representation. 2451 2452 we dont use the 0 slot of either trans[] or states[] so we add 1 to 2453 each. We know that we will need Charcount+1 trans at most to store 2454 the data (one row per char at worst case) So we preallocate both 2455 structures assuming worst case. 2456 2457 We then construct the trie using only the .next slots of the entry 2458 structs. 2459 2460 We use the .check field of the first entry of the node temporarily 2461 to make compression both faster and easier by keeping track of how 2462 many non zero fields are in the node. 2463 2464 Since trans are numbered from 1 any 0 pointer in the table is a FAIL 2465 transition. 2466 2467 There are two terms at use here: state as a TRIE_NODEIDX() which is 2468 a number representing the first entry of the node, and state as a 2469 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) 2470 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) 2471 if there are 2 entrys per node. eg: 2472 2473 A B A B 2474 1. 2 4 1. 3 7 2475 2. 0 3 3. 0 5 2476 3. 0 0 5. 0 0 2477 4. 0 0 7. 0 0 2478 2479 The table is internally in the right hand, idx form. However as we 2480 also have to deal with the states array which is indexed by nodenum 2481 we have to use TRIE_NODENUM() to convert. 2482 2483 */ 2484 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, 2485 "%*sCompiling trie using table compiler\n", 2486 (int)depth * 2 + 2, "")); 2487 2488 trie->trans = (reg_trie_trans *) 2489 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) 2490 * trie->uniquecharcount + 1, 2491 sizeof(reg_trie_trans) ); 2492 trie->states = (reg_trie_state *) 2493 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 2494 sizeof(reg_trie_state) ); 2495 next_alloc = trie->uniquecharcount + 1; 2496 2497 2498 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 2499 2500 regnode *noper = NEXTOPER( cur ); 2501 const U8 *uc = (U8*)STRING( noper ); 2502 const U8 *e = uc + STR_LEN( noper ); 2503 2504 U32 state = 1; /* required init */ 2505 2506 U16 charid = 0; /* sanity init */ 2507 U32 accept_state = 0; /* sanity init */ 2508 2509 U32 wordlen = 0; /* required init */ 2510 2511 if (OP(noper) == NOTHING) { 2512 regnode *noper_next= regnext(noper); 2513 if (noper_next != tail && OP(noper_next) == flags) { 2514 noper = noper_next; 2515 uc= (U8*)STRING(noper); 2516 e= uc + STR_LEN(noper); 2517 } 2518 } 2519 2520 if ( OP(noper) != NOTHING ) { 2521 for ( ; uc < e ; uc += len ) { 2522 2523 TRIE_READ_CHAR; 2524 2525 if ( uvc < 256 ) { 2526 charid = trie->charmap[ uvc ]; 2527 } else { 2528 SV* const * const svpp = hv_fetch( widecharmap, 2529 (char*)&uvc, 2530 sizeof( UV ), 2531 0); 2532 charid = svpp ? (U16)SvIV(*svpp) : 0; 2533 } 2534 if ( charid ) { 2535 charid--; 2536 if ( !trie->trans[ state + charid ].next ) { 2537 trie->trans[ state + charid ].next = next_alloc; 2538 trie->trans[ state ].check++; 2539 prev_states[TRIE_NODENUM(next_alloc)] 2540 = TRIE_NODENUM(state); 2541 next_alloc += trie->uniquecharcount; 2542 } 2543 state = trie->trans[ state + charid ].next; 2544 } else { 2545 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); 2546 } 2547 /* charid is now 0 if we dont know the char read, or 2548 * nonzero if we do */ 2549 } 2550 } 2551 accept_state = TRIE_NODENUM( state ); 2552 TRIE_HANDLE_WORD(accept_state); 2553 2554 } /* end second pass */ 2555 2556 /* and now dump it out before we compress it */ 2557 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, 2558 revcharmap, 2559 next_alloc, depth+1)); 2560 2561 { 2562 /* 2563 * Inplace compress the table.* 2564 2565 For sparse data sets the table constructed by the trie algorithm will 2566 be mostly 0/FAIL transitions or to put it another way mostly empty. 2567 (Note that leaf nodes will not contain any transitions.) 2568 2569 This algorithm compresses the tables by eliminating most such 2570 transitions, at the cost of a modest bit of extra work during lookup: 2571 2572 - Each states[] entry contains a .base field which indicates the 2573 index in the state[] array wheres its transition data is stored. 2574 2575 - If .base is 0 there are no valid transitions from that node. 2576 2577 - If .base is nonzero then charid is added to it to find an entry in 2578 the trans array. 2579 2580 -If trans[states[state].base+charid].check!=state then the 2581 transition is taken to be a 0/Fail transition. Thus if there are fail 2582 transitions at the front of the node then the .base offset will point 2583 somewhere inside the previous nodes data (or maybe even into a node 2584 even earlier), but the .check field determines if the transition is 2585 valid. 2586 2587 XXX - wrong maybe? 2588 The following process inplace converts the table to the compressed 2589 table: We first do not compress the root node 1,and mark all its 2590 .check pointers as 1 and set its .base pointer as 1 as well. This 2591 allows us to do a DFA construction from the compressed table later, 2592 and ensures that any .base pointers we calculate later are greater 2593 than 0. 2594 2595 - We set 'pos' to indicate the first entry of the second node. 2596 2597 - We then iterate over the columns of the node, finding the first and 2598 last used entry at l and m. We then copy l..m into pos..(pos+m-l), 2599 and set the .check pointers accordingly, and advance pos 2600 appropriately and repreat for the next node. Note that when we copy 2601 the next pointers we have to convert them from the original 2602 NODEIDX form to NODENUM form as the former is not valid post 2603 compression. 2604 2605 - If a node has no transitions used we mark its base as 0 and do not 2606 advance the pos pointer. 2607 2608 - If a node only has one transition we use a second pointer into the 2609 structure to fill in allocated fail transitions from other states. 2610 This pointer is independent of the main pointer and scans forward 2611 looking for null transitions that are allocated to a state. When it 2612 finds one it writes the single transition into the "hole". If the 2613 pointer doesnt find one the single transition is appended as normal. 2614 2615 - Once compressed we can Renew/realloc the structures to release the 2616 excess space. 2617 2618 See "Table-Compression Methods" in sec 3.9 of the Red Dragon, 2619 specifically Fig 3.47 and the associated pseudocode. 2620 2621 demq 2622 */ 2623 const U32 laststate = TRIE_NODENUM( next_alloc ); 2624 U32 state, charid; 2625 U32 pos = 0, zp=0; 2626 trie->statecount = laststate; 2627 2628 for ( state = 1 ; state < laststate ; state++ ) { 2629 U8 flag = 0; 2630 const U32 stateidx = TRIE_NODEIDX( state ); 2631 const U32 o_used = trie->trans[ stateidx ].check; 2632 U32 used = trie->trans[ stateidx ].check; 2633 trie->trans[ stateidx ].check = 0; 2634 2635 for ( charid = 0; 2636 used && charid < trie->uniquecharcount; 2637 charid++ ) 2638 { 2639 if ( flag || trie->trans[ stateidx + charid ].next ) { 2640 if ( trie->trans[ stateidx + charid ].next ) { 2641 if (o_used == 1) { 2642 for ( ; zp < pos ; zp++ ) { 2643 if ( ! trie->trans[ zp ].next ) { 2644 break; 2645 } 2646 } 2647 trie->states[ state ].trans.base 2648 = zp 2649 + trie->uniquecharcount 2650 - charid ; 2651 trie->trans[ zp ].next 2652 = SAFE_TRIE_NODENUM( trie->trans[ stateidx 2653 + charid ].next ); 2654 trie->trans[ zp ].check = state; 2655 if ( ++zp > pos ) pos = zp; 2656 break; 2657 } 2658 used--; 2659 } 2660 if ( !flag ) { 2661 flag = 1; 2662 trie->states[ state ].trans.base 2663 = pos + trie->uniquecharcount - charid ; 2664 } 2665 trie->trans[ pos ].next 2666 = SAFE_TRIE_NODENUM( 2667 trie->trans[ stateidx + charid ].next ); 2668 trie->trans[ pos ].check = state; 2669 pos++; 2670 } 2671 } 2672 } 2673 trie->lasttrans = pos + 1; 2674 trie->states = (reg_trie_state *) 2675 PerlMemShared_realloc( trie->states, laststate 2676 * sizeof(reg_trie_state) ); 2677 DEBUG_TRIE_COMPILE_MORE_r( 2678 PerlIO_printf( Perl_debug_log, 2679 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", 2680 (int)depth * 2 + 2,"", 2681 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount 2682 + 1 ), 2683 (IV)next_alloc, 2684 (IV)pos, 2685 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); 2686 ); 2687 2688 } /* end table compress */ 2689 } 2690 DEBUG_TRIE_COMPILE_MORE_r( 2691 PerlIO_printf(Perl_debug_log, 2692 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", 2693 (int)depth * 2 + 2, "", 2694 (UV)trie->statecount, 2695 (UV)trie->lasttrans) 2696 ); 2697 /* resize the trans array to remove unused space */ 2698 trie->trans = (reg_trie_trans *) 2699 PerlMemShared_realloc( trie->trans, trie->lasttrans 2700 * sizeof(reg_trie_trans) ); 2701 2702 { /* Modify the program and insert the new TRIE node */ 2703 U8 nodetype =(U8)(flags & 0xFF); 2704 char *str=NULL; 2705 2706 #ifdef DEBUGGING 2707 regnode *optimize = NULL; 2708 #ifdef RE_TRACK_PATTERN_OFFSETS 2709 2710 U32 mjd_offset = 0; 2711 U32 mjd_nodelen = 0; 2712 #endif /* RE_TRACK_PATTERN_OFFSETS */ 2713 #endif /* DEBUGGING */ 2714 /* 2715 This means we convert either the first branch or the first Exact, 2716 depending on whether the thing following (in 'last') is a branch 2717 or not and whther first is the startbranch (ie is it a sub part of 2718 the alternation or is it the whole thing.) 2719 Assuming its a sub part we convert the EXACT otherwise we convert 2720 the whole branch sequence, including the first. 2721 */ 2722 /* Find the node we are going to overwrite */ 2723 if ( first != startbranch || OP( last ) == BRANCH ) { 2724 /* branch sub-chain */ 2725 NEXT_OFF( first ) = (U16)(last - first); 2726 #ifdef RE_TRACK_PATTERN_OFFSETS 2727 DEBUG_r({ 2728 mjd_offset= Node_Offset((convert)); 2729 mjd_nodelen= Node_Length((convert)); 2730 }); 2731 #endif 2732 /* whole branch chain */ 2733 } 2734 #ifdef RE_TRACK_PATTERN_OFFSETS 2735 else { 2736 DEBUG_r({ 2737 const regnode *nop = NEXTOPER( convert ); 2738 mjd_offset= Node_Offset((nop)); 2739 mjd_nodelen= Node_Length((nop)); 2740 }); 2741 } 2742 DEBUG_OPTIMISE_r( 2743 PerlIO_printf(Perl_debug_log, 2744 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", 2745 (int)depth * 2 + 2, "", 2746 (UV)mjd_offset, (UV)mjd_nodelen) 2747 ); 2748 #endif 2749 /* But first we check to see if there is a common prefix we can 2750 split out as an EXACT and put in front of the TRIE node. */ 2751 trie->startstate= 1; 2752 if ( trie->bitmap && !widecharmap && !trie->jump ) { 2753 U32 state; 2754 for ( state = 1 ; state < trie->statecount-1 ; state++ ) { 2755 U32 ofs = 0; 2756 I32 idx = -1; 2757 U32 count = 0; 2758 const U32 base = trie->states[ state ].trans.base; 2759 2760 if ( trie->states[state].wordnum ) 2761 count = 1; 2762 2763 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 2764 if ( ( base + ofs >= trie->uniquecharcount ) && 2765 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && 2766 trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) 2767 { 2768 if ( ++count > 1 ) { 2769 SV **tmp = av_fetch( revcharmap, ofs, 0); 2770 const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); 2771 if ( state == 1 ) break; 2772 if ( count == 2 ) { 2773 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); 2774 DEBUG_OPTIMISE_r( 2775 PerlIO_printf(Perl_debug_log, 2776 "%*sNew Start State=%"UVuf" Class: [", 2777 (int)depth * 2 + 2, "", 2778 (UV)state)); 2779 if (idx >= 0) { 2780 SV ** const tmp = av_fetch( revcharmap, idx, 0); 2781 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); 2782 2783 TRIE_BITMAP_SET(trie,*ch); 2784 if ( folder ) 2785 TRIE_BITMAP_SET(trie, folder[ *ch ]); 2786 DEBUG_OPTIMISE_r( 2787 PerlIO_printf(Perl_debug_log, "%s", (char*)ch) 2788 ); 2789 } 2790 } 2791 TRIE_BITMAP_SET(trie,*ch); 2792 if ( folder ) 2793 TRIE_BITMAP_SET(trie,folder[ *ch ]); 2794 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch)); 2795 } 2796 idx = ofs; 2797 } 2798 } 2799 if ( count == 1 ) { 2800 SV **tmp = av_fetch( revcharmap, idx, 0); 2801 STRLEN len; 2802 char *ch = SvPV( *tmp, len ); 2803 DEBUG_OPTIMISE_r({ 2804 SV *sv=sv_newmortal(); 2805 PerlIO_printf( Perl_debug_log, 2806 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", 2807 (int)depth * 2 + 2, "", 2808 (UV)state, (UV)idx, 2809 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, 2810 PL_colors[0], PL_colors[1], 2811 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 2812 PERL_PV_ESCAPE_FIRSTCHAR 2813 ) 2814 ); 2815 }); 2816 if ( state==1 ) { 2817 OP( convert ) = nodetype; 2818 str=STRING(convert); 2819 STR_LEN(convert)=0; 2820 } 2821 STR_LEN(convert) += len; 2822 while (len--) 2823 *str++ = *ch++; 2824 } else { 2825 #ifdef DEBUGGING 2826 if (state>1) 2827 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); 2828 #endif 2829 break; 2830 } 2831 } 2832 trie->prefixlen = (state-1); 2833 if (str) { 2834 regnode *n = convert+NODE_SZ_STR(convert); 2835 NEXT_OFF(convert) = NODE_SZ_STR(convert); 2836 trie->startstate = state; 2837 trie->minlen -= (state - 1); 2838 trie->maxlen -= (state - 1); 2839 #ifdef DEBUGGING 2840 /* At least the UNICOS C compiler choked on this 2841 * being argument to DEBUG_r(), so let's just have 2842 * it right here. */ 2843 if ( 2844 #ifdef PERL_EXT_RE_BUILD 2845 1 2846 #else 2847 DEBUG_r_TEST 2848 #endif 2849 ) { 2850 regnode *fix = convert; 2851 U32 word = trie->wordcount; 2852 mjd_nodelen++; 2853 Set_Node_Offset_Length(convert, mjd_offset, state - 1); 2854 while( ++fix < n ) { 2855 Set_Node_Offset_Length(fix, 0, 0); 2856 } 2857 while (word--) { 2858 SV ** const tmp = av_fetch( trie_words, word, 0 ); 2859 if (tmp) { 2860 if ( STR_LEN(convert) <= SvCUR(*tmp) ) 2861 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); 2862 else 2863 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); 2864 } 2865 } 2866 } 2867 #endif 2868 if (trie->maxlen) { 2869 convert = n; 2870 } else { 2871 NEXT_OFF(convert) = (U16)(tail - convert); 2872 DEBUG_r(optimize= n); 2873 } 2874 } 2875 } 2876 if (!jumper) 2877 jumper = last; 2878 if ( trie->maxlen ) { 2879 NEXT_OFF( convert ) = (U16)(tail - convert); 2880 ARG_SET( convert, data_slot ); 2881 /* Store the offset to the first unabsorbed branch in 2882 jump[0], which is otherwise unused by the jump logic. 2883 We use this when dumping a trie and during optimisation. */ 2884 if (trie->jump) 2885 trie->jump[0] = (U16)(nextbranch - convert); 2886 2887 /* If the start state is not accepting (meaning there is no empty string/NOTHING) 2888 * and there is a bitmap 2889 * and the first "jump target" node we found leaves enough room 2890 * then convert the TRIE node into a TRIEC node, with the bitmap 2891 * embedded inline in the opcode - this is hypothetically faster. 2892 */ 2893 if ( !trie->states[trie->startstate].wordnum 2894 && trie->bitmap 2895 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) 2896 { 2897 OP( convert ) = TRIEC; 2898 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); 2899 PerlMemShared_free(trie->bitmap); 2900 trie->bitmap= NULL; 2901 } else 2902 OP( convert ) = TRIE; 2903 2904 /* store the type in the flags */ 2905 convert->flags = nodetype; 2906 DEBUG_r({ 2907 optimize = convert 2908 + NODE_STEP_REGNODE 2909 + regarglen[ OP( convert ) ]; 2910 }); 2911 /* XXX We really should free up the resource in trie now, 2912 as we won't use them - (which resources?) dmq */ 2913 } 2914 /* needed for dumping*/ 2915 DEBUG_r(if (optimize) { 2916 regnode *opt = convert; 2917 2918 while ( ++opt < optimize) { 2919 Set_Node_Offset_Length(opt,0,0); 2920 } 2921 /* 2922 Try to clean up some of the debris left after the 2923 optimisation. 2924 */ 2925 while( optimize < jumper ) { 2926 mjd_nodelen += Node_Length((optimize)); 2927 OP( optimize ) = OPTIMIZED; 2928 Set_Node_Offset_Length(optimize,0,0); 2929 optimize++; 2930 } 2931 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen); 2932 }); 2933 } /* end node insert */ 2934 2935 /* Finish populating the prev field of the wordinfo array. Walk back 2936 * from each accept state until we find another accept state, and if 2937 * so, point the first word's .prev field at the second word. If the 2938 * second already has a .prev field set, stop now. This will be the 2939 * case either if we've already processed that word's accept state, 2940 * or that state had multiple words, and the overspill words were 2941 * already linked up earlier. 2942 */ 2943 { 2944 U16 word; 2945 U32 state; 2946 U16 prev; 2947 2948 for (word=1; word <= trie->wordcount; word++) { 2949 prev = 0; 2950 if (trie->wordinfo[word].prev) 2951 continue; 2952 state = trie->wordinfo[word].accept; 2953 while (state) { 2954 state = prev_states[state]; 2955 if (!state) 2956 break; 2957 prev = trie->states[state].wordnum; 2958 if (prev) 2959 break; 2960 } 2961 trie->wordinfo[word].prev = prev; 2962 } 2963 Safefree(prev_states); 2964 } 2965 2966 2967 /* and now dump out the compressed format */ 2968 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); 2969 2970 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; 2971 #ifdef DEBUGGING 2972 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; 2973 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; 2974 #else 2975 SvREFCNT_dec_NN(revcharmap); 2976 #endif 2977 return trie->jump 2978 ? MADE_JUMP_TRIE 2979 : trie->startstate>1 2980 ? MADE_EXACT_TRIE 2981 : MADE_TRIE; 2982 } 2983 2984 STATIC void 2985 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) 2986 { 2987 /* The Trie is constructed and compressed now so we can build a fail array if 2988 * it's needed 2989 2990 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 2991 3.32 in the 2992 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, 2993 Ullman 1985/88 2994 ISBN 0-201-10088-6 2995 2996 We find the fail state for each state in the trie, this state is the longest 2997 proper suffix of the current state's 'word' that is also a proper prefix of 2998 another word in our trie. State 1 represents the word '' and is thus the 2999 default fail state. This allows the DFA not to have to restart after its 3000 tried and failed a word at a given point, it simply continues as though it 3001 had been matching the other word in the first place. 3002 Consider 3003 'abcdgu'=~/abcdefg|cdgu/ 3004 When we get to 'd' we are still matching the first word, we would encounter 3005 'g' which would fail, which would bring us to the state representing 'd' in 3006 the second word where we would try 'g' and succeed, proceeding to match 3007 'cdgu'. 3008 */ 3009 /* add a fail transition */ 3010 const U32 trie_offset = ARG(source); 3011 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; 3012 U32 *q; 3013 const U32 ucharcount = trie->uniquecharcount; 3014 const U32 numstates = trie->statecount; 3015 const U32 ubound = trie->lasttrans + ucharcount; 3016 U32 q_read = 0; 3017 U32 q_write = 0; 3018 U32 charid; 3019 U32 base = trie->states[ 1 ].trans.base; 3020 U32 *fail; 3021 reg_ac_data *aho; 3022 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); 3023 GET_RE_DEBUG_FLAGS_DECL; 3024 3025 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; 3026 #ifndef DEBUGGING 3027 PERL_UNUSED_ARG(depth); 3028 #endif 3029 3030 3031 ARG_SET( stclass, data_slot ); 3032 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); 3033 RExC_rxi->data->data[ data_slot ] = (void*)aho; 3034 aho->trie=trie_offset; 3035 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); 3036 Copy( trie->states, aho->states, numstates, reg_trie_state ); 3037 Newxz( q, numstates, U32); 3038 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); 3039 aho->refcount = 1; 3040 fail = aho->fail; 3041 /* initialize fail[0..1] to be 1 so that we always have 3042 a valid final fail state */ 3043 fail[ 0 ] = fail[ 1 ] = 1; 3044 3045 for ( charid = 0; charid < ucharcount ; charid++ ) { 3046 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); 3047 if ( newstate ) { 3048 q[ q_write ] = newstate; 3049 /* set to point at the root */ 3050 fail[ q[ q_write++ ] ]=1; 3051 } 3052 } 3053 while ( q_read < q_write) { 3054 const U32 cur = q[ q_read++ % numstates ]; 3055 base = trie->states[ cur ].trans.base; 3056 3057 for ( charid = 0 ; charid < ucharcount ; charid++ ) { 3058 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); 3059 if (ch_state) { 3060 U32 fail_state = cur; 3061 U32 fail_base; 3062 do { 3063 fail_state = fail[ fail_state ]; 3064 fail_base = aho->states[ fail_state ].trans.base; 3065 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); 3066 3067 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); 3068 fail[ ch_state ] = fail_state; 3069 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) 3070 { 3071 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; 3072 } 3073 q[ q_write++ % numstates] = ch_state; 3074 } 3075 } 3076 } 3077 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop 3078 when we fail in state 1, this allows us to use the 3079 charclass scan to find a valid start char. This is based on the principle 3080 that theres a good chance the string being searched contains lots of stuff 3081 that cant be a start char. 3082 */ 3083 fail[ 0 ] = fail[ 1 ] = 0; 3084 DEBUG_TRIE_COMPILE_r({ 3085 PerlIO_printf(Perl_debug_log, 3086 "%*sStclass Failtable (%"UVuf" states): 0", 3087 (int)(depth * 2), "", (UV)numstates 3088 ); 3089 for( q_read=1; q_read<numstates; q_read++ ) { 3090 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]); 3091 } 3092 PerlIO_printf(Perl_debug_log, "\n"); 3093 }); 3094 Safefree(q); 3095 /*RExC_seen |= REG_TRIEDFA_SEEN;*/ 3096 } 3097 3098 3099 #define DEBUG_PEEP(str,scan,depth) \ 3100 DEBUG_OPTIMISE_r({if (scan){ \ 3101 SV * const mysv=sv_newmortal(); \ 3102 regnode *Next = regnext(scan); \ 3103 regprop(RExC_rx, mysv, scan, NULL); \ 3104 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \ 3105 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ 3106 Next ? (REG_NODE_NUM(Next)) : 0 ); \ 3107 }}); 3108 3109 3110 /* The below joins as many adjacent EXACTish nodes as possible into a single 3111 * one. The regop may be changed if the node(s) contain certain sequences that 3112 * require special handling. The joining is only done if: 3113 * 1) there is room in the current conglomerated node to entirely contain the 3114 * next one. 3115 * 2) they are the exact same node type 3116 * 3117 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and 3118 * these get optimized out 3119 * 3120 * If a node is to match under /i (folded), the number of characters it matches 3121 * can be different than its character length if it contains a multi-character 3122 * fold. *min_subtract is set to the total delta number of characters of the 3123 * input nodes. 3124 * 3125 * And *unfolded_multi_char is set to indicate whether or not the node contains 3126 * an unfolded multi-char fold. This happens when whether the fold is valid or 3127 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN 3128 * SMALL LETTER SHARP S, as only if the target string being matched against 3129 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose 3130 * folding rules depend on the locale in force at runtime. (Multi-char folds 3131 * whose components are all above the Latin1 range are not run-time locale 3132 * dependent, and have already been folded by the time this function is 3133 * called.) 3134 * 3135 * This is as good a place as any to discuss the design of handling these 3136 * multi-character fold sequences. It's been wrong in Perl for a very long 3137 * time. There are three code points in Unicode whose multi-character folds 3138 * were long ago discovered to mess things up. The previous designs for 3139 * dealing with these involved assigning a special node for them. This 3140 * approach doesn't always work, as evidenced by this example: 3141 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches 3142 * Both sides fold to "sss", but if the pattern is parsed to create a node that 3143 * would match just the \xDF, it won't be able to handle the case where a 3144 * successful match would have to cross the node's boundary. The new approach 3145 * that hopefully generally solves the problem generates an EXACTFU_SS node 3146 * that is "sss" in this case. 3147 * 3148 * It turns out that there are problems with all multi-character folds, and not 3149 * just these three. Now the code is general, for all such cases. The 3150 * approach taken is: 3151 * 1) This routine examines each EXACTFish node that could contain multi- 3152 * character folded sequences. Since a single character can fold into 3153 * such a sequence, the minimum match length for this node is less than 3154 * the number of characters in the node. This routine returns in 3155 * *min_subtract how many characters to subtract from the the actual 3156 * length of the string to get a real minimum match length; it is 0 if 3157 * there are no multi-char foldeds. This delta is used by the caller to 3158 * adjust the min length of the match, and the delta between min and max, 3159 * so that the optimizer doesn't reject these possibilities based on size 3160 * constraints. 3161 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS 3162 * is used for an EXACTFU node that contains at least one "ss" sequence in 3163 * it. For non-UTF-8 patterns and strings, this is the only case where 3164 * there is a possible fold length change. That means that a regular 3165 * EXACTFU node without UTF-8 involvement doesn't have to concern itself 3166 * with length changes, and so can be processed faster. regexec.c takes 3167 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is 3168 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't 3169 * known until runtime). This saves effort in regex matching. However, 3170 * the pre-folding isn't done for non-UTF8 patterns because the fold of 3171 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by 3172 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, 3173 * again, EXACTFL) nodes fold to isn't known until runtime. The fold 3174 * possibilities for the non-UTF8 patterns are quite simple, except for 3175 * the sharp s. All the ones that don't involve a UTF-8 target string are 3176 * members of a fold-pair, and arrays are set up for all of them so that 3177 * the other member of the pair can be found quickly. Code elsewhere in 3178 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to 3179 * 'ss', even if the pattern isn't UTF-8. This avoids the issues 3180 * described in the next item. 3181 * 3) A problem remains for unfolded multi-char folds. (These occur when the 3182 * validity of the fold won't be known until runtime, and so must remain 3183 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA 3184 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot 3185 * be an EXACTF node with a UTF-8 pattern.) They also occur for various 3186 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) 3187 * The reason this is a problem is that the optimizer part of regexec.c 3188 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption 3189 * that a character in the pattern corresponds to at most a single 3190 * character in the target string. (And I do mean character, and not byte 3191 * here, unlike other parts of the documentation that have never been 3192 * updated to account for multibyte Unicode.) sharp s in EXACTF and 3193 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes 3194 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL 3195 * nodes, violate the assumption, and they are the only instances where it 3196 * is violated. I'm reluctant to try to change the assumption, as the 3197 * code involved is impenetrable to me (khw), so instead the code here 3198 * punts. This routine examines EXACTFL nodes, and (when the pattern 3199 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a 3200 * boolean indicating whether or not the node contains such a fold. When 3201 * it is true, the caller sets a flag that later causes the optimizer in 3202 * this file to not set values for the floating and fixed string lengths, 3203 * and thus avoids the optimizer code in regexec.c that makes the invalid 3204 * assumption. Thus, there is no optimization based on string lengths for 3205 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern 3206 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the 3207 * assumption is wrong only in these cases is that all other non-UTF-8 3208 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to 3209 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in 3210 * EXACTF nodes because we don't know at compile time if it actually 3211 * matches 'ss' or not. For EXACTF nodes it will match iff the target 3212 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it 3213 * always matches; and EXACTFA where it never does. In an EXACTFA node in 3214 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the 3215 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 3216 * string would require the pattern to be forced into UTF-8, the overhead 3217 * of which we want to avoid. Similarly the unfolded multi-char folds in 3218 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 3219 * locale.) 3220 * 3221 * Similarly, the code that generates tries doesn't currently handle 3222 * not-already-folded multi-char folds, and it looks like a pain to change 3223 * that. Therefore, trie generation of EXACTFA nodes with the sharp s 3224 * doesn't work. Instead, such an EXACTFA is turned into a new regnode, 3225 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people 3226 * using /iaa matching will be doing so almost entirely with ASCII 3227 * strings, so this should rarely be encountered in practice */ 3228 3229 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ 3230 if (PL_regkind[OP(scan)] == EXACT) \ 3231 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) 3232 3233 STATIC U32 3234 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, 3235 UV *min_subtract, bool *unfolded_multi_char, 3236 U32 flags,regnode *val, U32 depth) 3237 { 3238 /* Merge several consecutive EXACTish nodes into one. */ 3239 regnode *n = regnext(scan); 3240 U32 stringok = 1; 3241 regnode *next = scan + NODE_SZ_STR(scan); 3242 U32 merged = 0; 3243 U32 stopnow = 0; 3244 #ifdef DEBUGGING 3245 regnode *stop = scan; 3246 GET_RE_DEBUG_FLAGS_DECL; 3247 #else 3248 PERL_UNUSED_ARG(depth); 3249 #endif 3250 3251 PERL_ARGS_ASSERT_JOIN_EXACT; 3252 #ifndef EXPERIMENTAL_INPLACESCAN 3253 PERL_UNUSED_ARG(flags); 3254 PERL_UNUSED_ARG(val); 3255 #endif 3256 DEBUG_PEEP("join",scan,depth); 3257 3258 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge 3259 * EXACT ones that are mergeable to the current one. */ 3260 while (n 3261 && (PL_regkind[OP(n)] == NOTHING 3262 || (stringok && OP(n) == OP(scan))) 3263 && NEXT_OFF(n) 3264 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) 3265 { 3266 3267 if (OP(n) == TAIL || n > next) 3268 stringok = 0; 3269 if (PL_regkind[OP(n)] == NOTHING) { 3270 DEBUG_PEEP("skip:",n,depth); 3271 NEXT_OFF(scan) += NEXT_OFF(n); 3272 next = n + NODE_STEP_REGNODE; 3273 #ifdef DEBUGGING 3274 if (stringok) 3275 stop = n; 3276 #endif 3277 n = regnext(n); 3278 } 3279 else if (stringok) { 3280 const unsigned int oldl = STR_LEN(scan); 3281 regnode * const nnext = regnext(n); 3282 3283 /* XXX I (khw) kind of doubt that this works on platforms (should 3284 * Perl ever run on one) where U8_MAX is above 255 because of lots 3285 * of other assumptions */ 3286 /* Don't join if the sum can't fit into a single node */ 3287 if (oldl + STR_LEN(n) > U8_MAX) 3288 break; 3289 3290 DEBUG_PEEP("merg",n,depth); 3291 merged++; 3292 3293 NEXT_OFF(scan) += NEXT_OFF(n); 3294 STR_LEN(scan) += STR_LEN(n); 3295 next = n + NODE_SZ_STR(n); 3296 /* Now we can overwrite *n : */ 3297 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); 3298 #ifdef DEBUGGING 3299 stop = next - 1; 3300 #endif 3301 n = nnext; 3302 if (stopnow) break; 3303 } 3304 3305 #ifdef EXPERIMENTAL_INPLACESCAN 3306 if (flags && !NEXT_OFF(n)) { 3307 DEBUG_PEEP("atch", val, depth); 3308 if (reg_off_by_arg[OP(n)]) { 3309 ARG_SET(n, val - n); 3310 } 3311 else { 3312 NEXT_OFF(n) = val - n; 3313 } 3314 stopnow = 1; 3315 } 3316 #endif 3317 } 3318 3319 *min_subtract = 0; 3320 *unfolded_multi_char = FALSE; 3321 3322 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We 3323 * can now analyze for sequences of problematic code points. (Prior to 3324 * this final joining, sequences could have been split over boundaries, and 3325 * hence missed). The sequences only happen in folding, hence for any 3326 * non-EXACT EXACTish node */ 3327 if (OP(scan) != EXACT) { 3328 U8* s0 = (U8*) STRING(scan); 3329 U8* s = s0; 3330 U8* s_end = s0 + STR_LEN(scan); 3331 3332 int total_count_delta = 0; /* Total delta number of characters that 3333 multi-char folds expand to */ 3334 3335 /* One pass is made over the node's string looking for all the 3336 * possibilities. To avoid some tests in the loop, there are two main 3337 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and 3338 * non-UTF-8 */ 3339 if (UTF) { 3340 U8* folded = NULL; 3341 3342 if (OP(scan) == EXACTFL) { 3343 U8 *d; 3344 3345 /* An EXACTFL node would already have been changed to another 3346 * node type unless there is at least one character in it that 3347 * is problematic; likely a character whose fold definition 3348 * won't be known until runtime, and so has yet to be folded. 3349 * For all but the UTF-8 locale, folds are 1-1 in length, but 3350 * to handle the UTF-8 case, we need to create a temporary 3351 * folded copy using UTF-8 locale rules in order to analyze it. 3352 * This is because our macros that look to see if a sequence is 3353 * a multi-char fold assume everything is folded (otherwise the 3354 * tests in those macros would be too complicated and slow). 3355 * Note that here, the non-problematic folds will have already 3356 * been done, so we can just copy such characters. We actually 3357 * don't completely fold the EXACTFL string. We skip the 3358 * unfolded multi-char folds, as that would just create work 3359 * below to figure out the size they already are */ 3360 3361 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); 3362 d = folded; 3363 while (s < s_end) { 3364 STRLEN s_len = UTF8SKIP(s); 3365 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { 3366 Copy(s, d, s_len, U8); 3367 d += s_len; 3368 } 3369 else if (is_FOLDS_TO_MULTI_utf8(s)) { 3370 *unfolded_multi_char = TRUE; 3371 Copy(s, d, s_len, U8); 3372 d += s_len; 3373 } 3374 else if (isASCII(*s)) { 3375 *(d++) = toFOLD(*s); 3376 } 3377 else { 3378 STRLEN len; 3379 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); 3380 d += len; 3381 } 3382 s += s_len; 3383 } 3384 3385 /* Point the remainder of the routine to look at our temporary 3386 * folded copy */ 3387 s = folded; 3388 s_end = d; 3389 } /* End of creating folded copy of EXACTFL string */ 3390 3391 /* Examine the string for a multi-character fold sequence. UTF-8 3392 * patterns have all characters pre-folded by the time this code is 3393 * executed */ 3394 while (s < s_end - 1) /* Can stop 1 before the end, as minimum 3395 length sequence we are looking for is 2 */ 3396 { 3397 int count = 0; /* How many characters in a multi-char fold */ 3398 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); 3399 if (! len) { /* Not a multi-char fold: get next char */ 3400 s += UTF8SKIP(s); 3401 continue; 3402 } 3403 3404 /* Nodes with 'ss' require special handling, except for 3405 * EXACTFA-ish for which there is no multi-char fold to this */ 3406 if (len == 2 && *s == 's' && *(s+1) == 's' 3407 && OP(scan) != EXACTFA 3408 && OP(scan) != EXACTFA_NO_TRIE) 3409 { 3410 count = 2; 3411 if (OP(scan) != EXACTFL) { 3412 OP(scan) = EXACTFU_SS; 3413 } 3414 s += 2; 3415 } 3416 else { /* Here is a generic multi-char fold. */ 3417 U8* multi_end = s + len; 3418 3419 /* Count how many characters in it. In the case of /aa, no 3420 * folds which contain ASCII code points are allowed, so 3421 * check for those, and skip if found. */ 3422 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { 3423 count = utf8_length(s, multi_end); 3424 s = multi_end; 3425 } 3426 else { 3427 while (s < multi_end) { 3428 if (isASCII(*s)) { 3429 s++; 3430 goto next_iteration; 3431 } 3432 else { 3433 s += UTF8SKIP(s); 3434 } 3435 count++; 3436 } 3437 } 3438 } 3439 3440 /* The delta is how long the sequence is minus 1 (1 is how long 3441 * the character that folds to the sequence is) */ 3442 total_count_delta += count - 1; 3443 next_iteration: ; 3444 } 3445 3446 /* We created a temporary folded copy of the string in EXACTFL 3447 * nodes. Therefore we need to be sure it doesn't go below zero, 3448 * as the real string could be shorter */ 3449 if (OP(scan) == EXACTFL) { 3450 int total_chars = utf8_length((U8*) STRING(scan), 3451 (U8*) STRING(scan) + STR_LEN(scan)); 3452 if (total_count_delta > total_chars) { 3453 total_count_delta = total_chars; 3454 } 3455 } 3456 3457 *min_subtract += total_count_delta; 3458 Safefree(folded); 3459 } 3460 else if (OP(scan) == EXACTFA) { 3461 3462 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char 3463 * fold to the ASCII range (and there are no existing ones in the 3464 * upper latin1 range). But, as outlined in the comments preceding 3465 * this function, we need to flag any occurrences of the sharp s. 3466 * This character forbids trie formation (because of added 3467 * complexity) */ 3468 while (s < s_end) { 3469 if (*s == LATIN_SMALL_LETTER_SHARP_S) { 3470 OP(scan) = EXACTFA_NO_TRIE; 3471 *unfolded_multi_char = TRUE; 3472 break; 3473 } 3474 s++; 3475 continue; 3476 } 3477 } 3478 else { 3479 3480 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char 3481 * folds that are all Latin1. As explained in the comments 3482 * preceding this function, we look also for the sharp s in EXACTF 3483 * and EXACTFL nodes; it can be in the final position. Otherwise 3484 * we can stop looking 1 byte earlier because have to find at least 3485 * two characters for a multi-fold */ 3486 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) 3487 ? s_end 3488 : s_end -1; 3489 3490 while (s < upper) { 3491 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); 3492 if (! len) { /* Not a multi-char fold. */ 3493 if (*s == LATIN_SMALL_LETTER_SHARP_S 3494 && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) 3495 { 3496 *unfolded_multi_char = TRUE; 3497 } 3498 s++; 3499 continue; 3500 } 3501 3502 if (len == 2 3503 && isARG2_lower_or_UPPER_ARG1('s', *s) 3504 && isARG2_lower_or_UPPER_ARG1('s', *(s+1))) 3505 { 3506 3507 /* EXACTF nodes need to know that the minimum length 3508 * changed so that a sharp s in the string can match this 3509 * ss in the pattern, but they remain EXACTF nodes, as they 3510 * won't match this unless the target string is is UTF-8, 3511 * which we don't know until runtime. EXACTFL nodes can't 3512 * transform into EXACTFU nodes */ 3513 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { 3514 OP(scan) = EXACTFU_SS; 3515 } 3516 } 3517 3518 *min_subtract += len - 1; 3519 s += len; 3520 } 3521 } 3522 } 3523 3524 #ifdef DEBUGGING 3525 /* Allow dumping but overwriting the collection of skipped 3526 * ops and/or strings with fake optimized ops */ 3527 n = scan + NODE_SZ_STR(scan); 3528 while (n <= stop) { 3529 OP(n) = OPTIMIZED; 3530 FLAGS(n) = 0; 3531 NEXT_OFF(n) = 0; 3532 n++; 3533 } 3534 #endif 3535 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)}); 3536 return stopnow; 3537 } 3538 3539 /* REx optimizer. Converts nodes into quicker variants "in place". 3540 Finds fixed substrings. */ 3541 3542 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set 3543 to the position after last scanned or to NULL. */ 3544 3545 #define INIT_AND_WITHP \ 3546 assert(!and_withp); \ 3547 Newx(and_withp,1, regnode_ssc); \ 3548 SAVEFREEPV(and_withp) 3549 3550 /* this is a chain of data about sub patterns we are processing that 3551 need to be handled separately/specially in study_chunk. Its so 3552 we can simulate recursion without losing state. */ 3553 struct scan_frame; 3554 typedef struct scan_frame { 3555 regnode *last; /* last node to process in this frame */ 3556 regnode *next; /* next node to process when last is reached */ 3557 struct scan_frame *prev; /*previous frame*/ 3558 U32 prev_recursed_depth; 3559 I32 stop; /* what stopparen do we use */ 3560 } scan_frame; 3561 3562 3563 STATIC SSize_t 3564 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, 3565 SSize_t *minlenp, SSize_t *deltap, 3566 regnode *last, 3567 scan_data_t *data, 3568 I32 stopparen, 3569 U32 recursed_depth, 3570 regnode_ssc *and_withp, 3571 U32 flags, U32 depth) 3572 /* scanp: Start here (read-write). */ 3573 /* deltap: Write maxlen-minlen here. */ 3574 /* last: Stop before this one. */ 3575 /* data: string data about the pattern */ 3576 /* stopparen: treat close N as END */ 3577 /* recursed: which subroutines have we recursed into */ 3578 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ 3579 { 3580 dVAR; 3581 /* There must be at least this number of characters to match */ 3582 SSize_t min = 0; 3583 I32 pars = 0, code; 3584 regnode *scan = *scanp, *next; 3585 SSize_t delta = 0; 3586 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); 3587 int is_inf_internal = 0; /* The studied chunk is infinite */ 3588 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; 3589 scan_data_t data_fake; 3590 SV *re_trie_maxbuff = NULL; 3591 regnode *first_non_open = scan; 3592 SSize_t stopmin = SSize_t_MAX; 3593 scan_frame *frame = NULL; 3594 GET_RE_DEBUG_FLAGS_DECL; 3595 3596 PERL_ARGS_ASSERT_STUDY_CHUNK; 3597 3598 #ifdef DEBUGGING 3599 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 3600 #endif 3601 if ( depth == 0 ) { 3602 while (first_non_open && OP(first_non_open) == OPEN) 3603 first_non_open=regnext(first_non_open); 3604 } 3605 3606 3607 fake_study_recurse: 3608 while ( scan && OP(scan) != END && scan < last ){ 3609 UV min_subtract = 0; /* How mmany chars to subtract from the minimum 3610 node length to get a real minimum (because 3611 the folded version may be shorter) */ 3612 bool unfolded_multi_char = FALSE; 3613 /* Peephole optimizer: */ 3614 DEBUG_OPTIMISE_MORE_r( 3615 { 3616 PerlIO_printf(Perl_debug_log, 3617 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ", 3618 ((int) depth*2), "", (long)stopparen, 3619 (unsigned long)depth, (unsigned long)recursed_depth); 3620 if (recursed_depth) { 3621 U32 i; 3622 U32 j; 3623 for ( j = 0 ; j < recursed_depth ; j++ ) { 3624 PerlIO_printf(Perl_debug_log,"["); 3625 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) 3626 PerlIO_printf(Perl_debug_log,"%d", 3627 PAREN_TEST(RExC_study_chunk_recursed + 3628 (j * RExC_study_chunk_recursed_bytes), i) 3629 ? 1 : 0 3630 ); 3631 PerlIO_printf(Perl_debug_log,"]"); 3632 } 3633 } 3634 PerlIO_printf(Perl_debug_log,"\n"); 3635 } 3636 ); 3637 DEBUG_STUDYDATA("Peep:", data, depth); 3638 DEBUG_PEEP("Peep", scan, depth); 3639 3640 3641 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ 3642 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled 3643 * by a different invocation of reg() -- Yves 3644 */ 3645 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); 3646 3647 /* Follow the next-chain of the current node and optimize 3648 away all the NOTHINGs from it. */ 3649 if (OP(scan) != CURLYX) { 3650 const int max = (reg_off_by_arg[OP(scan)] 3651 ? I32_MAX 3652 /* I32 may be smaller than U16 on CRAYs! */ 3653 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); 3654 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); 3655 int noff; 3656 regnode *n = scan; 3657 3658 /* Skip NOTHING and LONGJMP. */ 3659 while ((n = regnext(n)) 3660 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) 3661 || ((OP(n) == LONGJMP) && (noff = ARG(n)))) 3662 && off + noff < max) 3663 off += noff; 3664 if (reg_off_by_arg[OP(scan)]) 3665 ARG(scan) = off; 3666 else 3667 NEXT_OFF(scan) = off; 3668 } 3669 3670 3671 3672 /* The principal pseudo-switch. Cannot be a switch, since we 3673 look into several different things. */ 3674 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ 3675 || OP(scan) == IFTHEN) { 3676 next = regnext(scan); 3677 code = OP(scan); 3678 /* demq: the op(next)==code check is to see if we have 3679 * "branch-branch" AFAICT */ 3680 3681 if (OP(next) == code || code == IFTHEN) { 3682 /* NOTE - There is similar code to this block below for 3683 * handling TRIE nodes on a re-study. If you change stuff here 3684 * check there too. */ 3685 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; 3686 regnode_ssc accum; 3687 regnode * const startbranch=scan; 3688 3689 if (flags & SCF_DO_SUBSTR) { 3690 /* Cannot merge strings after this. */ 3691 scan_commit(pRExC_state, data, minlenp, is_inf); 3692 } 3693 3694 if (flags & SCF_DO_STCLASS) 3695 ssc_init_zero(pRExC_state, &accum); 3696 3697 while (OP(scan) == code) { 3698 SSize_t deltanext, minnext, fake; 3699 I32 f = 0; 3700 regnode_ssc this_class; 3701 3702 num++; 3703 data_fake.flags = 0; 3704 if (data) { 3705 data_fake.whilem_c = data->whilem_c; 3706 data_fake.last_closep = data->last_closep; 3707 } 3708 else 3709 data_fake.last_closep = &fake; 3710 3711 data_fake.pos_delta = delta; 3712 next = regnext(scan); 3713 scan = NEXTOPER(scan); 3714 if (code != BRANCH) 3715 scan = NEXTOPER(scan); 3716 if (flags & SCF_DO_STCLASS) { 3717 ssc_init(pRExC_state, &this_class); 3718 data_fake.start_class = &this_class; 3719 f = SCF_DO_STCLASS_AND; 3720 } 3721 if (flags & SCF_WHILEM_VISITED_POS) 3722 f |= SCF_WHILEM_VISITED_POS; 3723 3724 /* we suppose the run is continuous, last=next...*/ 3725 minnext = study_chunk(pRExC_state, &scan, minlenp, 3726 &deltanext, next, &data_fake, stopparen, 3727 recursed_depth, NULL, f,depth+1); 3728 if (min1 > minnext) 3729 min1 = minnext; 3730 if (deltanext == SSize_t_MAX) { 3731 is_inf = is_inf_internal = 1; 3732 max1 = SSize_t_MAX; 3733 } else if (max1 < minnext + deltanext) 3734 max1 = minnext + deltanext; 3735 scan = next; 3736 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3737 pars++; 3738 if (data_fake.flags & SCF_SEEN_ACCEPT) { 3739 if ( stopmin > minnext) 3740 stopmin = min + min1; 3741 flags &= ~SCF_DO_SUBSTR; 3742 if (data) 3743 data->flags |= SCF_SEEN_ACCEPT; 3744 } 3745 if (data) { 3746 if (data_fake.flags & SF_HAS_EVAL) 3747 data->flags |= SF_HAS_EVAL; 3748 data->whilem_c = data_fake.whilem_c; 3749 } 3750 if (flags & SCF_DO_STCLASS) 3751 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); 3752 } 3753 if (code == IFTHEN && num < 2) /* Empty ELSE branch */ 3754 min1 = 0; 3755 if (flags & SCF_DO_SUBSTR) { 3756 data->pos_min += min1; 3757 if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) 3758 data->pos_delta = SSize_t_MAX; 3759 else 3760 data->pos_delta += max1 - min1; 3761 if (max1 != min1 || is_inf) 3762 data->longest = &(data->longest_float); 3763 } 3764 min += min1; 3765 if (delta == SSize_t_MAX 3766 || SSize_t_MAX - delta - (max1 - min1) < 0) 3767 delta = SSize_t_MAX; 3768 else 3769 delta += max1 - min1; 3770 if (flags & SCF_DO_STCLASS_OR) { 3771 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); 3772 if (min1) { 3773 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 3774 flags &= ~SCF_DO_STCLASS; 3775 } 3776 } 3777 else if (flags & SCF_DO_STCLASS_AND) { 3778 if (min1) { 3779 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); 3780 flags &= ~SCF_DO_STCLASS; 3781 } 3782 else { 3783 /* Switch to OR mode: cache the old value of 3784 * data->start_class */ 3785 INIT_AND_WITHP; 3786 StructCopy(data->start_class, and_withp, regnode_ssc); 3787 flags &= ~SCF_DO_STCLASS_AND; 3788 StructCopy(&accum, data->start_class, regnode_ssc); 3789 flags |= SCF_DO_STCLASS_OR; 3790 } 3791 } 3792 3793 if (PERL_ENABLE_TRIE_OPTIMISATION && 3794 OP( startbranch ) == BRANCH ) 3795 { 3796 /* demq. 3797 3798 Assuming this was/is a branch we are dealing with: 'scan' 3799 now points at the item that follows the branch sequence, 3800 whatever it is. We now start at the beginning of the 3801 sequence and look for subsequences of 3802 3803 BRANCH->EXACT=>x1 3804 BRANCH->EXACT=>x2 3805 tail 3806 3807 which would be constructed from a pattern like 3808 /A|LIST|OF|WORDS/ 3809 3810 If we can find such a subsequence we need to turn the first 3811 element into a trie and then add the subsequent branch exact 3812 strings to the trie. 3813 3814 We have two cases 3815 3816 1. patterns where the whole set of branches can be 3817 converted. 3818 3819 2. patterns where only a subset can be converted. 3820 3821 In case 1 we can replace the whole set with a single regop 3822 for the trie. In case 2 we need to keep the start and end 3823 branches so 3824 3825 'BRANCH EXACT; BRANCH EXACT; BRANCH X' 3826 becomes BRANCH TRIE; BRANCH X; 3827 3828 There is an additional case, that being where there is a 3829 common prefix, which gets split out into an EXACT like node 3830 preceding the TRIE node. 3831 3832 If x(1..n)==tail then we can do a simple trie, if not we make 3833 a "jump" trie, such that when we match the appropriate word 3834 we "jump" to the appropriate tail node. Essentially we turn 3835 a nested if into a case structure of sorts. 3836 3837 */ 3838 3839 int made=0; 3840 if (!re_trie_maxbuff) { 3841 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); 3842 if (!SvIOK(re_trie_maxbuff)) 3843 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 3844 } 3845 if ( SvIV(re_trie_maxbuff)>=0 ) { 3846 regnode *cur; 3847 regnode *first = (regnode *)NULL; 3848 regnode *last = (regnode *)NULL; 3849 regnode *tail = scan; 3850 U8 trietype = 0; 3851 U32 count=0; 3852 3853 #ifdef DEBUGGING 3854 SV * const mysv = sv_newmortal(); /* for dumping */ 3855 #endif 3856 /* var tail is used because there may be a TAIL 3857 regop in the way. Ie, the exacts will point to the 3858 thing following the TAIL, but the last branch will 3859 point at the TAIL. So we advance tail. If we 3860 have nested (?:) we may have to move through several 3861 tails. 3862 */ 3863 3864 while ( OP( tail ) == TAIL ) { 3865 /* this is the TAIL generated by (?:) */ 3866 tail = regnext( tail ); 3867 } 3868 3869 3870 DEBUG_TRIE_COMPILE_r({ 3871 regprop(RExC_rx, mysv, tail, NULL); 3872 PerlIO_printf( Perl_debug_log, "%*s%s%s\n", 3873 (int)depth * 2 + 2, "", 3874 "Looking for TRIE'able sequences. Tail node is: ", 3875 SvPV_nolen_const( mysv ) 3876 ); 3877 }); 3878 3879 /* 3880 3881 Step through the branches 3882 cur represents each branch, 3883 noper is the first thing to be matched as part 3884 of that branch 3885 noper_next is the regnext() of that node. 3886 3887 We normally handle a case like this 3888 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also 3889 support building with NOJUMPTRIE, which restricts 3890 the trie logic to structures like /FOO|BAR/. 3891 3892 If noper is a trieable nodetype then the branch is 3893 a possible optimization target. If we are building 3894 under NOJUMPTRIE then we require that noper_next is 3895 the same as scan (our current position in the regex 3896 program). 3897 3898 Once we have two or more consecutive such branches 3899 we can create a trie of the EXACT's contents and 3900 stitch it in place into the program. 3901 3902 If the sequence represents all of the branches in 3903 the alternation we replace the entire thing with a 3904 single TRIE node. 3905 3906 Otherwise when it is a subsequence we need to 3907 stitch it in place and replace only the relevant 3908 branches. This means the first branch has to remain 3909 as it is used by the alternation logic, and its 3910 next pointer, and needs to be repointed at the item 3911 on the branch chain following the last branch we 3912 have optimized away. 3913 3914 This could be either a BRANCH, in which case the 3915 subsequence is internal, or it could be the item 3916 following the branch sequence in which case the 3917 subsequence is at the end (which does not 3918 necessarily mean the first node is the start of the 3919 alternation). 3920 3921 TRIE_TYPE(X) is a define which maps the optype to a 3922 trietype. 3923 3924 optype | trietype 3925 ----------------+----------- 3926 NOTHING | NOTHING 3927 EXACT | EXACT 3928 EXACTFU | EXACTFU 3929 EXACTFU_SS | EXACTFU 3930 EXACTFA | EXACTFA 3931 3932 3933 */ 3934 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ 3935 ( EXACT == (X) ) ? EXACT : \ 3936 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ 3937 ( EXACTFA == (X) ) ? EXACTFA : \ 3938 0 ) 3939 3940 /* dont use tail as the end marker for this traverse */ 3941 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { 3942 regnode * const noper = NEXTOPER( cur ); 3943 U8 noper_type = OP( noper ); 3944 U8 noper_trietype = TRIE_TYPE( noper_type ); 3945 #if defined(DEBUGGING) || defined(NOJUMPTRIE) 3946 regnode * const noper_next = regnext( noper ); 3947 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0; 3948 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0; 3949 #endif 3950 3951 DEBUG_TRIE_COMPILE_r({ 3952 regprop(RExC_rx, mysv, cur, NULL); 3953 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", 3954 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); 3955 3956 regprop(RExC_rx, mysv, noper, NULL); 3957 PerlIO_printf( Perl_debug_log, " -> %s", 3958 SvPV_nolen_const(mysv)); 3959 3960 if ( noper_next ) { 3961 regprop(RExC_rx, mysv, noper_next, NULL); 3962 PerlIO_printf( Perl_debug_log,"\t=> %s\t", 3963 SvPV_nolen_const(mysv)); 3964 } 3965 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", 3966 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), 3967 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] 3968 ); 3969 }); 3970 3971 /* Is noper a trieable nodetype that can be merged 3972 * with the current trie (if there is one)? */ 3973 if ( noper_trietype 3974 && 3975 ( 3976 ( noper_trietype == NOTHING) 3977 || ( trietype == NOTHING ) 3978 || ( trietype == noper_trietype ) 3979 ) 3980 #ifdef NOJUMPTRIE 3981 && noper_next == tail 3982 #endif 3983 && count < U16_MAX) 3984 { 3985 /* Handle mergable triable node Either we are 3986 * the first node in a new trieable sequence, 3987 * in which case we do some bookkeeping, 3988 * otherwise we update the end pointer. */ 3989 if ( !first ) { 3990 first = cur; 3991 if ( noper_trietype == NOTHING ) { 3992 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE) 3993 regnode * const noper_next = regnext( noper ); 3994 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0; 3995 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; 3996 #endif 3997 3998 if ( noper_next_trietype ) { 3999 trietype = noper_next_trietype; 4000 } else if (noper_next_type) { 4001 /* a NOTHING regop is 1 regop wide. 4002 * We need at least two for a trie 4003 * so we can't merge this in */ 4004 first = NULL; 4005 } 4006 } else { 4007 trietype = noper_trietype; 4008 } 4009 } else { 4010 if ( trietype == NOTHING ) 4011 trietype = noper_trietype; 4012 last = cur; 4013 } 4014 if (first) 4015 count++; 4016 } /* end handle mergable triable node */ 4017 else { 4018 /* handle unmergable node - 4019 * noper may either be a triable node which can 4020 * not be tried together with the current trie, 4021 * or a non triable node */ 4022 if ( last ) { 4023 /* If last is set and trietype is not 4024 * NOTHING then we have found at least two 4025 * triable branch sequences in a row of a 4026 * similar trietype so we can turn them 4027 * into a trie. If/when we allow NOTHING to 4028 * start a trie sequence this condition 4029 * will be required, and it isn't expensive 4030 * so we leave it in for now. */ 4031 if ( trietype && trietype != NOTHING ) 4032 make_trie( pRExC_state, 4033 startbranch, first, cur, tail, 4034 count, trietype, depth+1 ); 4035 last = NULL; /* note: we clear/update 4036 first, trietype etc below, 4037 so we dont do it here */ 4038 } 4039 if ( noper_trietype 4040 #ifdef NOJUMPTRIE 4041 && noper_next == tail 4042 #endif 4043 ){ 4044 /* noper is triable, so we can start a new 4045 * trie sequence */ 4046 count = 1; 4047 first = cur; 4048 trietype = noper_trietype; 4049 } else if (first) { 4050 /* if we already saw a first but the 4051 * current node is not triable then we have 4052 * to reset the first information. */ 4053 count = 0; 4054 first = NULL; 4055 trietype = 0; 4056 } 4057 } /* end handle unmergable node */ 4058 } /* loop over branches */ 4059 DEBUG_TRIE_COMPILE_r({ 4060 regprop(RExC_rx, mysv, cur, NULL); 4061 PerlIO_printf( Perl_debug_log, 4062 "%*s- %s (%d) <SCAN FINISHED>\n", 4063 (int)depth * 2 + 2, 4064 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); 4065 4066 }); 4067 if ( last && trietype ) { 4068 if ( trietype != NOTHING ) { 4069 /* the last branch of the sequence was part of 4070 * a trie, so we have to construct it here 4071 * outside of the loop */ 4072 made= make_trie( pRExC_state, startbranch, 4073 first, scan, tail, count, 4074 trietype, depth+1 ); 4075 #ifdef TRIE_STUDY_OPT 4076 if ( ((made == MADE_EXACT_TRIE && 4077 startbranch == first) 4078 || ( first_non_open == first )) && 4079 depth==0 ) { 4080 flags |= SCF_TRIE_RESTUDY; 4081 if ( startbranch == first 4082 && scan == tail ) 4083 { 4084 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; 4085 } 4086 } 4087 #endif 4088 } else { 4089 /* at this point we know whatever we have is a 4090 * NOTHING sequence/branch AND if 'startbranch' 4091 * is 'first' then we can turn the whole thing 4092 * into a NOTHING 4093 */ 4094 if ( startbranch == first ) { 4095 regnode *opt; 4096 /* the entire thing is a NOTHING sequence, 4097 * something like this: (?:|) So we can 4098 * turn it into a plain NOTHING op. */ 4099 DEBUG_TRIE_COMPILE_r({ 4100 regprop(RExC_rx, mysv, cur, NULL); 4101 PerlIO_printf( Perl_debug_log, 4102 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2, 4103 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); 4104 4105 }); 4106 OP(startbranch)= NOTHING; 4107 NEXT_OFF(startbranch)= tail - startbranch; 4108 for ( opt= startbranch + 1; opt < tail ; opt++ ) 4109 OP(opt)= OPTIMIZED; 4110 } 4111 } 4112 } /* end if ( last) */ 4113 } /* TRIE_MAXBUF is non zero */ 4114 4115 } /* do trie */ 4116 4117 } 4118 else if ( code == BRANCHJ ) { /* single branch is optimized. */ 4119 scan = NEXTOPER(NEXTOPER(scan)); 4120 } else /* single branch is optimized. */ 4121 scan = NEXTOPER(scan); 4122 continue; 4123 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) { 4124 scan_frame *newframe = NULL; 4125 I32 paren; 4126 regnode *start; 4127 regnode *end; 4128 U32 my_recursed_depth= recursed_depth; 4129 4130 if (OP(scan) != SUSPEND) { 4131 /* set the pointer */ 4132 if (OP(scan) == GOSUB) { 4133 paren = ARG(scan); 4134 RExC_recurse[ARG2L(scan)] = scan; 4135 start = RExC_open_parens[paren-1]; 4136 end = RExC_close_parens[paren-1]; 4137 } else { 4138 paren = 0; 4139 start = RExC_rxi->program + 1; 4140 end = RExC_opend; 4141 } 4142 if (!recursed_depth 4143 || 4144 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) 4145 ) { 4146 if (!recursed_depth) { 4147 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); 4148 } else { 4149 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), 4150 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), 4151 RExC_study_chunk_recursed_bytes, U8); 4152 } 4153 /* we havent recursed into this paren yet, so recurse into it */ 4154 DEBUG_STUDYDATA("set:", data,depth); 4155 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); 4156 my_recursed_depth= recursed_depth + 1; 4157 Newx(newframe,1,scan_frame); 4158 } else { 4159 DEBUG_STUDYDATA("inf:", data,depth); 4160 /* some form of infinite recursion, assume infinite length 4161 * */ 4162 if (flags & SCF_DO_SUBSTR) { 4163 scan_commit(pRExC_state, data, minlenp, is_inf); 4164 data->longest = &(data->longest_float); 4165 } 4166 is_inf = is_inf_internal = 1; 4167 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 4168 ssc_anything(data->start_class); 4169 flags &= ~SCF_DO_STCLASS; 4170 } 4171 } else { 4172 Newx(newframe,1,scan_frame); 4173 paren = stopparen; 4174 start = scan+2; 4175 end = regnext(scan); 4176 } 4177 if (newframe) { 4178 assert(start); 4179 assert(end); 4180 SAVEFREEPV(newframe); 4181 newframe->next = regnext(scan); 4182 newframe->last = last; 4183 newframe->stop = stopparen; 4184 newframe->prev = frame; 4185 newframe->prev_recursed_depth = recursed_depth; 4186 4187 DEBUG_STUDYDATA("frame-new:",data,depth); 4188 DEBUG_PEEP("fnew", scan, depth); 4189 4190 frame = newframe; 4191 scan = start; 4192 stopparen = paren; 4193 last = end; 4194 depth = depth + 1; 4195 recursed_depth= my_recursed_depth; 4196 4197 continue; 4198 } 4199 } 4200 else if (OP(scan) == EXACT) { 4201 SSize_t l = STR_LEN(scan); 4202 UV uc; 4203 if (UTF) { 4204 const U8 * const s = (U8*)STRING(scan); 4205 uc = utf8_to_uvchr_buf(s, s + l, NULL); 4206 l = utf8_length(s, s + l); 4207 } else { 4208 uc = *((U8*)STRING(scan)); 4209 } 4210 min += l; 4211 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ 4212 /* The code below prefers earlier match for fixed 4213 offset, later match for variable offset. */ 4214 if (data->last_end == -1) { /* Update the start info. */ 4215 data->last_start_min = data->pos_min; 4216 data->last_start_max = is_inf 4217 ? SSize_t_MAX : data->pos_min + data->pos_delta; 4218 } 4219 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); 4220 if (UTF) 4221 SvUTF8_on(data->last_found); 4222 { 4223 SV * const sv = data->last_found; 4224 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 4225 mg_find(sv, PERL_MAGIC_utf8) : NULL; 4226 if (mg && mg->mg_len >= 0) 4227 mg->mg_len += utf8_length((U8*)STRING(scan), 4228 (U8*)STRING(scan)+STR_LEN(scan)); 4229 } 4230 data->last_end = data->pos_min + l; 4231 data->pos_min += l; /* As in the first entry. */ 4232 data->flags &= ~SF_BEFORE_EOL; 4233 } 4234 4235 /* ANDing the code point leaves at most it, and not in locale, and 4236 * can't match null string */ 4237 if (flags & SCF_DO_STCLASS_AND) { 4238 ssc_cp_and(data->start_class, uc); 4239 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; 4240 ssc_clear_locale(data->start_class); 4241 } 4242 else if (flags & SCF_DO_STCLASS_OR) { 4243 ssc_add_cp(data->start_class, uc); 4244 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 4245 4246 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ 4247 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; 4248 } 4249 flags &= ~SCF_DO_STCLASS; 4250 } 4251 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ 4252 SSize_t l = STR_LEN(scan); 4253 UV uc = *((U8*)STRING(scan)); 4254 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 4255 separate code points */ 4256 4257 /* Search for fixed substrings supports EXACT only. */ 4258 if (flags & SCF_DO_SUBSTR) { 4259 assert(data); 4260 scan_commit(pRExC_state, data, minlenp, is_inf); 4261 } 4262 if (UTF) { 4263 const U8 * const s = (U8 *)STRING(scan); 4264 uc = utf8_to_uvchr_buf(s, s + l, NULL); 4265 l = utf8_length(s, s + l); 4266 } 4267 if (unfolded_multi_char) { 4268 RExC_seen |= REG_UNFOLDED_MULTI_SEEN; 4269 } 4270 min += l - min_subtract; 4271 assert (min >= 0); 4272 delta += min_subtract; 4273 if (flags & SCF_DO_SUBSTR) { 4274 data->pos_min += l - min_subtract; 4275 if (data->pos_min < 0) { 4276 data->pos_min = 0; 4277 } 4278 data->pos_delta += min_subtract; 4279 if (min_subtract) { 4280 data->longest = &(data->longest_float); 4281 } 4282 } 4283 if (OP(scan) == EXACTFL) { 4284 4285 /* We don't know what the folds are; it could be anything. XXX 4286 * Actually, we only support UTF-8 encoding for code points 4287 * above Latin1, so we could know what those folds are. */ 4288 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, 4289 0, 4290 UV_MAX); 4291 } 4292 else { /* Non-locale EXACTFish */ 4293 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); 4294 if (flags & SCF_DO_STCLASS_AND) { 4295 ssc_clear_locale(data->start_class); 4296 } 4297 if (uc < 256) { /* We know what the Latin1 folds are ... */ 4298 if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we 4299 know if anything folds 4300 with this */ 4301 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, 4302 PL_fold_latin1[uc]); 4303 if (OP(scan) != EXACTFA) { /* The folds below aren't 4304 legal under /iaa */ 4305 if (isARG2_lower_or_UPPER_ARG1('s', uc)) { 4306 EXACTF_invlist 4307 = add_cp_to_invlist(EXACTF_invlist, 4308 LATIN_SMALL_LETTER_SHARP_S); 4309 } 4310 else if (uc == LATIN_SMALL_LETTER_SHARP_S) { 4311 EXACTF_invlist 4312 = add_cp_to_invlist(EXACTF_invlist, 's'); 4313 EXACTF_invlist 4314 = add_cp_to_invlist(EXACTF_invlist, 'S'); 4315 } 4316 } 4317 4318 /* We also know if there are above-Latin1 code points 4319 * that fold to this (none legal for ASCII and /iaa) */ 4320 if ((! isASCII(uc) || OP(scan) != EXACTFA) 4321 && HAS_NONLATIN1_FOLD_CLOSURE(uc)) 4322 { 4323 /* XXX We could know exactly what does fold to this 4324 * if the reverse folds are loaded, as currently in 4325 * S_regclass() */ 4326 _invlist_union(EXACTF_invlist, 4327 PL_AboveLatin1, 4328 &EXACTF_invlist); 4329 } 4330 } 4331 } 4332 else { /* Non-locale, above Latin1. XXX We don't currently 4333 know what participates in folds with this, so have 4334 to assume anything could */ 4335 4336 /* XXX We could know exactly what does fold to this if the 4337 * reverse folds are loaded, as currently in S_regclass(). 4338 * But we do know that under /iaa nothing in the ASCII 4339 * range can participate */ 4340 if (OP(scan) == EXACTFA) { 4341 _invlist_union_complement_2nd(EXACTF_invlist, 4342 PL_XPosix_ptrs[_CC_ASCII], 4343 &EXACTF_invlist); 4344 } 4345 else { 4346 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, 4347 0, UV_MAX); 4348 } 4349 } 4350 } 4351 if (flags & SCF_DO_STCLASS_AND) { 4352 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; 4353 ANYOF_POSIXL_ZERO(data->start_class); 4354 ssc_intersection(data->start_class, EXACTF_invlist, FALSE); 4355 } 4356 else if (flags & SCF_DO_STCLASS_OR) { 4357 ssc_union(data->start_class, EXACTF_invlist, FALSE); 4358 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 4359 4360 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ 4361 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; 4362 } 4363 flags &= ~SCF_DO_STCLASS; 4364 SvREFCNT_dec(EXACTF_invlist); 4365 } 4366 else if (REGNODE_VARIES(OP(scan))) { 4367 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; 4368 I32 fl = 0, f = flags; 4369 regnode * const oscan = scan; 4370 regnode_ssc this_class; 4371 regnode_ssc *oclass = NULL; 4372 I32 next_is_eval = 0; 4373 4374 switch (PL_regkind[OP(scan)]) { 4375 case WHILEM: /* End of (?:...)* . */ 4376 scan = NEXTOPER(scan); 4377 goto finish; 4378 case PLUS: 4379 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { 4380 next = NEXTOPER(scan); 4381 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { 4382 mincount = 1; 4383 maxcount = REG_INFTY; 4384 next = regnext(scan); 4385 scan = NEXTOPER(scan); 4386 goto do_curly; 4387 } 4388 } 4389 if (flags & SCF_DO_SUBSTR) 4390 data->pos_min++; 4391 min++; 4392 /* Fall through. */ 4393 case STAR: 4394 if (flags & SCF_DO_STCLASS) { 4395 mincount = 0; 4396 maxcount = REG_INFTY; 4397 next = regnext(scan); 4398 scan = NEXTOPER(scan); 4399 goto do_curly; 4400 } 4401 if (flags & SCF_DO_SUBSTR) { 4402 scan_commit(pRExC_state, data, minlenp, is_inf); 4403 /* Cannot extend fixed substrings */ 4404 data->longest = &(data->longest_float); 4405 } 4406 is_inf = is_inf_internal = 1; 4407 scan = regnext(scan); 4408 goto optimize_curly_tail; 4409 case CURLY: 4410 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) 4411 && (scan->flags == stopparen)) 4412 { 4413 mincount = 1; 4414 maxcount = 1; 4415 } else { 4416 mincount = ARG1(scan); 4417 maxcount = ARG2(scan); 4418 } 4419 next = regnext(scan); 4420 if (OP(scan) == CURLYX) { 4421 I32 lp = (data ? *(data->last_closep) : 0); 4422 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); 4423 } 4424 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 4425 next_is_eval = (OP(scan) == EVAL); 4426 do_curly: 4427 if (flags & SCF_DO_SUBSTR) { 4428 if (mincount == 0) 4429 scan_commit(pRExC_state, data, minlenp, is_inf); 4430 /* Cannot extend fixed substrings */ 4431 pos_before = data->pos_min; 4432 } 4433 if (data) { 4434 fl = data->flags; 4435 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); 4436 if (is_inf) 4437 data->flags |= SF_IS_INF; 4438 } 4439 if (flags & SCF_DO_STCLASS) { 4440 ssc_init(pRExC_state, &this_class); 4441 oclass = data->start_class; 4442 data->start_class = &this_class; 4443 f |= SCF_DO_STCLASS_AND; 4444 f &= ~SCF_DO_STCLASS_OR; 4445 } 4446 /* Exclude from super-linear cache processing any {n,m} 4447 regops for which the combination of input pos and regex 4448 pos is not enough information to determine if a match 4449 will be possible. 4450 4451 For example, in the regex /foo(bar\s*){4,8}baz/ with the 4452 regex pos at the \s*, the prospects for a match depend not 4453 only on the input position but also on how many (bar\s*) 4454 repeats into the {4,8} we are. */ 4455 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) 4456 f &= ~SCF_WHILEM_VISITED_POS; 4457 4458 /* This will finish on WHILEM, setting scan, or on NULL: */ 4459 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, 4460 last, data, stopparen, recursed_depth, NULL, 4461 (mincount == 0 4462 ? (f & ~SCF_DO_SUBSTR) 4463 : f) 4464 ,depth+1); 4465 4466 if (flags & SCF_DO_STCLASS) 4467 data->start_class = oclass; 4468 if (mincount == 0 || minnext == 0) { 4469 if (flags & SCF_DO_STCLASS_OR) { 4470 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); 4471 } 4472 else if (flags & SCF_DO_STCLASS_AND) { 4473 /* Switch to OR mode: cache the old value of 4474 * data->start_class */ 4475 INIT_AND_WITHP; 4476 StructCopy(data->start_class, and_withp, regnode_ssc); 4477 flags &= ~SCF_DO_STCLASS_AND; 4478 StructCopy(&this_class, data->start_class, regnode_ssc); 4479 flags |= SCF_DO_STCLASS_OR; 4480 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; 4481 } 4482 } else { /* Non-zero len */ 4483 if (flags & SCF_DO_STCLASS_OR) { 4484 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); 4485 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 4486 } 4487 else if (flags & SCF_DO_STCLASS_AND) 4488 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); 4489 flags &= ~SCF_DO_STCLASS; 4490 } 4491 if (!scan) /* It was not CURLYX, but CURLY. */ 4492 scan = next; 4493 if (!(flags & SCF_TRIE_DOING_RESTUDY) 4494 /* ? quantifier ok, except for (?{ ... }) */ 4495 && (next_is_eval || !(mincount == 0 && maxcount == 1)) 4496 && (minnext == 0) && (deltanext == 0) 4497 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) 4498 && maxcount <= REG_INFTY/3) /* Complement check for big 4499 count */ 4500 { 4501 /* Fatal warnings may leak the regexp without this: */ 4502 SAVEFREESV(RExC_rx_sv); 4503 ckWARNreg(RExC_parse, 4504 "Quantifier unexpected on zero-length expression"); 4505 (void)ReREFCNT_inc(RExC_rx_sv); 4506 } 4507 4508 min += minnext * mincount; 4509 is_inf_internal |= deltanext == SSize_t_MAX 4510 || (maxcount == REG_INFTY && minnext + deltanext > 0); 4511 is_inf |= is_inf_internal; 4512 if (is_inf) { 4513 delta = SSize_t_MAX; 4514 } else { 4515 delta += (minnext + deltanext) * maxcount 4516 - minnext * mincount; 4517 } 4518 /* Try powerful optimization CURLYX => CURLYN. */ 4519 if ( OP(oscan) == CURLYX && data 4520 && data->flags & SF_IN_PAR 4521 && !(data->flags & SF_HAS_EVAL) 4522 && !deltanext && minnext == 1 ) { 4523 /* Try to optimize to CURLYN. */ 4524 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; 4525 regnode * const nxt1 = nxt; 4526 #ifdef DEBUGGING 4527 regnode *nxt2; 4528 #endif 4529 4530 /* Skip open. */ 4531 nxt = regnext(nxt); 4532 if (!REGNODE_SIMPLE(OP(nxt)) 4533 && !(PL_regkind[OP(nxt)] == EXACT 4534 && STR_LEN(nxt) == 1)) 4535 goto nogo; 4536 #ifdef DEBUGGING 4537 nxt2 = nxt; 4538 #endif 4539 nxt = regnext(nxt); 4540 if (OP(nxt) != CLOSE) 4541 goto nogo; 4542 if (RExC_open_parens) { 4543 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ 4544 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ 4545 } 4546 /* Now we know that nxt2 is the only contents: */ 4547 oscan->flags = (U8)ARG(nxt); 4548 OP(oscan) = CURLYN; 4549 OP(nxt1) = NOTHING; /* was OPEN. */ 4550 4551 #ifdef DEBUGGING 4552 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 4553 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ 4554 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ 4555 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 4556 OP(nxt + 1) = OPTIMIZED; /* was count. */ 4557 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ 4558 #endif 4559 } 4560 nogo: 4561 4562 /* Try optimization CURLYX => CURLYM. */ 4563 if ( OP(oscan) == CURLYX && data 4564 && !(data->flags & SF_HAS_PAR) 4565 && !(data->flags & SF_HAS_EVAL) 4566 && !deltanext /* atom is fixed width */ 4567 && minnext != 0 /* CURLYM can't handle zero width */ 4568 4569 /* Nor characters whose fold at run-time may be 4570 * multi-character */ 4571 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) 4572 ) { 4573 /* XXXX How to optimize if data == 0? */ 4574 /* Optimize to a simpler form. */ 4575 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ 4576 regnode *nxt2; 4577 4578 OP(oscan) = CURLYM; 4579 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ 4580 && (OP(nxt2) != WHILEM)) 4581 nxt = nxt2; 4582 OP(nxt2) = SUCCEED; /* Whas WHILEM */ 4583 /* Need to optimize away parenths. */ 4584 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { 4585 /* Set the parenth number. */ 4586 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ 4587 4588 oscan->flags = (U8)ARG(nxt); 4589 if (RExC_open_parens) { 4590 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ 4591 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ 4592 } 4593 OP(nxt1) = OPTIMIZED; /* was OPEN. */ 4594 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 4595 4596 #ifdef DEBUGGING 4597 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 4598 OP(nxt + 1) = OPTIMIZED; /* was count. */ 4599 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ 4600 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ 4601 #endif 4602 #if 0 4603 while ( nxt1 && (OP(nxt1) != WHILEM)) { 4604 regnode *nnxt = regnext(nxt1); 4605 if (nnxt == nxt) { 4606 if (reg_off_by_arg[OP(nxt1)]) 4607 ARG_SET(nxt1, nxt2 - nxt1); 4608 else if (nxt2 - nxt1 < U16_MAX) 4609 NEXT_OFF(nxt1) = nxt2 - nxt1; 4610 else 4611 OP(nxt) = NOTHING; /* Cannot beautify */ 4612 } 4613 nxt1 = nnxt; 4614 } 4615 #endif 4616 /* Optimize again: */ 4617 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, 4618 NULL, stopparen, recursed_depth, NULL, 0,depth+1); 4619 } 4620 else 4621 oscan->flags = 0; 4622 } 4623 else if ((OP(oscan) == CURLYX) 4624 && (flags & SCF_WHILEM_VISITED_POS) 4625 /* See the comment on a similar expression above. 4626 However, this time it's not a subexpression 4627 we care about, but the expression itself. */ 4628 && (maxcount == REG_INFTY) 4629 && data && ++data->whilem_c < 16) { 4630 /* This stays as CURLYX, we can put the count/of pair. */ 4631 /* Find WHILEM (as in regexec.c) */ 4632 regnode *nxt = oscan + NEXT_OFF(oscan); 4633 4634 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ 4635 nxt += ARG(nxt); 4636 PREVOPER(nxt)->flags = (U8)(data->whilem_c 4637 | (RExC_whilem_seen << 4)); /* On WHILEM */ 4638 } 4639 if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) 4640 pars++; 4641 if (flags & SCF_DO_SUBSTR) { 4642 SV *last_str = NULL; 4643 STRLEN last_chrs = 0; 4644 int counted = mincount != 0; 4645 4646 if (data->last_end > 0 && mincount != 0) { /* Ends with a 4647 string. */ 4648 SSize_t b = pos_before >= data->last_start_min 4649 ? pos_before : data->last_start_min; 4650 STRLEN l; 4651 const char * const s = SvPV_const(data->last_found, l); 4652 SSize_t old = b - data->last_start_min; 4653 4654 if (UTF) 4655 old = utf8_hop((U8*)s, old) - (U8*)s; 4656 l -= old; 4657 /* Get the added string: */ 4658 last_str = newSVpvn_utf8(s + old, l, UTF); 4659 last_chrs = UTF ? utf8_length((U8*)(s + old), 4660 (U8*)(s + old + l)) : l; 4661 if (deltanext == 0 && pos_before == b) { 4662 /* What was added is a constant string */ 4663 if (mincount > 1) { 4664 4665 SvGROW(last_str, (mincount * l) + 1); 4666 repeatcpy(SvPVX(last_str) + l, 4667 SvPVX_const(last_str), l, 4668 mincount - 1); 4669 SvCUR_set(last_str, SvCUR(last_str) * mincount); 4670 /* Add additional parts. */ 4671 SvCUR_set(data->last_found, 4672 SvCUR(data->last_found) - l); 4673 sv_catsv(data->last_found, last_str); 4674 { 4675 SV * sv = data->last_found; 4676 MAGIC *mg = 4677 SvUTF8(sv) && SvMAGICAL(sv) ? 4678 mg_find(sv, PERL_MAGIC_utf8) : NULL; 4679 if (mg && mg->mg_len >= 0) 4680 mg->mg_len += last_chrs * (mincount-1); 4681 } 4682 last_chrs *= mincount; 4683 data->last_end += l * (mincount - 1); 4684 } 4685 } else { 4686 /* start offset must point into the last copy */ 4687 data->last_start_min += minnext * (mincount - 1); 4688 data->last_start_max += is_inf ? SSize_t_MAX 4689 : (maxcount - 1) * (minnext + data->pos_delta); 4690 } 4691 } 4692 /* It is counted once already... */ 4693 data->pos_min += minnext * (mincount - counted); 4694 #if 0 4695 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf 4696 " SSize_t_MAX=%"UVdf" minnext=%"UVdf 4697 " maxcount=%"UVdf" mincount=%"UVdf"\n", 4698 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, 4699 (UV)mincount); 4700 if (deltanext != SSize_t_MAX) 4701 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", 4702 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount 4703 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); 4704 #endif 4705 if (deltanext == SSize_t_MAX 4706 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) 4707 data->pos_delta = SSize_t_MAX; 4708 else 4709 data->pos_delta += - counted * deltanext + 4710 (minnext + deltanext) * maxcount - minnext * mincount; 4711 if (mincount != maxcount) { 4712 /* Cannot extend fixed substrings found inside 4713 the group. */ 4714 scan_commit(pRExC_state, data, minlenp, is_inf); 4715 if (mincount && last_str) { 4716 SV * const sv = data->last_found; 4717 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 4718 mg_find(sv, PERL_MAGIC_utf8) : NULL; 4719 4720 if (mg) 4721 mg->mg_len = -1; 4722 sv_setsv(sv, last_str); 4723 data->last_end = data->pos_min; 4724 data->last_start_min = data->pos_min - last_chrs; 4725 data->last_start_max = is_inf 4726 ? SSize_t_MAX 4727 : data->pos_min + data->pos_delta - last_chrs; 4728 } 4729 data->longest = &(data->longest_float); 4730 } 4731 SvREFCNT_dec(last_str); 4732 } 4733 if (data && (fl & SF_HAS_EVAL)) 4734 data->flags |= SF_HAS_EVAL; 4735 optimize_curly_tail: 4736 if (OP(oscan) != CURLYX) { 4737 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING 4738 && NEXT_OFF(next)) 4739 NEXT_OFF(oscan) += NEXT_OFF(next); 4740 } 4741 continue; 4742 4743 default: 4744 #ifdef DEBUGGING 4745 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", 4746 OP(scan)); 4747 #endif 4748 case REF: 4749 case CLUMP: 4750 if (flags & SCF_DO_SUBSTR) { 4751 /* Cannot expect anything... */ 4752 scan_commit(pRExC_state, data, minlenp, is_inf); 4753 data->longest = &(data->longest_float); 4754 } 4755 is_inf = is_inf_internal = 1; 4756 if (flags & SCF_DO_STCLASS_OR) { 4757 if (OP(scan) == CLUMP) { 4758 /* Actually is any start char, but very few code points 4759 * aren't start characters */ 4760 ssc_match_all_cp(data->start_class); 4761 } 4762 else { 4763 ssc_anything(data->start_class); 4764 } 4765 } 4766 flags &= ~SCF_DO_STCLASS; 4767 break; 4768 } 4769 } 4770 else if (OP(scan) == LNBREAK) { 4771 if (flags & SCF_DO_STCLASS) { 4772 if (flags & SCF_DO_STCLASS_AND) { 4773 ssc_intersection(data->start_class, 4774 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); 4775 ssc_clear_locale(data->start_class); 4776 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; 4777 } 4778 else if (flags & SCF_DO_STCLASS_OR) { 4779 ssc_union(data->start_class, 4780 PL_XPosix_ptrs[_CC_VERTSPACE], 4781 FALSE); 4782 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 4783 4784 /* See commit msg for 4785 * 749e076fceedeb708a624933726e7989f2302f6a */ 4786 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; 4787 } 4788 flags &= ~SCF_DO_STCLASS; 4789 } 4790 min++; 4791 delta++; /* Because of the 2 char string cr-lf */ 4792 if (flags & SCF_DO_SUBSTR) { 4793 /* Cannot expect anything... */ 4794 scan_commit(pRExC_state, data, minlenp, is_inf); 4795 data->pos_min += 1; 4796 data->pos_delta += 1; 4797 data->longest = &(data->longest_float); 4798 } 4799 } 4800 else if (REGNODE_SIMPLE(OP(scan))) { 4801 4802 if (flags & SCF_DO_SUBSTR) { 4803 scan_commit(pRExC_state, data, minlenp, is_inf); 4804 data->pos_min++; 4805 } 4806 min++; 4807 if (flags & SCF_DO_STCLASS) { 4808 bool invert = 0; 4809 SV* my_invlist = NULL; 4810 U8 namedclass; 4811 4812 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ 4813 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; 4814 4815 /* Some of the logic below assumes that switching 4816 locale on will only add false positives. */ 4817 switch (OP(scan)) { 4818 4819 default: 4820 #ifdef DEBUGGING 4821 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", 4822 OP(scan)); 4823 #endif 4824 case CANY: 4825 case SANY: 4826 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 4827 ssc_match_all_cp(data->start_class); 4828 break; 4829 4830 case REG_ANY: 4831 { 4832 SV* REG_ANY_invlist = _new_invlist(2); 4833 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, 4834 '\n'); 4835 if (flags & SCF_DO_STCLASS_OR) { 4836 ssc_union(data->start_class, 4837 REG_ANY_invlist, 4838 TRUE /* TRUE => invert, hence all but \n 4839 */ 4840 ); 4841 } 4842 else if (flags & SCF_DO_STCLASS_AND) { 4843 ssc_intersection(data->start_class, 4844 REG_ANY_invlist, 4845 TRUE /* TRUE => invert */ 4846 ); 4847 ssc_clear_locale(data->start_class); 4848 } 4849 SvREFCNT_dec_NN(REG_ANY_invlist); 4850 } 4851 break; 4852 4853 case ANYOF: 4854 if (flags & SCF_DO_STCLASS_AND) 4855 ssc_and(pRExC_state, data->start_class, 4856 (regnode_charclass *) scan); 4857 else 4858 ssc_or(pRExC_state, data->start_class, 4859 (regnode_charclass *) scan); 4860 break; 4861 4862 case NPOSIXL: 4863 invert = 1; 4864 /* FALL THROUGH */ 4865 4866 case POSIXL: 4867 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; 4868 if (flags & SCF_DO_STCLASS_AND) { 4869 bool was_there = cBOOL( 4870 ANYOF_POSIXL_TEST(data->start_class, 4871 namedclass)); 4872 ANYOF_POSIXL_ZERO(data->start_class); 4873 if (was_there) { /* Do an AND */ 4874 ANYOF_POSIXL_SET(data->start_class, namedclass); 4875 } 4876 /* No individual code points can now match */ 4877 data->start_class->invlist 4878 = sv_2mortal(_new_invlist(0)); 4879 } 4880 else { 4881 int complement = namedclass + ((invert) ? -1 : 1); 4882 4883 assert(flags & SCF_DO_STCLASS_OR); 4884 4885 /* If the complement of this class was already there, 4886 * the result is that they match all code points, 4887 * (\d + \D == everything). Remove the classes from 4888 * future consideration. Locale is not relevant in 4889 * this case */ 4890 if (ANYOF_POSIXL_TEST(data->start_class, complement)) { 4891 ssc_match_all_cp(data->start_class); 4892 ANYOF_POSIXL_CLEAR(data->start_class, namedclass); 4893 ANYOF_POSIXL_CLEAR(data->start_class, complement); 4894 } 4895 else { /* The usual case; just add this class to the 4896 existing set */ 4897 ANYOF_POSIXL_SET(data->start_class, namedclass); 4898 } 4899 } 4900 break; 4901 4902 case NPOSIXA: /* For these, we always know the exact set of 4903 what's matched */ 4904 invert = 1; 4905 /* FALL THROUGH */ 4906 case POSIXA: 4907 if (FLAGS(scan) == _CC_ASCII) { 4908 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]); 4909 } 4910 else { 4911 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)], 4912 PL_XPosix_ptrs[_CC_ASCII], 4913 &my_invlist); 4914 } 4915 goto join_posix; 4916 4917 case NPOSIXD: 4918 case NPOSIXU: 4919 invert = 1; 4920 /* FALL THROUGH */ 4921 case POSIXD: 4922 case POSIXU: 4923 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); 4924 4925 /* NPOSIXD matches all upper Latin1 code points unless the 4926 * target string being matched is UTF-8, which is 4927 * unknowable until match time. Since we are going to 4928 * invert, we want to get rid of all of them so that the 4929 * inversion will match all */ 4930 if (OP(scan) == NPOSIXD) { 4931 _invlist_subtract(my_invlist, PL_UpperLatin1, 4932 &my_invlist); 4933 } 4934 4935 join_posix: 4936 4937 if (flags & SCF_DO_STCLASS_AND) { 4938 ssc_intersection(data->start_class, my_invlist, invert); 4939 ssc_clear_locale(data->start_class); 4940 } 4941 else { 4942 assert(flags & SCF_DO_STCLASS_OR); 4943 ssc_union(data->start_class, my_invlist, invert); 4944 } 4945 SvREFCNT_dec(my_invlist); 4946 } 4947 if (flags & SCF_DO_STCLASS_OR) 4948 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 4949 flags &= ~SCF_DO_STCLASS; 4950 } 4951 } 4952 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { 4953 data->flags |= (OP(scan) == MEOL 4954 ? SF_BEFORE_MEOL 4955 : SF_BEFORE_SEOL); 4956 scan_commit(pRExC_state, data, minlenp, is_inf); 4957 4958 } 4959 else if ( PL_regkind[OP(scan)] == BRANCHJ 4960 /* Lookbehind, or need to calculate parens/evals/stclass: */ 4961 && (scan->flags || data || (flags & SCF_DO_STCLASS)) 4962 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { 4963 if ( OP(scan) == UNLESSM && 4964 scan->flags == 0 && 4965 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING && 4966 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED 4967 ) { 4968 regnode *opt; 4969 regnode *upto= regnext(scan); 4970 DEBUG_PARSE_r({ 4971 SV * const mysv_val=sv_newmortal(); 4972 DEBUG_STUDYDATA("OPFAIL",data,depth); 4973 4974 /*DEBUG_PARSE_MSG("opfail");*/ 4975 regprop(RExC_rx, mysv_val, upto, NULL); 4976 PerlIO_printf(Perl_debug_log, 4977 "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", 4978 SvPV_nolen_const(mysv_val), 4979 (IV)REG_NODE_NUM(upto), 4980 (IV)(upto - scan) 4981 ); 4982 }); 4983 OP(scan) = OPFAIL; 4984 NEXT_OFF(scan) = upto - scan; 4985 for (opt= scan + 1; opt < upto ; opt++) 4986 OP(opt) = OPTIMIZED; 4987 scan= upto; 4988 continue; 4989 } 4990 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY 4991 || OP(scan) == UNLESSM ) 4992 { 4993 /* Negative Lookahead/lookbehind 4994 In this case we can't do fixed string optimisation. 4995 */ 4996 4997 SSize_t deltanext, minnext, fake = 0; 4998 regnode *nscan; 4999 regnode_ssc intrnl; 5000 int f = 0; 5001 5002 data_fake.flags = 0; 5003 if (data) { 5004 data_fake.whilem_c = data->whilem_c; 5005 data_fake.last_closep = data->last_closep; 5006 } 5007 else 5008 data_fake.last_closep = &fake; 5009 data_fake.pos_delta = delta; 5010 if ( flags & SCF_DO_STCLASS && !scan->flags 5011 && OP(scan) == IFMATCH ) { /* Lookahead */ 5012 ssc_init(pRExC_state, &intrnl); 5013 data_fake.start_class = &intrnl; 5014 f |= SCF_DO_STCLASS_AND; 5015 } 5016 if (flags & SCF_WHILEM_VISITED_POS) 5017 f |= SCF_WHILEM_VISITED_POS; 5018 next = regnext(scan); 5019 nscan = NEXTOPER(NEXTOPER(scan)); 5020 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, 5021 last, &data_fake, stopparen, 5022 recursed_depth, NULL, f, depth+1); 5023 if (scan->flags) { 5024 if (deltanext) { 5025 FAIL("Variable length lookbehind not implemented"); 5026 } 5027 else if (minnext > (I32)U8_MAX) { 5028 FAIL2("Lookbehind longer than %"UVuf" not implemented", 5029 (UV)U8_MAX); 5030 } 5031 scan->flags = (U8)minnext; 5032 } 5033 if (data) { 5034 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 5035 pars++; 5036 if (data_fake.flags & SF_HAS_EVAL) 5037 data->flags |= SF_HAS_EVAL; 5038 data->whilem_c = data_fake.whilem_c; 5039 } 5040 if (f & SCF_DO_STCLASS_AND) { 5041 if (flags & SCF_DO_STCLASS_OR) { 5042 /* OR before, AND after: ideally we would recurse with 5043 * data_fake to get the AND applied by study of the 5044 * remainder of the pattern, and then derecurse; 5045 * *** HACK *** for now just treat as "no information". 5046 * See [perl #56690]. 5047 */ 5048 ssc_init(pRExC_state, data->start_class); 5049 } else { 5050 /* AND before and after: combine and continue. These 5051 * assertions are zero-length, so can match an EMPTY 5052 * string */ 5053 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); 5054 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; 5055 } 5056 } 5057 } 5058 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY 5059 else { 5060 /* Positive Lookahead/lookbehind 5061 In this case we can do fixed string optimisation, 5062 but we must be careful about it. Note in the case of 5063 lookbehind the positions will be offset by the minimum 5064 length of the pattern, something we won't know about 5065 until after the recurse. 5066 */ 5067 SSize_t deltanext, fake = 0; 5068 regnode *nscan; 5069 regnode_ssc intrnl; 5070 int f = 0; 5071 /* We use SAVEFREEPV so that when the full compile 5072 is finished perl will clean up the allocated 5073 minlens when it's all done. This way we don't 5074 have to worry about freeing them when we know 5075 they wont be used, which would be a pain. 5076 */ 5077 SSize_t *minnextp; 5078 Newx( minnextp, 1, SSize_t ); 5079 SAVEFREEPV(minnextp); 5080 5081 if (data) { 5082 StructCopy(data, &data_fake, scan_data_t); 5083 if ((flags & SCF_DO_SUBSTR) && data->last_found) { 5084 f |= SCF_DO_SUBSTR; 5085 if (scan->flags) 5086 scan_commit(pRExC_state, &data_fake, minlenp, is_inf); 5087 data_fake.last_found=newSVsv(data->last_found); 5088 } 5089 } 5090 else 5091 data_fake.last_closep = &fake; 5092 data_fake.flags = 0; 5093 data_fake.pos_delta = delta; 5094 if (is_inf) 5095 data_fake.flags |= SF_IS_INF; 5096 if ( flags & SCF_DO_STCLASS && !scan->flags 5097 && OP(scan) == IFMATCH ) { /* Lookahead */ 5098 ssc_init(pRExC_state, &intrnl); 5099 data_fake.start_class = &intrnl; 5100 f |= SCF_DO_STCLASS_AND; 5101 } 5102 if (flags & SCF_WHILEM_VISITED_POS) 5103 f |= SCF_WHILEM_VISITED_POS; 5104 next = regnext(scan); 5105 nscan = NEXTOPER(NEXTOPER(scan)); 5106 5107 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, 5108 &deltanext, last, &data_fake, 5109 stopparen, recursed_depth, NULL, 5110 f,depth+1); 5111 if (scan->flags) { 5112 if (deltanext) { 5113 FAIL("Variable length lookbehind not implemented"); 5114 } 5115 else if (*minnextp > (I32)U8_MAX) { 5116 FAIL2("Lookbehind longer than %"UVuf" not implemented", 5117 (UV)U8_MAX); 5118 } 5119 scan->flags = (U8)*minnextp; 5120 } 5121 5122 *minnextp += min; 5123 5124 if (f & SCF_DO_STCLASS_AND) { 5125 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); 5126 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; 5127 } 5128 if (data) { 5129 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 5130 pars++; 5131 if (data_fake.flags & SF_HAS_EVAL) 5132 data->flags |= SF_HAS_EVAL; 5133 data->whilem_c = data_fake.whilem_c; 5134 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { 5135 if (RExC_rx->minlen<*minnextp) 5136 RExC_rx->minlen=*minnextp; 5137 scan_commit(pRExC_state, &data_fake, minnextp, is_inf); 5138 SvREFCNT_dec_NN(data_fake.last_found); 5139 5140 if ( data_fake.minlen_fixed != minlenp ) 5141 { 5142 data->offset_fixed= data_fake.offset_fixed; 5143 data->minlen_fixed= data_fake.minlen_fixed; 5144 data->lookbehind_fixed+= scan->flags; 5145 } 5146 if ( data_fake.minlen_float != minlenp ) 5147 { 5148 data->minlen_float= data_fake.minlen_float; 5149 data->offset_float_min=data_fake.offset_float_min; 5150 data->offset_float_max=data_fake.offset_float_max; 5151 data->lookbehind_float+= scan->flags; 5152 } 5153 } 5154 } 5155 } 5156 #endif 5157 } 5158 else if (OP(scan) == OPEN) { 5159 if (stopparen != (I32)ARG(scan)) 5160 pars++; 5161 } 5162 else if (OP(scan) == CLOSE) { 5163 if (stopparen == (I32)ARG(scan)) { 5164 break; 5165 } 5166 if ((I32)ARG(scan) == is_par) { 5167 next = regnext(scan); 5168 5169 if ( next && (OP(next) != WHILEM) && next < last) 5170 is_par = 0; /* Disable optimization */ 5171 } 5172 if (data) 5173 *(data->last_closep) = ARG(scan); 5174 } 5175 else if (OP(scan) == EVAL) { 5176 if (data) 5177 data->flags |= SF_HAS_EVAL; 5178 } 5179 else if ( PL_regkind[OP(scan)] == ENDLIKE ) { 5180 if (flags & SCF_DO_SUBSTR) { 5181 scan_commit(pRExC_state, data, minlenp, is_inf); 5182 flags &= ~SCF_DO_SUBSTR; 5183 } 5184 if (data && OP(scan)==ACCEPT) { 5185 data->flags |= SCF_SEEN_ACCEPT; 5186 if (stopmin > min) 5187 stopmin = min; 5188 } 5189 } 5190 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ 5191 { 5192 if (flags & SCF_DO_SUBSTR) { 5193 scan_commit(pRExC_state, data, minlenp, is_inf); 5194 data->longest = &(data->longest_float); 5195 } 5196 is_inf = is_inf_internal = 1; 5197 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 5198 ssc_anything(data->start_class); 5199 flags &= ~SCF_DO_STCLASS; 5200 } 5201 else if (OP(scan) == GPOS) { 5202 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && 5203 !(delta || is_inf || (data && data->pos_delta))) 5204 { 5205 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) 5206 RExC_rx->intflags |= PREGf_ANCH_GPOS; 5207 if (RExC_rx->gofs < (STRLEN)min) 5208 RExC_rx->gofs = min; 5209 } else { 5210 RExC_rx->intflags |= PREGf_GPOS_FLOAT; 5211 RExC_rx->gofs = 0; 5212 } 5213 } 5214 #ifdef TRIE_STUDY_OPT 5215 #ifdef FULL_TRIE_STUDY 5216 else if (PL_regkind[OP(scan)] == TRIE) { 5217 /* NOTE - There is similar code to this block above for handling 5218 BRANCH nodes on the initial study. If you change stuff here 5219 check there too. */ 5220 regnode *trie_node= scan; 5221 regnode *tail= regnext(scan); 5222 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; 5223 SSize_t max1 = 0, min1 = SSize_t_MAX; 5224 regnode_ssc accum; 5225 5226 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ 5227 /* Cannot merge strings after this. */ 5228 scan_commit(pRExC_state, data, minlenp, is_inf); 5229 } 5230 if (flags & SCF_DO_STCLASS) 5231 ssc_init_zero(pRExC_state, &accum); 5232 5233 if (!trie->jump) { 5234 min1= trie->minlen; 5235 max1= trie->maxlen; 5236 } else { 5237 const regnode *nextbranch= NULL; 5238 U32 word; 5239 5240 for ( word=1 ; word <= trie->wordcount ; word++) 5241 { 5242 SSize_t deltanext=0, minnext=0, f = 0, fake; 5243 regnode_ssc this_class; 5244 5245 data_fake.flags = 0; 5246 if (data) { 5247 data_fake.whilem_c = data->whilem_c; 5248 data_fake.last_closep = data->last_closep; 5249 } 5250 else 5251 data_fake.last_closep = &fake; 5252 data_fake.pos_delta = delta; 5253 if (flags & SCF_DO_STCLASS) { 5254 ssc_init(pRExC_state, &this_class); 5255 data_fake.start_class = &this_class; 5256 f = SCF_DO_STCLASS_AND; 5257 } 5258 if (flags & SCF_WHILEM_VISITED_POS) 5259 f |= SCF_WHILEM_VISITED_POS; 5260 5261 if (trie->jump[word]) { 5262 if (!nextbranch) 5263 nextbranch = trie_node + trie->jump[0]; 5264 scan= trie_node + trie->jump[word]; 5265 /* We go from the jump point to the branch that follows 5266 it. Note this means we need the vestigal unused 5267 branches even though they arent otherwise used. */ 5268 minnext = study_chunk(pRExC_state, &scan, minlenp, 5269 &deltanext, (regnode *)nextbranch, &data_fake, 5270 stopparen, recursed_depth, NULL, f,depth+1); 5271 } 5272 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) 5273 nextbranch= regnext((regnode*)nextbranch); 5274 5275 if (min1 > (SSize_t)(minnext + trie->minlen)) 5276 min1 = minnext + trie->minlen; 5277 if (deltanext == SSize_t_MAX) { 5278 is_inf = is_inf_internal = 1; 5279 max1 = SSize_t_MAX; 5280 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) 5281 max1 = minnext + deltanext + trie->maxlen; 5282 5283 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 5284 pars++; 5285 if (data_fake.flags & SCF_SEEN_ACCEPT) { 5286 if ( stopmin > min + min1) 5287 stopmin = min + min1; 5288 flags &= ~SCF_DO_SUBSTR; 5289 if (data) 5290 data->flags |= SCF_SEEN_ACCEPT; 5291 } 5292 if (data) { 5293 if (data_fake.flags & SF_HAS_EVAL) 5294 data->flags |= SF_HAS_EVAL; 5295 data->whilem_c = data_fake.whilem_c; 5296 } 5297 if (flags & SCF_DO_STCLASS) 5298 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); 5299 } 5300 } 5301 if (flags & SCF_DO_SUBSTR) { 5302 data->pos_min += min1; 5303 data->pos_delta += max1 - min1; 5304 if (max1 != min1 || is_inf) 5305 data->longest = &(data->longest_float); 5306 } 5307 min += min1; 5308 delta += max1 - min1; 5309 if (flags & SCF_DO_STCLASS_OR) { 5310 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); 5311 if (min1) { 5312 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 5313 flags &= ~SCF_DO_STCLASS; 5314 } 5315 } 5316 else if (flags & SCF_DO_STCLASS_AND) { 5317 if (min1) { 5318 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); 5319 flags &= ~SCF_DO_STCLASS; 5320 } 5321 else { 5322 /* Switch to OR mode: cache the old value of 5323 * data->start_class */ 5324 INIT_AND_WITHP; 5325 StructCopy(data->start_class, and_withp, regnode_ssc); 5326 flags &= ~SCF_DO_STCLASS_AND; 5327 StructCopy(&accum, data->start_class, regnode_ssc); 5328 flags |= SCF_DO_STCLASS_OR; 5329 } 5330 } 5331 scan= tail; 5332 continue; 5333 } 5334 #else 5335 else if (PL_regkind[OP(scan)] == TRIE) { 5336 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; 5337 U8*bang=NULL; 5338 5339 min += trie->minlen; 5340 delta += (trie->maxlen - trie->minlen); 5341 flags &= ~SCF_DO_STCLASS; /* xxx */ 5342 if (flags & SCF_DO_SUBSTR) { 5343 /* Cannot expect anything... */ 5344 scan_commit(pRExC_state, data, minlenp, is_inf); 5345 data->pos_min += trie->minlen; 5346 data->pos_delta += (trie->maxlen - trie->minlen); 5347 if (trie->maxlen != trie->minlen) 5348 data->longest = &(data->longest_float); 5349 } 5350 if (trie->jump) /* no more substrings -- for now /grr*/ 5351 flags &= ~SCF_DO_SUBSTR; 5352 } 5353 #endif /* old or new */ 5354 #endif /* TRIE_STUDY_OPT */ 5355 5356 /* Else: zero-length, ignore. */ 5357 scan = regnext(scan); 5358 } 5359 /* If we are exiting a recursion we can unset its recursed bit 5360 * and allow ourselves to enter it again - no danger of an 5361 * infinite loop there. 5362 if (stopparen > -1 && recursed) { 5363 DEBUG_STUDYDATA("unset:", data,depth); 5364 PAREN_UNSET( recursed, stopparen); 5365 } 5366 */ 5367 if (frame) { 5368 DEBUG_STUDYDATA("frame-end:",data,depth); 5369 DEBUG_PEEP("fend", scan, depth); 5370 /* restore previous context */ 5371 last = frame->last; 5372 scan = frame->next; 5373 stopparen = frame->stop; 5374 recursed_depth = frame->prev_recursed_depth; 5375 depth = depth - 1; 5376 5377 frame = frame->prev; 5378 goto fake_study_recurse; 5379 } 5380 5381 finish: 5382 assert(!frame); 5383 DEBUG_STUDYDATA("pre-fin:",data,depth); 5384 5385 *scanp = scan; 5386 *deltap = is_inf_internal ? SSize_t_MAX : delta; 5387 5388 if (flags & SCF_DO_SUBSTR && is_inf) 5389 data->pos_delta = SSize_t_MAX - data->pos_min; 5390 if (is_par > (I32)U8_MAX) 5391 is_par = 0; 5392 if (is_par && pars==1 && data) { 5393 data->flags |= SF_IN_PAR; 5394 data->flags &= ~SF_HAS_PAR; 5395 } 5396 else if (pars && data) { 5397 data->flags |= SF_HAS_PAR; 5398 data->flags &= ~SF_IN_PAR; 5399 } 5400 if (flags & SCF_DO_STCLASS_OR) 5401 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 5402 if (flags & SCF_TRIE_RESTUDY) 5403 data->flags |= SCF_TRIE_RESTUDY; 5404 5405 DEBUG_STUDYDATA("post-fin:",data,depth); 5406 5407 { 5408 SSize_t final_minlen= min < stopmin ? min : stopmin; 5409 5410 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { 5411 RExC_maxlen = final_minlen + delta; 5412 } 5413 return final_minlen; 5414 } 5415 /* not-reached */ 5416 } 5417 5418 STATIC U32 5419 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) 5420 { 5421 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; 5422 5423 PERL_ARGS_ASSERT_ADD_DATA; 5424 5425 Renewc(RExC_rxi->data, 5426 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), 5427 char, struct reg_data); 5428 if(count) 5429 Renew(RExC_rxi->data->what, count + n, U8); 5430 else 5431 Newx(RExC_rxi->data->what, n, U8); 5432 RExC_rxi->data->count = count + n; 5433 Copy(s, RExC_rxi->data->what + count, n, U8); 5434 return count; 5435 } 5436 5437 /*XXX: todo make this not included in a non debugging perl */ 5438 #ifndef PERL_IN_XSUB_RE 5439 void 5440 Perl_reginitcolors(pTHX) 5441 { 5442 dVAR; 5443 const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); 5444 if (s) { 5445 char *t = savepv(s); 5446 int i = 0; 5447 PL_colors[0] = t; 5448 while (++i < 6) { 5449 t = strchr(t, '\t'); 5450 if (t) { 5451 *t = '\0'; 5452 PL_colors[i] = ++t; 5453 } 5454 else 5455 PL_colors[i] = t = (char *)""; 5456 } 5457 } else { 5458 int i = 0; 5459 while (i < 6) 5460 PL_colors[i++] = (char *)""; 5461 } 5462 PL_colorset = 1; 5463 } 5464 #endif 5465 5466 5467 #ifdef TRIE_STUDY_OPT 5468 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \ 5469 STMT_START { \ 5470 if ( \ 5471 (data.flags & SCF_TRIE_RESTUDY) \ 5472 && ! restudied++ \ 5473 ) { \ 5474 dOsomething; \ 5475 goto reStudy; \ 5476 } \ 5477 } STMT_END 5478 #else 5479 #define CHECK_RESTUDY_GOTO_butfirst 5480 #endif 5481 5482 /* 5483 * pregcomp - compile a regular expression into internal code 5484 * 5485 * Decides which engine's compiler to call based on the hint currently in 5486 * scope 5487 */ 5488 5489 #ifndef PERL_IN_XSUB_RE 5490 5491 /* return the currently in-scope regex engine (or the default if none) */ 5492 5493 regexp_engine const * 5494 Perl_current_re_engine(pTHX) 5495 { 5496 dVAR; 5497 5498 if (IN_PERL_COMPILETIME) { 5499 HV * const table = GvHV(PL_hintgv); 5500 SV **ptr; 5501 5502 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) 5503 return &PL_core_reg_engine; 5504 ptr = hv_fetchs(table, "regcomp", FALSE); 5505 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) 5506 return &PL_core_reg_engine; 5507 return INT2PTR(regexp_engine*,SvIV(*ptr)); 5508 } 5509 else { 5510 SV *ptr; 5511 if (!PL_curcop->cop_hints_hash) 5512 return &PL_core_reg_engine; 5513 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); 5514 if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) 5515 return &PL_core_reg_engine; 5516 return INT2PTR(regexp_engine*,SvIV(ptr)); 5517 } 5518 } 5519 5520 5521 REGEXP * 5522 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) 5523 { 5524 dVAR; 5525 regexp_engine const *eng = current_re_engine(); 5526 GET_RE_DEBUG_FLAGS_DECL; 5527 5528 PERL_ARGS_ASSERT_PREGCOMP; 5529 5530 /* Dispatch a request to compile a regexp to correct regexp engine. */ 5531 DEBUG_COMPILE_r({ 5532 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", 5533 PTR2UV(eng)); 5534 }); 5535 return CALLREGCOMP_ENG(eng, pattern, flags); 5536 } 5537 #endif 5538 5539 /* public(ish) entry point for the perl core's own regex compiling code. 5540 * It's actually a wrapper for Perl_re_op_compile that only takes an SV 5541 * pattern rather than a list of OPs, and uses the internal engine rather 5542 * than the current one */ 5543 5544 REGEXP * 5545 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) 5546 { 5547 SV *pat = pattern; /* defeat constness! */ 5548 PERL_ARGS_ASSERT_RE_COMPILE; 5549 return Perl_re_op_compile(aTHX_ &pat, 1, NULL, 5550 #ifdef PERL_IN_XSUB_RE 5551 &my_reg_engine, 5552 #else 5553 &PL_core_reg_engine, 5554 #endif 5555 NULL, NULL, rx_flags, 0); 5556 } 5557 5558 5559 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code 5560 * blocks, recalculate the indices. Update pat_p and plen_p in-place to 5561 * point to the realloced string and length. 5562 * 5563 * This is essentially a copy of Perl_bytes_to_utf8() with the code index 5564 * stuff added */ 5565 5566 static void 5567 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, 5568 char **pat_p, STRLEN *plen_p, int num_code_blocks) 5569 { 5570 U8 *const src = (U8*)*pat_p; 5571 U8 *dst; 5572 int n=0; 5573 STRLEN s = 0, d = 0; 5574 bool do_end = 0; 5575 GET_RE_DEBUG_FLAGS_DECL; 5576 5577 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, 5578 "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); 5579 5580 Newx(dst, *plen_p * 2 + 1, U8); 5581 5582 while (s < *plen_p) { 5583 if (NATIVE_BYTE_IS_INVARIANT(src[s])) 5584 dst[d] = src[s]; 5585 else { 5586 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); 5587 dst[d] = UTF8_EIGHT_BIT_LO(src[s]); 5588 } 5589 if (n < num_code_blocks) { 5590 if (!do_end && pRExC_state->code_blocks[n].start == s) { 5591 pRExC_state->code_blocks[n].start = d; 5592 assert(dst[d] == '('); 5593 do_end = 1; 5594 } 5595 else if (do_end && pRExC_state->code_blocks[n].end == s) { 5596 pRExC_state->code_blocks[n].end = d; 5597 assert(dst[d] == ')'); 5598 do_end = 0; 5599 n++; 5600 } 5601 } 5602 s++; 5603 d++; 5604 } 5605 dst[d] = '\0'; 5606 *plen_p = d; 5607 *pat_p = (char*) dst; 5608 SAVEFREEPV(*pat_p); 5609 RExC_orig_utf8 = RExC_utf8 = 1; 5610 } 5611 5612 5613 5614 /* S_concat_pat(): concatenate a list of args to the pattern string pat, 5615 * while recording any code block indices, and handling overloading, 5616 * nested qr// objects etc. If pat is null, it will allocate a new 5617 * string, or just return the first arg, if there's only one. 5618 * 5619 * Returns the malloced/updated pat. 5620 * patternp and pat_count is the array of SVs to be concatted; 5621 * oplist is the optional list of ops that generated the SVs; 5622 * recompile_p is a pointer to a boolean that will be set if 5623 * the regex will need to be recompiled. 5624 * delim, if non-null is an SV that will be inserted between each element 5625 */ 5626 5627 static SV* 5628 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, 5629 SV *pat, SV ** const patternp, int pat_count, 5630 OP *oplist, bool *recompile_p, SV *delim) 5631 { 5632 SV **svp; 5633 int n = 0; 5634 bool use_delim = FALSE; 5635 bool alloced = FALSE; 5636 5637 /* if we know we have at least two args, create an empty string, 5638 * then concatenate args to that. For no args, return an empty string */ 5639 if (!pat && pat_count != 1) { 5640 pat = newSVpvn("", 0); 5641 SAVEFREESV(pat); 5642 alloced = TRUE; 5643 } 5644 5645 for (svp = patternp; svp < patternp + pat_count; svp++) { 5646 SV *sv; 5647 SV *rx = NULL; 5648 STRLEN orig_patlen = 0; 5649 bool code = 0; 5650 SV *msv = use_delim ? delim : *svp; 5651 if (!msv) msv = &PL_sv_undef; 5652 5653 /* if we've got a delimiter, we go round the loop twice for each 5654 * svp slot (except the last), using the delimiter the second 5655 * time round */ 5656 if (use_delim) { 5657 svp--; 5658 use_delim = FALSE; 5659 } 5660 else if (delim) 5661 use_delim = TRUE; 5662 5663 if (SvTYPE(msv) == SVt_PVAV) { 5664 /* we've encountered an interpolated array within 5665 * the pattern, e.g. /...@a..../. Expand the list of elements, 5666 * then recursively append elements. 5667 * The code in this block is based on S_pushav() */ 5668 5669 AV *const av = (AV*)msv; 5670 const SSize_t maxarg = AvFILL(av) + 1; 5671 SV **array; 5672 5673 if (oplist) { 5674 assert(oplist->op_type == OP_PADAV 5675 || oplist->op_type == OP_RV2AV); 5676 oplist = oplist->op_sibling;; 5677 } 5678 5679 if (SvRMAGICAL(av)) { 5680 SSize_t i; 5681 5682 Newx(array, maxarg, SV*); 5683 SAVEFREEPV(array); 5684 for (i=0; i < maxarg; i++) { 5685 SV ** const svp = av_fetch(av, i, FALSE); 5686 array[i] = svp ? *svp : &PL_sv_undef; 5687 } 5688 } 5689 else 5690 array = AvARRAY(av); 5691 5692 pat = S_concat_pat(aTHX_ pRExC_state, pat, 5693 array, maxarg, NULL, recompile_p, 5694 /* $" */ 5695 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV)))); 5696 5697 continue; 5698 } 5699 5700 5701 /* we make the assumption here that each op in the list of 5702 * op_siblings maps to one SV pushed onto the stack, 5703 * except for code blocks, with have both an OP_NULL and 5704 * and OP_CONST. 5705 * This allows us to match up the list of SVs against the 5706 * list of OPs to find the next code block. 5707 * 5708 * Note that PUSHMARK PADSV PADSV .. 5709 * is optimised to 5710 * PADRANGE PADSV PADSV .. 5711 * so the alignment still works. */ 5712 5713 if (oplist) { 5714 if (oplist->op_type == OP_NULL 5715 && (oplist->op_flags & OPf_SPECIAL)) 5716 { 5717 assert(n < pRExC_state->num_code_blocks); 5718 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0; 5719 pRExC_state->code_blocks[n].block = oplist; 5720 pRExC_state->code_blocks[n].src_regex = NULL; 5721 n++; 5722 code = 1; 5723 oplist = oplist->op_sibling; /* skip CONST */ 5724 assert(oplist); 5725 } 5726 oplist = oplist->op_sibling;; 5727 } 5728 5729 /* apply magic and QR overloading to arg */ 5730 5731 SvGETMAGIC(msv); 5732 if (SvROK(msv) && SvAMAGIC(msv)) { 5733 SV *sv = AMG_CALLunary(msv, regexp_amg); 5734 if (sv) { 5735 if (SvROK(sv)) 5736 sv = SvRV(sv); 5737 if (SvTYPE(sv) != SVt_REGEXP) 5738 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP"); 5739 msv = sv; 5740 } 5741 } 5742 5743 /* try concatenation overload ... */ 5744 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) && 5745 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) 5746 { 5747 sv_setsv(pat, sv); 5748 /* overloading involved: all bets are off over literal 5749 * code. Pretend we haven't seen it */ 5750 pRExC_state->num_code_blocks -= n; 5751 n = 0; 5752 } 5753 else { 5754 /* ... or failing that, try "" overload */ 5755 while (SvAMAGIC(msv) 5756 && (sv = AMG_CALLunary(msv, string_amg)) 5757 && sv != msv 5758 && !( SvROK(msv) 5759 && SvROK(sv) 5760 && SvRV(msv) == SvRV(sv)) 5761 ) { 5762 msv = sv; 5763 SvGETMAGIC(msv); 5764 } 5765 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) 5766 msv = SvRV(msv); 5767 5768 if (pat) { 5769 /* this is a partially unrolled 5770 * sv_catsv_nomg(pat, msv); 5771 * that allows us to adjust code block indices if 5772 * needed */ 5773 STRLEN dlen; 5774 char *dst = SvPV_force_nomg(pat, dlen); 5775 orig_patlen = dlen; 5776 if (SvUTF8(msv) && !SvUTF8(pat)) { 5777 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); 5778 sv_setpvn(pat, dst, dlen); 5779 SvUTF8_on(pat); 5780 } 5781 sv_catsv_nomg(pat, msv); 5782 rx = msv; 5783 } 5784 else 5785 pat = msv; 5786 5787 if (code) 5788 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; 5789 } 5790 5791 /* extract any code blocks within any embedded qr//'s */ 5792 if (rx && SvTYPE(rx) == SVt_REGEXP 5793 && RX_ENGINE((REGEXP*)rx)->op_comp) 5794 { 5795 5796 RXi_GET_DECL(ReANY((REGEXP *)rx), ri); 5797 if (ri->num_code_blocks) { 5798 int i; 5799 /* the presence of an embedded qr// with code means 5800 * we should always recompile: the text of the 5801 * qr// may not have changed, but it may be a 5802 * different closure than last time */ 5803 *recompile_p = 1; 5804 Renew(pRExC_state->code_blocks, 5805 pRExC_state->num_code_blocks + ri->num_code_blocks, 5806 struct reg_code_block); 5807 pRExC_state->num_code_blocks += ri->num_code_blocks; 5808 5809 for (i=0; i < ri->num_code_blocks; i++) { 5810 struct reg_code_block *src, *dst; 5811 STRLEN offset = orig_patlen 5812 + ReANY((REGEXP *)rx)->pre_prefix; 5813 assert(n < pRExC_state->num_code_blocks); 5814 src = &ri->code_blocks[i]; 5815 dst = &pRExC_state->code_blocks[n]; 5816 dst->start = src->start + offset; 5817 dst->end = src->end + offset; 5818 dst->block = src->block; 5819 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) 5820 src->src_regex 5821 ? src->src_regex 5822 : (REGEXP*)rx); 5823 n++; 5824 } 5825 } 5826 } 5827 } 5828 /* avoid calling magic multiple times on a single element e.g. =~ $qr */ 5829 if (alloced) 5830 SvSETMAGIC(pat); 5831 5832 return pat; 5833 } 5834 5835 5836 5837 /* see if there are any run-time code blocks in the pattern. 5838 * False positives are allowed */ 5839 5840 static bool 5841 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, 5842 char *pat, STRLEN plen) 5843 { 5844 int n = 0; 5845 STRLEN s; 5846 5847 for (s = 0; s < plen; s++) { 5848 if (n < pRExC_state->num_code_blocks 5849 && s == pRExC_state->code_blocks[n].start) 5850 { 5851 s = pRExC_state->code_blocks[n].end; 5852 n++; 5853 continue; 5854 } 5855 /* TODO ideally should handle [..], (#..), /#.../x to reduce false 5856 * positives here */ 5857 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' && 5858 (pat[s+2] == '{' 5859 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{')) 5860 ) 5861 return 1; 5862 } 5863 return 0; 5864 } 5865 5866 /* Handle run-time code blocks. We will already have compiled any direct 5867 * or indirect literal code blocks. Now, take the pattern 'pat' and make a 5868 * copy of it, but with any literal code blocks blanked out and 5869 * appropriate chars escaped; then feed it into 5870 * 5871 * eval "qr'modified_pattern'" 5872 * 5873 * For example, 5874 * 5875 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno 5876 * 5877 * becomes 5878 * 5879 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno' 5880 * 5881 * After eval_sv()-ing that, grab any new code blocks from the returned qr 5882 * and merge them with any code blocks of the original regexp. 5883 * 5884 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; 5885 * instead, just save the qr and return FALSE; this tells our caller that 5886 * the original pattern needs upgrading to utf8. 5887 */ 5888 5889 static bool 5890 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, 5891 char *pat, STRLEN plen) 5892 { 5893 SV *qr; 5894 5895 GET_RE_DEBUG_FLAGS_DECL; 5896 5897 if (pRExC_state->runtime_code_qr) { 5898 /* this is the second time we've been called; this should 5899 * only happen if the main pattern got upgraded to utf8 5900 * during compilation; re-use the qr we compiled first time 5901 * round (which should be utf8 too) 5902 */ 5903 qr = pRExC_state->runtime_code_qr; 5904 pRExC_state->runtime_code_qr = NULL; 5905 assert(RExC_utf8 && SvUTF8(qr)); 5906 } 5907 else { 5908 int n = 0; 5909 STRLEN s; 5910 char *p, *newpat; 5911 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */ 5912 SV *sv, *qr_ref; 5913 dSP; 5914 5915 /* determine how many extra chars we need for ' and \ escaping */ 5916 for (s = 0; s < plen; s++) { 5917 if (pat[s] == '\'' || pat[s] == '\\') 5918 newlen++; 5919 } 5920 5921 Newx(newpat, newlen, char); 5922 p = newpat; 5923 *p++ = 'q'; *p++ = 'r'; *p++ = '\''; 5924 5925 for (s = 0; s < plen; s++) { 5926 if (n < pRExC_state->num_code_blocks 5927 && s == pRExC_state->code_blocks[n].start) 5928 { 5929 /* blank out literal code block */ 5930 assert(pat[s] == '('); 5931 while (s <= pRExC_state->code_blocks[n].end) { 5932 *p++ = '_'; 5933 s++; 5934 } 5935 s--; 5936 n++; 5937 continue; 5938 } 5939 if (pat[s] == '\'' || pat[s] == '\\') 5940 *p++ = '\\'; 5941 *p++ = pat[s]; 5942 } 5943 *p++ = '\''; 5944 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) 5945 *p++ = 'x'; 5946 *p++ = '\0'; 5947 DEBUG_COMPILE_r({ 5948 PerlIO_printf(Perl_debug_log, 5949 "%sre-parsing pattern for runtime code:%s %s\n", 5950 PL_colors[4],PL_colors[5],newpat); 5951 }); 5952 5953 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); 5954 Safefree(newpat); 5955 5956 ENTER; 5957 SAVETMPS; 5958 save_re_context(); 5959 PUSHSTACKi(PERLSI_REQUIRE); 5960 /* G_RE_REPARSING causes the toker to collapse \\ into \ when 5961 * parsing qr''; normally only q'' does this. It also alters 5962 * hints handling */ 5963 eval_sv(sv, G_SCALAR|G_RE_REPARSING); 5964 SvREFCNT_dec_NN(sv); 5965 SPAGAIN; 5966 qr_ref = POPs; 5967 PUTBACK; 5968 { 5969 SV * const errsv = ERRSV; 5970 if (SvTRUE_NN(errsv)) 5971 { 5972 Safefree(pRExC_state->code_blocks); 5973 /* use croak_sv ? */ 5974 Perl_croak_nocontext("%"SVf, SVfARG(errsv)); 5975 } 5976 } 5977 assert(SvROK(qr_ref)); 5978 qr = SvRV(qr_ref); 5979 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); 5980 /* the leaving below frees the tmp qr_ref. 5981 * Give qr a life of its own */ 5982 SvREFCNT_inc(qr); 5983 POPSTACK; 5984 FREETMPS; 5985 LEAVE; 5986 5987 } 5988 5989 if (!RExC_utf8 && SvUTF8(qr)) { 5990 /* first time through; the pattern got upgraded; save the 5991 * qr for the next time through */ 5992 assert(!pRExC_state->runtime_code_qr); 5993 pRExC_state->runtime_code_qr = qr; 5994 return 0; 5995 } 5996 5997 5998 /* extract any code blocks within the returned qr// */ 5999 6000 6001 /* merge the main (r1) and run-time (r2) code blocks into one */ 6002 { 6003 RXi_GET_DECL(ReANY((REGEXP *)qr), r2); 6004 struct reg_code_block *new_block, *dst; 6005 RExC_state_t * const r1 = pRExC_state; /* convenient alias */ 6006 int i1 = 0, i2 = 0; 6007 6008 if (!r2->num_code_blocks) /* we guessed wrong */ 6009 { 6010 SvREFCNT_dec_NN(qr); 6011 return 1; 6012 } 6013 6014 Newx(new_block, 6015 r1->num_code_blocks + r2->num_code_blocks, 6016 struct reg_code_block); 6017 dst = new_block; 6018 6019 while ( i1 < r1->num_code_blocks 6020 || i2 < r2->num_code_blocks) 6021 { 6022 struct reg_code_block *src; 6023 bool is_qr = 0; 6024 6025 if (i1 == r1->num_code_blocks) { 6026 src = &r2->code_blocks[i2++]; 6027 is_qr = 1; 6028 } 6029 else if (i2 == r2->num_code_blocks) 6030 src = &r1->code_blocks[i1++]; 6031 else if ( r1->code_blocks[i1].start 6032 < r2->code_blocks[i2].start) 6033 { 6034 src = &r1->code_blocks[i1++]; 6035 assert(src->end < r2->code_blocks[i2].start); 6036 } 6037 else { 6038 assert( r1->code_blocks[i1].start 6039 > r2->code_blocks[i2].start); 6040 src = &r2->code_blocks[i2++]; 6041 is_qr = 1; 6042 assert(src->end < r1->code_blocks[i1].start); 6043 } 6044 6045 assert(pat[src->start] == '('); 6046 assert(pat[src->end] == ')'); 6047 dst->start = src->start; 6048 dst->end = src->end; 6049 dst->block = src->block; 6050 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) 6051 : src->src_regex; 6052 dst++; 6053 } 6054 r1->num_code_blocks += r2->num_code_blocks; 6055 Safefree(r1->code_blocks); 6056 r1->code_blocks = new_block; 6057 } 6058 6059 SvREFCNT_dec_NN(qr); 6060 return 1; 6061 } 6062 6063 6064 STATIC bool 6065 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, 6066 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, 6067 SSize_t lookbehind, SSize_t offset, SSize_t *minlen, 6068 STRLEN longest_length, bool eol, bool meol) 6069 { 6070 /* This is the common code for setting up the floating and fixed length 6071 * string data extracted from Perl_re_op_compile() below. Returns a boolean 6072 * as to whether succeeded or not */ 6073 6074 I32 t; 6075 SSize_t ml; 6076 6077 if (! (longest_length 6078 || (eol /* Can't have SEOL and MULTI */ 6079 && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) 6080 ) 6081 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ 6082 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) 6083 { 6084 return FALSE; 6085 } 6086 6087 /* copy the information about the longest from the reg_scan_data 6088 over to the program. */ 6089 if (SvUTF8(sv_longest)) { 6090 *rx_utf8 = sv_longest; 6091 *rx_substr = NULL; 6092 } else { 6093 *rx_substr = sv_longest; 6094 *rx_utf8 = NULL; 6095 } 6096 /* end_shift is how many chars that must be matched that 6097 follow this item. We calculate it ahead of time as once the 6098 lookbehind offset is added in we lose the ability to correctly 6099 calculate it.*/ 6100 ml = minlen ? *(minlen) : (SSize_t)longest_length; 6101 *rx_end_shift = ml - offset 6102 - longest_length + (SvTAIL(sv_longest) != 0) 6103 + lookbehind; 6104 6105 t = (eol/* Can't have SEOL and MULTI */ 6106 && (! meol || (RExC_flags & RXf_PMf_MULTILINE))); 6107 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0); 6108 6109 return TRUE; 6110 } 6111 6112 /* 6113 * Perl_re_op_compile - the perl internal RE engine's function to compile a 6114 * regular expression into internal code. 6115 * The pattern may be passed either as: 6116 * a list of SVs (patternp plus pat_count) 6117 * a list of OPs (expr) 6118 * If both are passed, the SV list is used, but the OP list indicates 6119 * which SVs are actually pre-compiled code blocks 6120 * 6121 * The SVs in the list have magic and qr overloading applied to them (and 6122 * the list may be modified in-place with replacement SVs in the latter 6123 * case). 6124 * 6125 * If the pattern hasn't changed from old_re, then old_re will be 6126 * returned. 6127 * 6128 * eng is the current engine. If that engine has an op_comp method, then 6129 * handle directly (i.e. we assume that op_comp was us); otherwise, just 6130 * do the initial concatenation of arguments and pass on to the external 6131 * engine. 6132 * 6133 * If is_bare_re is not null, set it to a boolean indicating whether the 6134 * arg list reduced (after overloading) to a single bare regex which has 6135 * been returned (i.e. /$qr/). 6136 * 6137 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. 6138 * 6139 * pm_flags contains the PMf_* flags, typically based on those from the 6140 * pm_flags field of the related PMOP. Currently we're only interested in 6141 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL. 6142 * 6143 * We can't allocate space until we know how big the compiled form will be, 6144 * but we can't compile it (and thus know how big it is) until we've got a 6145 * place to put the code. So we cheat: we compile it twice, once with code 6146 * generation turned off and size counting turned on, and once "for real". 6147 * This also means that we don't allocate space until we are sure that the 6148 * thing really will compile successfully, and we never have to move the 6149 * code and thus invalidate pointers into it. (Note that it has to be in 6150 * one piece because free() must be able to free it all.) [NB: not true in perl] 6151 * 6152 * Beware that the optimization-preparation code in here knows about some 6153 * of the structure of the compiled regexp. [I'll say.] 6154 */ 6155 6156 REGEXP * 6157 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, 6158 OP *expr, const regexp_engine* eng, REGEXP *old_re, 6159 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags) 6160 { 6161 dVAR; 6162 REGEXP *rx; 6163 struct regexp *r; 6164 regexp_internal *ri; 6165 STRLEN plen; 6166 char *exp; 6167 regnode *scan; 6168 I32 flags; 6169 SSize_t minlen = 0; 6170 U32 rx_flags; 6171 SV *pat; 6172 SV *code_blocksv = NULL; 6173 SV** new_patternp = patternp; 6174 6175 /* these are all flags - maybe they should be turned 6176 * into a single int with different bit masks */ 6177 I32 sawlookahead = 0; 6178 I32 sawplus = 0; 6179 I32 sawopen = 0; 6180 I32 sawminmod = 0; 6181 6182 regex_charset initial_charset = get_regex_charset(orig_rx_flags); 6183 bool recompile = 0; 6184 bool runtime_code = 0; 6185 scan_data_t data; 6186 RExC_state_t RExC_state; 6187 RExC_state_t * const pRExC_state = &RExC_state; 6188 #ifdef TRIE_STUDY_OPT 6189 int restudied = 0; 6190 RExC_state_t copyRExC_state; 6191 #endif 6192 GET_RE_DEBUG_FLAGS_DECL; 6193 6194 PERL_ARGS_ASSERT_RE_OP_COMPILE; 6195 6196 DEBUG_r(if (!PL_colorset) reginitcolors()); 6197 6198 #ifndef PERL_IN_XSUB_RE 6199 /* Initialize these here instead of as-needed, as is quick and avoids 6200 * having to test them each time otherwise */ 6201 if (! PL_AboveLatin1) { 6202 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); 6203 PL_Latin1 = _new_invlist_C_array(Latin1_invlist); 6204 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); 6205 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); 6206 PL_HasMultiCharFold = 6207 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); 6208 } 6209 #endif 6210 6211 pRExC_state->code_blocks = NULL; 6212 pRExC_state->num_code_blocks = 0; 6213 6214 if (is_bare_re) 6215 *is_bare_re = FALSE; 6216 6217 if (expr && (expr->op_type == OP_LIST || 6218 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { 6219 /* allocate code_blocks if needed */ 6220 OP *o; 6221 int ncode = 0; 6222 6223 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) 6224 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) 6225 ncode++; /* count of DO blocks */ 6226 if (ncode) { 6227 pRExC_state->num_code_blocks = ncode; 6228 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block); 6229 } 6230 } 6231 6232 if (!pat_count) { 6233 /* compile-time pattern with just OP_CONSTs and DO blocks */ 6234 6235 int n; 6236 OP *o; 6237 6238 /* find how many CONSTs there are */ 6239 assert(expr); 6240 n = 0; 6241 if (expr->op_type == OP_CONST) 6242 n = 1; 6243 else 6244 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) { 6245 if (o->op_type == OP_CONST) 6246 n++; 6247 } 6248 6249 /* fake up an SV array */ 6250 6251 assert(!new_patternp); 6252 Newx(new_patternp, n, SV*); 6253 SAVEFREEPV(new_patternp); 6254 pat_count = n; 6255 6256 n = 0; 6257 if (expr->op_type == OP_CONST) 6258 new_patternp[n] = cSVOPx_sv(expr); 6259 else 6260 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) { 6261 if (o->op_type == OP_CONST) 6262 new_patternp[n++] = cSVOPo_sv; 6263 } 6264 6265 } 6266 6267 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, 6268 "Assembling pattern from %d elements%s\n", pat_count, 6269 orig_rx_flags & RXf_SPLIT ? " for split" : "")); 6270 6271 /* set expr to the first arg op */ 6272 6273 if (pRExC_state->num_code_blocks 6274 && expr->op_type != OP_CONST) 6275 { 6276 expr = cLISTOPx(expr)->op_first; 6277 assert( expr->op_type == OP_PUSHMARK 6278 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK) 6279 || expr->op_type == OP_PADRANGE); 6280 expr = expr->op_sibling; 6281 } 6282 6283 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count, 6284 expr, &recompile, NULL); 6285 6286 /* handle bare (possibly after overloading) regex: foo =~ $re */ 6287 { 6288 SV *re = pat; 6289 if (SvROK(re)) 6290 re = SvRV(re); 6291 if (SvTYPE(re) == SVt_REGEXP) { 6292 if (is_bare_re) 6293 *is_bare_re = TRUE; 6294 SvREFCNT_inc(re); 6295 Safefree(pRExC_state->code_blocks); 6296 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, 6297 "Precompiled pattern%s\n", 6298 orig_rx_flags & RXf_SPLIT ? " for split" : "")); 6299 6300 return (REGEXP*)re; 6301 } 6302 } 6303 6304 exp = SvPV_nomg(pat, plen); 6305 6306 if (!eng->op_comp) { 6307 if ((SvUTF8(pat) && IN_BYTES) 6308 || SvGMAGICAL(pat) || SvAMAGIC(pat)) 6309 { 6310 /* make a temporary copy; either to convert to bytes, 6311 * or to avoid repeating get-magic / overloaded stringify */ 6312 pat = newSVpvn_flags(exp, plen, SVs_TEMP | 6313 (IN_BYTES ? 0 : SvUTF8(pat))); 6314 } 6315 Safefree(pRExC_state->code_blocks); 6316 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); 6317 } 6318 6319 /* ignore the utf8ness if the pattern is 0 length */ 6320 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); 6321 RExC_uni_semantics = 0; 6322 RExC_contains_locale = 0; 6323 RExC_contains_i = 0; 6324 pRExC_state->runtime_code_qr = NULL; 6325 6326 DEBUG_COMPILE_r({ 6327 SV *dsv= sv_newmortal(); 6328 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60); 6329 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", 6330 PL_colors[4],PL_colors[5],s); 6331 }); 6332 6333 redo_first_pass: 6334 /* we jump here if we upgrade the pattern to utf8 and have to 6335 * recompile */ 6336 6337 if ((pm_flags & PMf_USE_RE_EVAL) 6338 /* this second condition covers the non-regex literal case, 6339 * i.e. $foo =~ '(?{})'. */ 6340 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL)) 6341 ) 6342 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); 6343 6344 /* return old regex if pattern hasn't changed */ 6345 /* XXX: note in the below we have to check the flags as well as the 6346 * pattern. 6347 * 6348 * Things get a touch tricky as we have to compare the utf8 flag 6349 * independently from the compile flags. */ 6350 6351 if ( old_re 6352 && !recompile 6353 && !!RX_UTF8(old_re) == !!RExC_utf8 6354 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) ) 6355 && RX_PRECOMP(old_re) 6356 && RX_PRELEN(old_re) == plen 6357 && memEQ(RX_PRECOMP(old_re), exp, plen) 6358 && !runtime_code /* with runtime code, always recompile */ ) 6359 { 6360 Safefree(pRExC_state->code_blocks); 6361 return old_re; 6362 } 6363 6364 rx_flags = orig_rx_flags; 6365 6366 if (rx_flags & PMf_FOLD) { 6367 RExC_contains_i = 1; 6368 } 6369 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { 6370 6371 /* Set to use unicode semantics if the pattern is in utf8 and has the 6372 * 'depends' charset specified, as it means unicode when utf8 */ 6373 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); 6374 } 6375 6376 RExC_precomp = exp; 6377 RExC_flags = rx_flags; 6378 RExC_pm_flags = pm_flags; 6379 6380 if (runtime_code) { 6381 if (TAINTING_get && TAINT_get) 6382 Perl_croak(aTHX_ "Eval-group in insecure regular expression"); 6383 6384 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { 6385 /* whoops, we have a non-utf8 pattern, whilst run-time code 6386 * got compiled as utf8. Try again with a utf8 pattern */ 6387 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, 6388 pRExC_state->num_code_blocks); 6389 goto redo_first_pass; 6390 } 6391 } 6392 assert(!pRExC_state->runtime_code_qr); 6393 6394 RExC_sawback = 0; 6395 6396 RExC_seen = 0; 6397 RExC_maxlen = 0; 6398 RExC_in_lookbehind = 0; 6399 RExC_seen_zerolen = *exp == '^' ? -1 : 0; 6400 RExC_extralen = 0; 6401 RExC_override_recoding = 0; 6402 RExC_in_multi_char_class = 0; 6403 6404 /* First pass: determine size, legality. */ 6405 RExC_parse = exp; 6406 RExC_start = exp; 6407 RExC_end = exp + plen; 6408 RExC_naughty = 0; 6409 RExC_npar = 1; 6410 RExC_nestroot = 0; 6411 RExC_size = 0L; 6412 RExC_emit = (regnode *) &RExC_emit_dummy; 6413 RExC_whilem_seen = 0; 6414 RExC_open_parens = NULL; 6415 RExC_close_parens = NULL; 6416 RExC_opend = NULL; 6417 RExC_paren_names = NULL; 6418 #ifdef DEBUGGING 6419 RExC_paren_name_list = NULL; 6420 #endif 6421 RExC_recurse = NULL; 6422 RExC_study_chunk_recursed = NULL; 6423 RExC_study_chunk_recursed_bytes= 0; 6424 RExC_recurse_count = 0; 6425 pRExC_state->code_index = 0; 6426 6427 #if 0 /* REGC() is (currently) a NOP at the first pass. 6428 * Clever compilers notice this and complain. --jhi */ 6429 REGC((U8)REG_MAGIC, (char*)RExC_emit); 6430 #endif 6431 DEBUG_PARSE_r( 6432 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); 6433 RExC_lastnum=0; 6434 RExC_lastparse=NULL; 6435 ); 6436 /* reg may croak on us, not giving us a chance to free 6437 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may 6438 need it to survive as long as the regexp (qr/(?{})/). 6439 We must check that code_blocksv is not already set, because we may 6440 have jumped back to restart the sizing pass. */ 6441 if (pRExC_state->code_blocks && !code_blocksv) { 6442 code_blocksv = newSV_type(SVt_PV); 6443 SAVEFREESV(code_blocksv); 6444 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks); 6445 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/ 6446 } 6447 if (reg(pRExC_state, 0, &flags,1) == NULL) { 6448 /* It's possible to write a regexp in ascii that represents Unicode 6449 codepoints outside of the byte range, such as via \x{100}. If we 6450 detect such a sequence we have to convert the entire pattern to utf8 6451 and then recompile, as our sizing calculation will have been based 6452 on 1 byte == 1 character, but we will need to use utf8 to encode 6453 at least some part of the pattern, and therefore must convert the whole 6454 thing. 6455 -- dmq */ 6456 if (flags & RESTART_UTF8) { 6457 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, 6458 pRExC_state->num_code_blocks); 6459 goto redo_first_pass; 6460 } 6461 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); 6462 } 6463 if (code_blocksv) 6464 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ 6465 6466 DEBUG_PARSE_r({ 6467 PerlIO_printf(Perl_debug_log, 6468 "Required size %"IVdf" nodes\n" 6469 "Starting second pass (creation)\n", 6470 (IV)RExC_size); 6471 RExC_lastnum=0; 6472 RExC_lastparse=NULL; 6473 }); 6474 6475 /* The first pass could have found things that force Unicode semantics */ 6476 if ((RExC_utf8 || RExC_uni_semantics) 6477 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET) 6478 { 6479 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); 6480 } 6481 6482 /* Small enough for pointer-storage convention? 6483 If extralen==0, this means that we will not need long jumps. */ 6484 if (RExC_size >= 0x10000L && RExC_extralen) 6485 RExC_size += RExC_extralen; 6486 else 6487 RExC_extralen = 0; 6488 if (RExC_whilem_seen > 15) 6489 RExC_whilem_seen = 15; 6490 6491 /* Allocate space and zero-initialize. Note, the two step process 6492 of zeroing when in debug mode, thus anything assigned has to 6493 happen after that */ 6494 rx = (REGEXP*) newSV_type(SVt_REGEXP); 6495 r = ReANY(rx); 6496 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), 6497 char, regexp_internal); 6498 if ( r == NULL || ri == NULL ) 6499 FAIL("Regexp out of space"); 6500 #ifdef DEBUGGING 6501 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ 6502 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), 6503 char); 6504 #else 6505 /* bulk initialize base fields with 0. */ 6506 Zero(ri, sizeof(regexp_internal), char); 6507 #endif 6508 6509 /* non-zero initialization begins here */ 6510 RXi_SET( r, ri ); 6511 r->engine= eng; 6512 r->extflags = rx_flags; 6513 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK; 6514 6515 if (pm_flags & PMf_IS_QR) { 6516 ri->code_blocks = pRExC_state->code_blocks; 6517 ri->num_code_blocks = pRExC_state->num_code_blocks; 6518 } 6519 else 6520 { 6521 int n; 6522 for (n = 0; n < pRExC_state->num_code_blocks; n++) 6523 if (pRExC_state->code_blocks[n].src_regex) 6524 SAVEFREESV(pRExC_state->code_blocks[n].src_regex); 6525 SAVEFREEPV(pRExC_state->code_blocks); 6526 } 6527 6528 { 6529 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); 6530 bool has_charset = (get_regex_charset(r->extflags) 6531 != REGEX_DEPENDS_CHARSET); 6532 6533 /* The caret is output if there are any defaults: if not all the STD 6534 * flags are set, or if no character set specifier is needed */ 6535 bool has_default = 6536 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) 6537 || ! has_charset); 6538 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) 6539 == REG_RUN_ON_COMMENT_SEEN); 6540 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) 6541 >> RXf_PMf_STD_PMMOD_SHIFT); 6542 const char *fptr = STD_PAT_MODS; /*"msix"*/ 6543 char *p; 6544 /* Allocate for the worst case, which is all the std flags are turned 6545 * on. If more precision is desired, we could do a population count of 6546 * the flags set. This could be done with a small lookup table, or by 6547 * shifting, masking and adding, or even, when available, assembly 6548 * language for a machine-language population count. 6549 * We never output a minus, as all those are defaults, so are 6550 * covered by the caret */ 6551 const STRLEN wraplen = plen + has_p + has_runon 6552 + has_default /* If needs a caret */ 6553 6554 /* If needs a character set specifier */ 6555 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) 6556 + (sizeof(STD_PAT_MODS) - 1) 6557 + (sizeof("(?:)") - 1); 6558 6559 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */ 6560 r->xpv_len_u.xpvlenu_pv = p; 6561 if (RExC_utf8) 6562 SvFLAGS(rx) |= SVf_UTF8; 6563 *p++='('; *p++='?'; 6564 6565 /* If a default, cover it using the caret */ 6566 if (has_default) { 6567 *p++= DEFAULT_PAT_MOD; 6568 } 6569 if (has_charset) { 6570 STRLEN len; 6571 const char* const name = get_regex_charset_name(r->extflags, &len); 6572 Copy(name, p, len, char); 6573 p += len; 6574 } 6575 if (has_p) 6576 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ 6577 { 6578 char ch; 6579 while((ch = *fptr++)) { 6580 if(reganch & 1) 6581 *p++ = ch; 6582 reganch >>= 1; 6583 } 6584 } 6585 6586 *p++ = ':'; 6587 Copy(RExC_precomp, p, plen, char); 6588 assert ((RX_WRAPPED(rx) - p) < 16); 6589 r->pre_prefix = p - RX_WRAPPED(rx); 6590 p += plen; 6591 if (has_runon) 6592 *p++ = '\n'; 6593 *p++ = ')'; 6594 *p = 0; 6595 SvCUR_set(rx, p - RX_WRAPPED(rx)); 6596 } 6597 6598 r->intflags = 0; 6599 r->nparens = RExC_npar - 1; /* set early to validate backrefs */ 6600 6601 /* setup various meta data about recursion, this all requires 6602 * RExC_npar to be correctly set, and a bit later on we clear it */ 6603 if (RExC_seen & REG_RECURSE_SEEN) { 6604 Newxz(RExC_open_parens, RExC_npar,regnode *); 6605 SAVEFREEPV(RExC_open_parens); 6606 Newxz(RExC_close_parens,RExC_npar,regnode *); 6607 SAVEFREEPV(RExC_close_parens); 6608 } 6609 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { 6610 /* Note, RExC_npar is 1 + the number of parens in a pattern. 6611 * So its 1 if there are no parens. */ 6612 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + 6613 ((RExC_npar & 0x07) != 0); 6614 Newx(RExC_study_chunk_recursed, 6615 RExC_study_chunk_recursed_bytes * RExC_npar, U8); 6616 SAVEFREEPV(RExC_study_chunk_recursed); 6617 } 6618 6619 /* Useful during FAIL. */ 6620 #ifdef RE_TRACK_PATTERN_OFFSETS 6621 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ 6622 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, 6623 "%s %"UVuf" bytes for offset annotations.\n", 6624 ri->u.offsets ? "Got" : "Couldn't get", 6625 (UV)((2*RExC_size+1) * sizeof(U32)))); 6626 #endif 6627 SetProgLen(ri,RExC_size); 6628 RExC_rx_sv = rx; 6629 RExC_rx = r; 6630 RExC_rxi = ri; 6631 6632 /* Second pass: emit code. */ 6633 RExC_flags = rx_flags; /* don't let top level (?i) bleed */ 6634 RExC_pm_flags = pm_flags; 6635 RExC_parse = exp; 6636 RExC_end = exp + plen; 6637 RExC_naughty = 0; 6638 RExC_npar = 1; 6639 RExC_emit_start = ri->program; 6640 RExC_emit = ri->program; 6641 RExC_emit_bound = ri->program + RExC_size + 1; 6642 pRExC_state->code_index = 0; 6643 6644 REGC((U8)REG_MAGIC, (char*) RExC_emit++); 6645 if (reg(pRExC_state, 0, &flags,1) == NULL) { 6646 ReREFCNT_dec(rx); 6647 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); 6648 } 6649 /* XXXX To minimize changes to RE engine we always allocate 6650 3-units-long substrs field. */ 6651 Newx(r->substrs, 1, struct reg_substr_data); 6652 if (RExC_recurse_count) { 6653 Newxz(RExC_recurse,RExC_recurse_count,regnode *); 6654 SAVEFREEPV(RExC_recurse); 6655 } 6656 6657 reStudy: 6658 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; 6659 Zero(r->substrs, 1, struct reg_substr_data); 6660 if (RExC_study_chunk_recursed) 6661 Zero(RExC_study_chunk_recursed, 6662 RExC_study_chunk_recursed_bytes * RExC_npar, U8); 6663 6664 #ifdef TRIE_STUDY_OPT 6665 if (!restudied) { 6666 StructCopy(&zero_scan_data, &data, scan_data_t); 6667 copyRExC_state = RExC_state; 6668 } else { 6669 U32 seen=RExC_seen; 6670 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); 6671 6672 RExC_state = copyRExC_state; 6673 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) 6674 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; 6675 else 6676 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; 6677 StructCopy(&zero_scan_data, &data, scan_data_t); 6678 } 6679 #else 6680 StructCopy(&zero_scan_data, &data, scan_data_t); 6681 #endif 6682 6683 /* Dig out information for optimizations. */ 6684 r->extflags = RExC_flags; /* was pm_op */ 6685 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ 6686 6687 if (UTF) 6688 SvUTF8_on(rx); /* Unicode in it? */ 6689 ri->regstclass = NULL; 6690 if (RExC_naughty >= 10) /* Probably an expensive pattern. */ 6691 r->intflags |= PREGf_NAUGHTY; 6692 scan = ri->program + 1; /* First BRANCH. */ 6693 6694 /* testing for BRANCH here tells us whether there is "must appear" 6695 data in the pattern. If there is then we can use it for optimisations */ 6696 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. 6697 */ 6698 SSize_t fake; 6699 STRLEN longest_float_length, longest_fixed_length; 6700 regnode_ssc ch_class; /* pointed to by data */ 6701 int stclass_flag; 6702 SSize_t last_close = 0; /* pointed to by data */ 6703 regnode *first= scan; 6704 regnode *first_next= regnext(first); 6705 /* 6706 * Skip introductions and multiplicators >= 1 6707 * so that we can extract the 'meat' of the pattern that must 6708 * match in the large if() sequence following. 6709 * NOTE that EXACT is NOT covered here, as it is normally 6710 * picked up by the optimiser separately. 6711 * 6712 * This is unfortunate as the optimiser isnt handling lookahead 6713 * properly currently. 6714 * 6715 */ 6716 while ((OP(first) == OPEN && (sawopen = 1)) || 6717 /* An OR of *one* alternative - should not happen now. */ 6718 (OP(first) == BRANCH && OP(first_next) != BRANCH) || 6719 /* for now we can't handle lookbehind IFMATCH*/ 6720 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) || 6721 (OP(first) == PLUS) || 6722 (OP(first) == MINMOD) || 6723 /* An {n,m} with n>0 */ 6724 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || 6725 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) 6726 { 6727 /* 6728 * the only op that could be a regnode is PLUS, all the rest 6729 * will be regnode_1 or regnode_2. 6730 * 6731 * (yves doesn't think this is true) 6732 */ 6733 if (OP(first) == PLUS) 6734 sawplus = 1; 6735 else { 6736 if (OP(first) == MINMOD) 6737 sawminmod = 1; 6738 first += regarglen[OP(first)]; 6739 } 6740 first = NEXTOPER(first); 6741 first_next= regnext(first); 6742 } 6743 6744 /* Starting-point info. */ 6745 again: 6746 DEBUG_PEEP("first:",first,0); 6747 /* Ignore EXACT as we deal with it later. */ 6748 if (PL_regkind[OP(first)] == EXACT) { 6749 if (OP(first) == EXACT) 6750 NOOP; /* Empty, get anchored substr later. */ 6751 else 6752 ri->regstclass = first; 6753 } 6754 #ifdef TRIE_STCLASS 6755 else if (PL_regkind[OP(first)] == TRIE && 6756 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) 6757 { 6758 regnode *trie_op; 6759 /* this can happen only on restudy */ 6760 if ( OP(first) == TRIE ) { 6761 struct regnode_1 *trieop = (struct regnode_1 *) 6762 PerlMemShared_calloc(1, sizeof(struct regnode_1)); 6763 StructCopy(first,trieop,struct regnode_1); 6764 trie_op=(regnode *)trieop; 6765 } else { 6766 struct regnode_charclass *trieop = (struct regnode_charclass *) 6767 PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); 6768 StructCopy(first,trieop,struct regnode_charclass); 6769 trie_op=(regnode *)trieop; 6770 } 6771 OP(trie_op)+=2; 6772 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); 6773 ri->regstclass = trie_op; 6774 } 6775 #endif 6776 else if (REGNODE_SIMPLE(OP(first))) 6777 ri->regstclass = first; 6778 else if (PL_regkind[OP(first)] == BOUND || 6779 PL_regkind[OP(first)] == NBOUND) 6780 ri->regstclass = first; 6781 else if (PL_regkind[OP(first)] == BOL) { 6782 r->intflags |= (OP(first) == MBOL 6783 ? PREGf_ANCH_MBOL 6784 : (OP(first) == SBOL 6785 ? PREGf_ANCH_SBOL 6786 : PREGf_ANCH_BOL)); 6787 first = NEXTOPER(first); 6788 goto again; 6789 } 6790 else if (OP(first) == GPOS) { 6791 r->intflags |= PREGf_ANCH_GPOS; 6792 first = NEXTOPER(first); 6793 goto again; 6794 } 6795 else if ((!sawopen || !RExC_sawback) && 6796 (OP(first) == STAR && 6797 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && 6798 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) 6799 { 6800 /* turn .* into ^.* with an implied $*=1 */ 6801 const int type = 6802 (OP(NEXTOPER(first)) == REG_ANY) 6803 ? PREGf_ANCH_MBOL 6804 : PREGf_ANCH_SBOL; 6805 r->intflags |= (type | PREGf_IMPLICIT); 6806 first = NEXTOPER(first); 6807 goto again; 6808 } 6809 if (sawplus && !sawminmod && !sawlookahead 6810 && (!sawopen || !RExC_sawback) 6811 && !pRExC_state->num_code_blocks) /* May examine pos and $& */ 6812 /* x+ must match at the 1st pos of run of x's */ 6813 r->intflags |= PREGf_SKIP; 6814 6815 /* Scan is after the zeroth branch, first is atomic matcher. */ 6816 #ifdef TRIE_STUDY_OPT 6817 DEBUG_PARSE_r( 6818 if (!restudied) 6819 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 6820 (IV)(first - scan + 1)) 6821 ); 6822 #else 6823 DEBUG_PARSE_r( 6824 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 6825 (IV)(first - scan + 1)) 6826 ); 6827 #endif 6828 6829 6830 /* 6831 * If there's something expensive in the r.e., find the 6832 * longest literal string that must appear and make it the 6833 * regmust. Resolve ties in favor of later strings, since 6834 * the regstart check works with the beginning of the r.e. 6835 * and avoiding duplication strengthens checking. Not a 6836 * strong reason, but sufficient in the absence of others. 6837 * [Now we resolve ties in favor of the earlier string if 6838 * it happens that c_offset_min has been invalidated, since the 6839 * earlier string may buy us something the later one won't.] 6840 */ 6841 6842 data.longest_fixed = newSVpvs(""); 6843 data.longest_float = newSVpvs(""); 6844 data.last_found = newSVpvs(""); 6845 data.longest = &(data.longest_fixed); 6846 ENTER_with_name("study_chunk"); 6847 SAVEFREESV(data.longest_fixed); 6848 SAVEFREESV(data.longest_float); 6849 SAVEFREESV(data.last_found); 6850 first = scan; 6851 if (!ri->regstclass) { 6852 ssc_init(pRExC_state, &ch_class); 6853 data.start_class = &ch_class; 6854 stclass_flag = SCF_DO_STCLASS_AND; 6855 } else /* XXXX Check for BOUND? */ 6856 stclass_flag = 0; 6857 data.last_closep = &last_close; 6858 6859 DEBUG_RExC_seen(); 6860 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, 6861 scan + RExC_size, /* Up to end */ 6862 &data, -1, 0, NULL, 6863 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag 6864 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), 6865 0); 6866 6867 6868 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); 6869 6870 6871 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) 6872 && data.last_start_min == 0 && data.last_end > 0 6873 && !RExC_seen_zerolen 6874 && !(RExC_seen & REG_VERBARG_SEEN) 6875 && !(RExC_seen & REG_GPOS_SEEN) 6876 ){ 6877 r->extflags |= RXf_CHECK_ALL; 6878 } 6879 scan_commit(pRExC_state, &data,&minlen,0); 6880 6881 longest_float_length = CHR_SVLEN(data.longest_float); 6882 6883 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */ 6884 && data.offset_fixed == data.offset_float_min 6885 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) 6886 && S_setup_longest (aTHX_ pRExC_state, 6887 data.longest_float, 6888 &(r->float_utf8), 6889 &(r->float_substr), 6890 &(r->float_end_shift), 6891 data.lookbehind_float, 6892 data.offset_float_min, 6893 data.minlen_float, 6894 longest_float_length, 6895 cBOOL(data.flags & SF_FL_BEFORE_EOL), 6896 cBOOL(data.flags & SF_FL_BEFORE_MEOL))) 6897 { 6898 r->float_min_offset = data.offset_float_min - data.lookbehind_float; 6899 r->float_max_offset = data.offset_float_max; 6900 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ 6901 r->float_max_offset -= data.lookbehind_float; 6902 SvREFCNT_inc_simple_void_NN(data.longest_float); 6903 } 6904 else { 6905 r->float_substr = r->float_utf8 = NULL; 6906 longest_float_length = 0; 6907 } 6908 6909 longest_fixed_length = CHR_SVLEN(data.longest_fixed); 6910 6911 if (S_setup_longest (aTHX_ pRExC_state, 6912 data.longest_fixed, 6913 &(r->anchored_utf8), 6914 &(r->anchored_substr), 6915 &(r->anchored_end_shift), 6916 data.lookbehind_fixed, 6917 data.offset_fixed, 6918 data.minlen_fixed, 6919 longest_fixed_length, 6920 cBOOL(data.flags & SF_FIX_BEFORE_EOL), 6921 cBOOL(data.flags & SF_FIX_BEFORE_MEOL))) 6922 { 6923 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; 6924 SvREFCNT_inc_simple_void_NN(data.longest_fixed); 6925 } 6926 else { 6927 r->anchored_substr = r->anchored_utf8 = NULL; 6928 longest_fixed_length = 0; 6929 } 6930 LEAVE_with_name("study_chunk"); 6931 6932 if (ri->regstclass 6933 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) 6934 ri->regstclass = NULL; 6935 6936 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) 6937 && stclass_flag 6938 && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) 6939 && !ssc_is_anything(data.start_class)) 6940 { 6941 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); 6942 6943 ssc_finalize(pRExC_state, data.start_class); 6944 6945 Newx(RExC_rxi->data->data[n], 1, regnode_ssc); 6946 StructCopy(data.start_class, 6947 (regnode_ssc*)RExC_rxi->data->data[n], 6948 regnode_ssc); 6949 ri->regstclass = (regnode*)RExC_rxi->data->data[n]; 6950 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 6951 DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); 6952 regprop(r, sv, (regnode*)data.start_class, NULL); 6953 PerlIO_printf(Perl_debug_log, 6954 "synthetic stclass \"%s\".\n", 6955 SvPVX_const(sv));}); 6956 data.start_class = NULL; 6957 } 6958 6959 /* A temporary algorithm prefers floated substr to fixed one to dig 6960 * more info. */ 6961 if (longest_fixed_length > longest_float_length) { 6962 r->substrs->check_ix = 0; 6963 r->check_end_shift = r->anchored_end_shift; 6964 r->check_substr = r->anchored_substr; 6965 r->check_utf8 = r->anchored_utf8; 6966 r->check_offset_min = r->check_offset_max = r->anchored_offset; 6967 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) 6968 r->intflags |= PREGf_NOSCAN; 6969 } 6970 else { 6971 r->substrs->check_ix = 1; 6972 r->check_end_shift = r->float_end_shift; 6973 r->check_substr = r->float_substr; 6974 r->check_utf8 = r->float_utf8; 6975 r->check_offset_min = r->float_min_offset; 6976 r->check_offset_max = r->float_max_offset; 6977 } 6978 if ((r->check_substr || r->check_utf8) ) { 6979 r->extflags |= RXf_USE_INTUIT; 6980 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) 6981 r->extflags |= RXf_INTUIT_TAIL; 6982 } 6983 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; 6984 6985 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) 6986 if ( (STRLEN)minlen < longest_float_length ) 6987 minlen= longest_float_length; 6988 if ( (STRLEN)minlen < longest_fixed_length ) 6989 minlen= longest_fixed_length; 6990 */ 6991 } 6992 else { 6993 /* Several toplevels. Best we can is to set minlen. */ 6994 SSize_t fake; 6995 regnode_ssc ch_class; 6996 SSize_t last_close = 0; 6997 6998 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); 6999 7000 scan = ri->program + 1; 7001 ssc_init(pRExC_state, &ch_class); 7002 data.start_class = &ch_class; 7003 data.last_closep = &last_close; 7004 7005 DEBUG_RExC_seen(); 7006 minlen = study_chunk(pRExC_state, 7007 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, 7008 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied 7009 ? SCF_TRIE_DOING_RESTUDY 7010 : 0), 7011 0); 7012 7013 CHECK_RESTUDY_GOTO_butfirst(NOOP); 7014 7015 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 7016 = r->float_substr = r->float_utf8 = NULL; 7017 7018 if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) 7019 && ! ssc_is_anything(data.start_class)) 7020 { 7021 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); 7022 7023 ssc_finalize(pRExC_state, data.start_class); 7024 7025 Newx(RExC_rxi->data->data[n], 1, regnode_ssc); 7026 StructCopy(data.start_class, 7027 (regnode_ssc*)RExC_rxi->data->data[n], 7028 regnode_ssc); 7029 ri->regstclass = (regnode*)RExC_rxi->data->data[n]; 7030 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 7031 DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); 7032 regprop(r, sv, (regnode*)data.start_class, NULL); 7033 PerlIO_printf(Perl_debug_log, 7034 "synthetic stclass \"%s\".\n", 7035 SvPVX_const(sv));}); 7036 data.start_class = NULL; 7037 } 7038 } 7039 7040 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { 7041 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; 7042 r->maxlen = REG_INFTY; 7043 } 7044 else { 7045 r->maxlen = RExC_maxlen; 7046 } 7047 7048 /* Guard against an embedded (?=) or (?<=) with a longer minlen than 7049 the "real" pattern. */ 7050 DEBUG_OPTIMISE_r({ 7051 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", 7052 (IV)minlen, (IV)r->minlen, RExC_maxlen); 7053 }); 7054 r->minlenret = minlen; 7055 if (r->minlen < minlen) 7056 r->minlen = minlen; 7057 7058 if (RExC_seen & REG_GPOS_SEEN) 7059 r->intflags |= PREGf_GPOS_SEEN; 7060 if (RExC_seen & REG_LOOKBEHIND_SEEN) 7061 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the 7062 lookbehind */ 7063 if (pRExC_state->num_code_blocks) 7064 r->extflags |= RXf_EVAL_SEEN; 7065 if (RExC_seen & REG_CANY_SEEN) 7066 r->intflags |= PREGf_CANY_SEEN; 7067 if (RExC_seen & REG_VERBARG_SEEN) 7068 { 7069 r->intflags |= PREGf_VERBARG_SEEN; 7070 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ 7071 } 7072 if (RExC_seen & REG_CUTGROUP_SEEN) 7073 r->intflags |= PREGf_CUTGROUP_SEEN; 7074 if (pm_flags & PMf_USE_RE_EVAL) 7075 r->intflags |= PREGf_USE_RE_EVAL; 7076 if (RExC_paren_names) 7077 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); 7078 else 7079 RXp_PAREN_NAMES(r) = NULL; 7080 7081 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED 7082 * so it can be used in pp.c */ 7083 if (r->intflags & PREGf_ANCH) 7084 r->extflags |= RXf_IS_ANCHORED; 7085 7086 7087 { 7088 /* this is used to identify "special" patterns that might result 7089 * in Perl NOT calling the regex engine and instead doing the match "itself", 7090 * particularly special cases in split//. By having the regex compiler 7091 * do this pattern matching at a regop level (instead of by inspecting the pattern) 7092 * we avoid weird issues with equivalent patterns resulting in different behavior, 7093 * AND we allow non Perl engines to get the same optimizations by the setting the 7094 * flags appropriately - Yves */ 7095 regnode *first = ri->program + 1; 7096 U8 fop = OP(first); 7097 regnode *next = NEXTOPER(first); 7098 U8 nop = OP(next); 7099 7100 if (PL_regkind[fop] == NOTHING && nop == END) 7101 r->extflags |= RXf_NULL; 7102 else if (PL_regkind[fop] == BOL && nop == END) 7103 r->extflags |= RXf_START_ONLY; 7104 else if (fop == PLUS 7105 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE 7106 && OP(regnext(first)) == END) 7107 r->extflags |= RXf_WHITE; 7108 else if ( r->extflags & RXf_SPLIT 7109 && fop == EXACT 7110 && STR_LEN(first) == 1 7111 && *(STRING(first)) == ' ' 7112 && OP(regnext(first)) == END ) 7113 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 7114 7115 } 7116 7117 if (RExC_contains_locale) { 7118 RXp_EXTFLAGS(r) |= RXf_TAINTED; 7119 } 7120 7121 #ifdef DEBUGGING 7122 if (RExC_paren_names) { 7123 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); 7124 ri->data->data[ri->name_list_idx] 7125 = (void*)SvREFCNT_inc(RExC_paren_name_list); 7126 } else 7127 #endif 7128 ri->name_list_idx = 0; 7129 7130 if (RExC_recurse_count) { 7131 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { 7132 const regnode *scan = RExC_recurse[RExC_recurse_count-1]; 7133 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); 7134 } 7135 } 7136 Newxz(r->offs, RExC_npar, regexp_paren_pair); 7137 /* assume we don't need to swap parens around before we match */ 7138 7139 DEBUG_DUMP_r({ 7140 DEBUG_RExC_seen(); 7141 PerlIO_printf(Perl_debug_log,"Final program:\n"); 7142 regdump(r); 7143 }); 7144 #ifdef RE_TRACK_PATTERN_OFFSETS 7145 DEBUG_OFFSETS_r(if (ri->u.offsets) { 7146 const STRLEN len = ri->u.offsets[0]; 7147 STRLEN i; 7148 GET_RE_DEBUG_FLAGS_DECL; 7149 PerlIO_printf(Perl_debug_log, 7150 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); 7151 for (i = 1; i <= len; i++) { 7152 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) 7153 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", 7154 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); 7155 } 7156 PerlIO_printf(Perl_debug_log, "\n"); 7157 }); 7158 #endif 7159 7160 #ifdef USE_ITHREADS 7161 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated 7162 * by setting the regexp SV to readonly-only instead. If the 7163 * pattern's been recompiled, the USEDness should remain. */ 7164 if (old_re && SvREADONLY(old_re)) 7165 SvREADONLY_on(rx); 7166 #endif 7167 return rx; 7168 } 7169 7170 7171 SV* 7172 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, 7173 const U32 flags) 7174 { 7175 PERL_ARGS_ASSERT_REG_NAMED_BUFF; 7176 7177 PERL_UNUSED_ARG(value); 7178 7179 if (flags & RXapif_FETCH) { 7180 return reg_named_buff_fetch(rx, key, flags); 7181 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { 7182 Perl_croak_no_modify(); 7183 return NULL; 7184 } else if (flags & RXapif_EXISTS) { 7185 return reg_named_buff_exists(rx, key, flags) 7186 ? &PL_sv_yes 7187 : &PL_sv_no; 7188 } else if (flags & RXapif_REGNAMES) { 7189 return reg_named_buff_all(rx, flags); 7190 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { 7191 return reg_named_buff_scalar(rx, flags); 7192 } else { 7193 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); 7194 return NULL; 7195 } 7196 } 7197 7198 SV* 7199 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, 7200 const U32 flags) 7201 { 7202 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; 7203 PERL_UNUSED_ARG(lastkey); 7204 7205 if (flags & RXapif_FIRSTKEY) 7206 return reg_named_buff_firstkey(rx, flags); 7207 else if (flags & RXapif_NEXTKEY) 7208 return reg_named_buff_nextkey(rx, flags); 7209 else { 7210 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", 7211 (int)flags); 7212 return NULL; 7213 } 7214 } 7215 7216 SV* 7217 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, 7218 const U32 flags) 7219 { 7220 AV *retarray = NULL; 7221 SV *ret; 7222 struct regexp *const rx = ReANY(r); 7223 7224 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; 7225 7226 if (flags & RXapif_ALL) 7227 retarray=newAV(); 7228 7229 if (rx && RXp_PAREN_NAMES(rx)) { 7230 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); 7231 if (he_str) { 7232 IV i; 7233 SV* sv_dat=HeVAL(he_str); 7234 I32 *nums=(I32*)SvPVX(sv_dat); 7235 for ( i=0; i<SvIVX(sv_dat); i++ ) { 7236 if ((I32)(rx->nparens) >= nums[i] 7237 && rx->offs[nums[i]].start != -1 7238 && rx->offs[nums[i]].end != -1) 7239 { 7240 ret = newSVpvs(""); 7241 CALLREG_NUMBUF_FETCH(r,nums[i],ret); 7242 if (!retarray) 7243 return ret; 7244 } else { 7245 if (retarray) 7246 ret = newSVsv(&PL_sv_undef); 7247 } 7248 if (retarray) 7249 av_push(retarray, ret); 7250 } 7251 if (retarray) 7252 return newRV_noinc(MUTABLE_SV(retarray)); 7253 } 7254 } 7255 return NULL; 7256 } 7257 7258 bool 7259 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, 7260 const U32 flags) 7261 { 7262 struct regexp *const rx = ReANY(r); 7263 7264 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; 7265 7266 if (rx && RXp_PAREN_NAMES(rx)) { 7267 if (flags & RXapif_ALL) { 7268 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); 7269 } else { 7270 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); 7271 if (sv) { 7272 SvREFCNT_dec_NN(sv); 7273 return TRUE; 7274 } else { 7275 return FALSE; 7276 } 7277 } 7278 } else { 7279 return FALSE; 7280 } 7281 } 7282 7283 SV* 7284 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) 7285 { 7286 struct regexp *const rx = ReANY(r); 7287 7288 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; 7289 7290 if ( rx && RXp_PAREN_NAMES(rx) ) { 7291 (void)hv_iterinit(RXp_PAREN_NAMES(rx)); 7292 7293 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); 7294 } else { 7295 return FALSE; 7296 } 7297 } 7298 7299 SV* 7300 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) 7301 { 7302 struct regexp *const rx = ReANY(r); 7303 GET_RE_DEBUG_FLAGS_DECL; 7304 7305 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; 7306 7307 if (rx && RXp_PAREN_NAMES(rx)) { 7308 HV *hv = RXp_PAREN_NAMES(rx); 7309 HE *temphe; 7310 while ( (temphe = hv_iternext_flags(hv,0)) ) { 7311 IV i; 7312 IV parno = 0; 7313 SV* sv_dat = HeVAL(temphe); 7314 I32 *nums = (I32*)SvPVX(sv_dat); 7315 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 7316 if ((I32)(rx->lastparen) >= nums[i] && 7317 rx->offs[nums[i]].start != -1 && 7318 rx->offs[nums[i]].end != -1) 7319 { 7320 parno = nums[i]; 7321 break; 7322 } 7323 } 7324 if (parno || flags & RXapif_ALL) { 7325 return newSVhek(HeKEY_hek(temphe)); 7326 } 7327 } 7328 } 7329 return NULL; 7330 } 7331 7332 SV* 7333 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) 7334 { 7335 SV *ret; 7336 AV *av; 7337 SSize_t length; 7338 struct regexp *const rx = ReANY(r); 7339 7340 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; 7341 7342 if (rx && RXp_PAREN_NAMES(rx)) { 7343 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { 7344 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); 7345 } else if (flags & RXapif_ONE) { 7346 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); 7347 av = MUTABLE_AV(SvRV(ret)); 7348 length = av_tindex(av); 7349 SvREFCNT_dec_NN(ret); 7350 return newSViv(length + 1); 7351 } else { 7352 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", 7353 (int)flags); 7354 return NULL; 7355 } 7356 } 7357 return &PL_sv_undef; 7358 } 7359 7360 SV* 7361 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) 7362 { 7363 struct regexp *const rx = ReANY(r); 7364 AV *av = newAV(); 7365 7366 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; 7367 7368 if (rx && RXp_PAREN_NAMES(rx)) { 7369 HV *hv= RXp_PAREN_NAMES(rx); 7370 HE *temphe; 7371 (void)hv_iterinit(hv); 7372 while ( (temphe = hv_iternext_flags(hv,0)) ) { 7373 IV i; 7374 IV parno = 0; 7375 SV* sv_dat = HeVAL(temphe); 7376 I32 *nums = (I32*)SvPVX(sv_dat); 7377 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 7378 if ((I32)(rx->lastparen) >= nums[i] && 7379 rx->offs[nums[i]].start != -1 && 7380 rx->offs[nums[i]].end != -1) 7381 { 7382 parno = nums[i]; 7383 break; 7384 } 7385 } 7386 if (parno || flags & RXapif_ALL) { 7387 av_push(av, newSVhek(HeKEY_hek(temphe))); 7388 } 7389 } 7390 } 7391 7392 return newRV_noinc(MUTABLE_SV(av)); 7393 } 7394 7395 void 7396 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, 7397 SV * const sv) 7398 { 7399 struct regexp *const rx = ReANY(r); 7400 char *s = NULL; 7401 SSize_t i = 0; 7402 SSize_t s1, t1; 7403 I32 n = paren; 7404 7405 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; 7406 7407 if ( n == RX_BUFF_IDX_CARET_PREMATCH 7408 || n == RX_BUFF_IDX_CARET_FULLMATCH 7409 || n == RX_BUFF_IDX_CARET_POSTMATCH 7410 ) 7411 { 7412 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); 7413 if (!keepcopy) { 7414 /* on something like 7415 * $r = qr/.../; 7416 * /$qr/p; 7417 * the KEEPCOPY is set on the PMOP rather than the regex */ 7418 if (PL_curpm && r == PM_GETRE(PL_curpm)) 7419 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); 7420 } 7421 if (!keepcopy) 7422 goto ret_undef; 7423 } 7424 7425 if (!rx->subbeg) 7426 goto ret_undef; 7427 7428 if (n == RX_BUFF_IDX_CARET_FULLMATCH) 7429 /* no need to distinguish between them any more */ 7430 n = RX_BUFF_IDX_FULLMATCH; 7431 7432 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) 7433 && rx->offs[0].start != -1) 7434 { 7435 /* $`, ${^PREMATCH} */ 7436 i = rx->offs[0].start; 7437 s = rx->subbeg; 7438 } 7439 else 7440 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) 7441 && rx->offs[0].end != -1) 7442 { 7443 /* $', ${^POSTMATCH} */ 7444 s = rx->subbeg - rx->suboffset + rx->offs[0].end; 7445 i = rx->sublen + rx->suboffset - rx->offs[0].end; 7446 } 7447 else 7448 if ( 0 <= n && n <= (I32)rx->nparens && 7449 (s1 = rx->offs[n].start) != -1 && 7450 (t1 = rx->offs[n].end) != -1) 7451 { 7452 /* $&, ${^MATCH}, $1 ... */ 7453 i = t1 - s1; 7454 s = rx->subbeg + s1 - rx->suboffset; 7455 } else { 7456 goto ret_undef; 7457 } 7458 7459 assert(s >= rx->subbeg); 7460 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); 7461 if (i >= 0) { 7462 #ifdef NO_TAINT_SUPPORT 7463 sv_setpvn(sv, s, i); 7464 #else 7465 const int oldtainted = TAINT_get; 7466 TAINT_NOT; 7467 sv_setpvn(sv, s, i); 7468 TAINT_set(oldtainted); 7469 #endif 7470 if ( (rx->intflags & PREGf_CANY_SEEN) 7471 ? (RXp_MATCH_UTF8(rx) 7472 && (!i || is_utf8_string((U8*)s, i))) 7473 : (RXp_MATCH_UTF8(rx)) ) 7474 { 7475 SvUTF8_on(sv); 7476 } 7477 else 7478 SvUTF8_off(sv); 7479 if (TAINTING_get) { 7480 if (RXp_MATCH_TAINTED(rx)) { 7481 if (SvTYPE(sv) >= SVt_PVMG) { 7482 MAGIC* const mg = SvMAGIC(sv); 7483 MAGIC* mgt; 7484 TAINT; 7485 SvMAGIC_set(sv, mg->mg_moremagic); 7486 SvTAINT(sv); 7487 if ((mgt = SvMAGIC(sv))) { 7488 mg->mg_moremagic = mgt; 7489 SvMAGIC_set(sv, mg); 7490 } 7491 } else { 7492 TAINT; 7493 SvTAINT(sv); 7494 } 7495 } else 7496 SvTAINTED_off(sv); 7497 } 7498 } else { 7499 ret_undef: 7500 sv_setsv(sv,&PL_sv_undef); 7501 return; 7502 } 7503 } 7504 7505 void 7506 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, 7507 SV const * const value) 7508 { 7509 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; 7510 7511 PERL_UNUSED_ARG(rx); 7512 PERL_UNUSED_ARG(paren); 7513 PERL_UNUSED_ARG(value); 7514 7515 if (!PL_localizing) 7516 Perl_croak_no_modify(); 7517 } 7518 7519 I32 7520 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, 7521 const I32 paren) 7522 { 7523 struct regexp *const rx = ReANY(r); 7524 I32 i; 7525 I32 s1, t1; 7526 7527 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; 7528 7529 if ( paren == RX_BUFF_IDX_CARET_PREMATCH 7530 || paren == RX_BUFF_IDX_CARET_FULLMATCH 7531 || paren == RX_BUFF_IDX_CARET_POSTMATCH 7532 ) 7533 { 7534 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); 7535 if (!keepcopy) { 7536 /* on something like 7537 * $r = qr/.../; 7538 * /$qr/p; 7539 * the KEEPCOPY is set on the PMOP rather than the regex */ 7540 if (PL_curpm && r == PM_GETRE(PL_curpm)) 7541 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); 7542 } 7543 if (!keepcopy) 7544 goto warn_undef; 7545 } 7546 7547 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */ 7548 switch (paren) { 7549 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ 7550 case RX_BUFF_IDX_PREMATCH: /* $` */ 7551 if (rx->offs[0].start != -1) { 7552 i = rx->offs[0].start; 7553 if (i > 0) { 7554 s1 = 0; 7555 t1 = i; 7556 goto getlen; 7557 } 7558 } 7559 return 0; 7560 7561 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ 7562 case RX_BUFF_IDX_POSTMATCH: /* $' */ 7563 if (rx->offs[0].end != -1) { 7564 i = rx->sublen - rx->offs[0].end; 7565 if (i > 0) { 7566 s1 = rx->offs[0].end; 7567 t1 = rx->sublen; 7568 goto getlen; 7569 } 7570 } 7571 return 0; 7572 7573 default: /* $& / ${^MATCH}, $1, $2, ... */ 7574 if (paren <= (I32)rx->nparens && 7575 (s1 = rx->offs[paren].start) != -1 && 7576 (t1 = rx->offs[paren].end) != -1) 7577 { 7578 i = t1 - s1; 7579 goto getlen; 7580 } else { 7581 warn_undef: 7582 if (ckWARN(WARN_UNINITIALIZED)) 7583 report_uninit((const SV *)sv); 7584 return 0; 7585 } 7586 } 7587 getlen: 7588 if (i > 0 && RXp_MATCH_UTF8(rx)) { 7589 const char * const s = rx->subbeg - rx->suboffset + s1; 7590 const U8 *ep; 7591 STRLEN el; 7592 7593 i = t1 - s1; 7594 if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) 7595 i = el; 7596 } 7597 return i; 7598 } 7599 7600 SV* 7601 Perl_reg_qr_package(pTHX_ REGEXP * const rx) 7602 { 7603 PERL_ARGS_ASSERT_REG_QR_PACKAGE; 7604 PERL_UNUSED_ARG(rx); 7605 if (0) 7606 return NULL; 7607 else 7608 return newSVpvs("Regexp"); 7609 } 7610 7611 /* Scans the name of a named buffer from the pattern. 7612 * If flags is REG_RSN_RETURN_NULL returns null. 7613 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name 7614 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding 7615 * to the parsed name as looked up in the RExC_paren_names hash. 7616 * If there is an error throws a vFAIL().. type exception. 7617 */ 7618 7619 #define REG_RSN_RETURN_NULL 0 7620 #define REG_RSN_RETURN_NAME 1 7621 #define REG_RSN_RETURN_DATA 2 7622 7623 STATIC SV* 7624 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) 7625 { 7626 char *name_start = RExC_parse; 7627 7628 PERL_ARGS_ASSERT_REG_SCAN_NAME; 7629 7630 assert (RExC_parse <= RExC_end); 7631 if (RExC_parse == RExC_end) NOOP; 7632 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { 7633 /* skip IDFIRST by using do...while */ 7634 if (UTF) 7635 do { 7636 RExC_parse += UTF8SKIP(RExC_parse); 7637 } while (isWORDCHAR_utf8((U8*)RExC_parse)); 7638 else 7639 do { 7640 RExC_parse++; 7641 } while (isWORDCHAR(*RExC_parse)); 7642 } else { 7643 RExC_parse++; /* so the <- from the vFAIL is after the offending 7644 character */ 7645 vFAIL("Group name must start with a non-digit word character"); 7646 } 7647 if ( flags ) { 7648 SV* sv_name 7649 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), 7650 SVs_TEMP | (UTF ? SVf_UTF8 : 0)); 7651 if ( flags == REG_RSN_RETURN_NAME) 7652 return sv_name; 7653 else if (flags==REG_RSN_RETURN_DATA) { 7654 HE *he_str = NULL; 7655 SV *sv_dat = NULL; 7656 if ( ! sv_name ) /* should not happen*/ 7657 Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); 7658 if (RExC_paren_names) 7659 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); 7660 if ( he_str ) 7661 sv_dat = HeVAL(he_str); 7662 if ( ! sv_dat ) 7663 vFAIL("Reference to nonexistent named group"); 7664 return sv_dat; 7665 } 7666 else { 7667 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", 7668 (unsigned long) flags); 7669 } 7670 assert(0); /* NOT REACHED */ 7671 } 7672 return NULL; 7673 } 7674 7675 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ 7676 int rem=(int)(RExC_end - RExC_parse); \ 7677 int cut; \ 7678 int num; \ 7679 int iscut=0; \ 7680 if (rem>10) { \ 7681 rem=10; \ 7682 iscut=1; \ 7683 } \ 7684 cut=10-rem; \ 7685 if (RExC_lastparse!=RExC_parse) \ 7686 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \ 7687 rem, RExC_parse, \ 7688 cut + 4, \ 7689 iscut ? "..." : "<" \ 7690 ); \ 7691 else \ 7692 PerlIO_printf(Perl_debug_log,"%16s",""); \ 7693 \ 7694 if (SIZE_ONLY) \ 7695 num = RExC_size + 1; \ 7696 else \ 7697 num=REG_NODE_NUM(RExC_emit); \ 7698 if (RExC_lastnum!=num) \ 7699 PerlIO_printf(Perl_debug_log,"|%4d",num); \ 7700 else \ 7701 PerlIO_printf(Perl_debug_log,"|%4s",""); \ 7702 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ 7703 (int)((depth*2)), "", \ 7704 (funcname) \ 7705 ); \ 7706 RExC_lastnum=num; \ 7707 RExC_lastparse=RExC_parse; \ 7708 }) 7709 7710 7711 7712 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ 7713 DEBUG_PARSE_MSG((funcname)); \ 7714 PerlIO_printf(Perl_debug_log,"%4s","\n"); \ 7715 }) 7716 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ 7717 DEBUG_PARSE_MSG((funcname)); \ 7718 PerlIO_printf(Perl_debug_log,fmt "\n",args); \ 7719 }) 7720 7721 /* This section of code defines the inversion list object and its methods. The 7722 * interfaces are highly subject to change, so as much as possible is static to 7723 * this file. An inversion list is here implemented as a malloc'd C UV array 7724 * as an SVt_INVLIST scalar. 7725 * 7726 * An inversion list for Unicode is an array of code points, sorted by ordinal 7727 * number. The zeroth element is the first code point in the list. The 1th 7728 * element is the first element beyond that not in the list. In other words, 7729 * the first range is 7730 * invlist[0]..(invlist[1]-1) 7731 * The other ranges follow. Thus every element whose index is divisible by two 7732 * marks the beginning of a range that is in the list, and every element not 7733 * divisible by two marks the beginning of a range not in the list. A single 7734 * element inversion list that contains the single code point N generally 7735 * consists of two elements 7736 * invlist[0] == N 7737 * invlist[1] == N+1 7738 * (The exception is when N is the highest representable value on the 7739 * machine, in which case the list containing just it would be a single 7740 * element, itself. By extension, if the last range in the list extends to 7741 * infinity, then the first element of that range will be in the inversion list 7742 * at a position that is divisible by two, and is the final element in the 7743 * list.) 7744 * Taking the complement (inverting) an inversion list is quite simple, if the 7745 * first element is 0, remove it; otherwise add a 0 element at the beginning. 7746 * This implementation reserves an element at the beginning of each inversion 7747 * list to always contain 0; there is an additional flag in the header which 7748 * indicates if the list begins at the 0, or is offset to begin at the next 7749 * element. 7750 * 7751 * More about inversion lists can be found in "Unicode Demystified" 7752 * Chapter 13 by Richard Gillam, published by Addison-Wesley. 7753 * More will be coming when functionality is added later. 7754 * 7755 * The inversion list data structure is currently implemented as an SV pointing 7756 * to an array of UVs that the SV thinks are bytes. This allows us to have an 7757 * array of UV whose memory management is automatically handled by the existing 7758 * facilities for SV's. 7759 * 7760 * Some of the methods should always be private to the implementation, and some 7761 * should eventually be made public */ 7762 7763 /* The header definitions are in F<inline_invlist.c> */ 7764 7765 PERL_STATIC_INLINE UV* 7766 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) 7767 { 7768 /* Returns a pointer to the first element in the inversion list's array. 7769 * This is called upon initialization of an inversion list. Where the 7770 * array begins depends on whether the list has the code point U+0000 in it 7771 * or not. The other parameter tells it whether the code that follows this 7772 * call is about to put a 0 in the inversion list or not. The first 7773 * element is either the element reserved for 0, if TRUE, or the element 7774 * after it, if FALSE */ 7775 7776 bool* offset = get_invlist_offset_addr(invlist); 7777 UV* zero_addr = (UV *) SvPVX(invlist); 7778 7779 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; 7780 7781 /* Must be empty */ 7782 assert(! _invlist_len(invlist)); 7783 7784 *zero_addr = 0; 7785 7786 /* 1^1 = 0; 1^0 = 1 */ 7787 *offset = 1 ^ will_have_0; 7788 return zero_addr + *offset; 7789 } 7790 7791 PERL_STATIC_INLINE UV* 7792 S_invlist_array(pTHX_ SV* const invlist) 7793 { 7794 /* Returns the pointer to the inversion list's array. Every time the 7795 * length changes, this needs to be called in case malloc or realloc moved 7796 * it */ 7797 7798 PERL_ARGS_ASSERT_INVLIST_ARRAY; 7799 7800 /* Must not be empty. If these fail, you probably didn't check for <len> 7801 * being non-zero before trying to get the array */ 7802 assert(_invlist_len(invlist)); 7803 7804 /* The very first element always contains zero, The array begins either 7805 * there, or if the inversion list is offset, at the element after it. 7806 * The offset header field determines which; it contains 0 or 1 to indicate 7807 * how much additionally to add */ 7808 assert(0 == *(SvPVX(invlist))); 7809 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist)); 7810 } 7811 7812 PERL_STATIC_INLINE void 7813 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) 7814 { 7815 /* Sets the current number of elements stored in the inversion list. 7816 * Updates SvCUR correspondingly */ 7817 7818 PERL_ARGS_ASSERT_INVLIST_SET_LEN; 7819 7820 assert(SvTYPE(invlist) == SVt_INVLIST); 7821 7822 SvCUR_set(invlist, 7823 (len == 0) 7824 ? 0 7825 : TO_INTERNAL_SIZE(len + offset)); 7826 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); 7827 } 7828 7829 PERL_STATIC_INLINE IV* 7830 S_get_invlist_previous_index_addr(pTHX_ SV* invlist) 7831 { 7832 /* Return the address of the IV that is reserved to hold the cached index 7833 * */ 7834 7835 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; 7836 7837 assert(SvTYPE(invlist) == SVt_INVLIST); 7838 7839 return &(((XINVLIST*) SvANY(invlist))->prev_index); 7840 } 7841 7842 PERL_STATIC_INLINE IV 7843 S_invlist_previous_index(pTHX_ SV* const invlist) 7844 { 7845 /* Returns cached index of previous search */ 7846 7847 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; 7848 7849 return *get_invlist_previous_index_addr(invlist); 7850 } 7851 7852 PERL_STATIC_INLINE void 7853 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index) 7854 { 7855 /* Caches <index> for later retrieval */ 7856 7857 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; 7858 7859 assert(index == 0 || index < (int) _invlist_len(invlist)); 7860 7861 *get_invlist_previous_index_addr(invlist) = index; 7862 } 7863 7864 PERL_STATIC_INLINE UV 7865 S_invlist_max(pTHX_ SV* const invlist) 7866 { 7867 /* Returns the maximum number of elements storable in the inversion list's 7868 * array, without having to realloc() */ 7869 7870 PERL_ARGS_ASSERT_INVLIST_MAX; 7871 7872 assert(SvTYPE(invlist) == SVt_INVLIST); 7873 7874 /* Assumes worst case, in which the 0 element is not counted in the 7875 * inversion list, so subtracts 1 for that */ 7876 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ 7877 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 7878 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; 7879 } 7880 7881 #ifndef PERL_IN_XSUB_RE 7882 SV* 7883 Perl__new_invlist(pTHX_ IV initial_size) 7884 { 7885 7886 /* Return a pointer to a newly constructed inversion list, with enough 7887 * space to store 'initial_size' elements. If that number is negative, a 7888 * system default is used instead */ 7889 7890 SV* new_list; 7891 7892 if (initial_size < 0) { 7893 initial_size = 10; 7894 } 7895 7896 /* Allocate the initial space */ 7897 new_list = newSV_type(SVt_INVLIST); 7898 7899 /* First 1 is in case the zero element isn't in the list; second 1 is for 7900 * trailing NUL */ 7901 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1); 7902 invlist_set_len(new_list, 0, 0); 7903 7904 /* Force iterinit() to be used to get iteration to work */ 7905 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX; 7906 7907 *get_invlist_previous_index_addr(new_list) = 0; 7908 7909 return new_list; 7910 } 7911 7912 SV* 7913 Perl__new_invlist_C_array(pTHX_ const UV* const list) 7914 { 7915 /* Return a pointer to a newly constructed inversion list, initialized to 7916 * point to <list>, which has to be in the exact correct inversion list 7917 * form, including internal fields. Thus this is a dangerous routine that 7918 * should not be used in the wrong hands. The passed in 'list' contains 7919 * several header fields at the beginning that are not part of the 7920 * inversion list body proper */ 7921 7922 const STRLEN length = (STRLEN) list[0]; 7923 const UV version_id = list[1]; 7924 const bool offset = cBOOL(list[2]); 7925 #define HEADER_LENGTH 3 7926 /* If any of the above changes in any way, you must change HEADER_LENGTH 7927 * (if appropriate) and regenerate INVLIST_VERSION_ID by running 7928 * perl -E 'say int(rand 2**31-1)' 7929 */ 7930 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and 7931 data structure type, so that one being 7932 passed in can be validated to be an 7933 inversion list of the correct vintage. 7934 */ 7935 7936 SV* invlist = newSV_type(SVt_INVLIST); 7937 7938 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; 7939 7940 if (version_id != INVLIST_VERSION_ID) { 7941 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); 7942 } 7943 7944 /* The generated array passed in includes header elements that aren't part 7945 * of the list proper, so start it just after them */ 7946 SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); 7947 7948 SvLEN_set(invlist, 0); /* Means we own the contents, and the system 7949 shouldn't touch it */ 7950 7951 *(get_invlist_offset_addr(invlist)) = offset; 7952 7953 /* The 'length' passed to us is the physical number of elements in the 7954 * inversion list. But if there is an offset the logical number is one 7955 * less than that */ 7956 invlist_set_len(invlist, length - offset, offset); 7957 7958 invlist_set_previous_index(invlist, 0); 7959 7960 /* Initialize the iteration pointer. */ 7961 invlist_iterfinish(invlist); 7962 7963 SvREADONLY_on(invlist); 7964 7965 return invlist; 7966 } 7967 #endif /* ifndef PERL_IN_XSUB_RE */ 7968 7969 STATIC void 7970 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) 7971 { 7972 /* Grow the maximum size of an inversion list */ 7973 7974 PERL_ARGS_ASSERT_INVLIST_EXTEND; 7975 7976 assert(SvTYPE(invlist) == SVt_INVLIST); 7977 7978 /* Add one to account for the zero element at the beginning which may not 7979 * be counted by the calling parameters */ 7980 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); 7981 } 7982 7983 PERL_STATIC_INLINE void 7984 S_invlist_trim(pTHX_ SV* const invlist) 7985 { 7986 PERL_ARGS_ASSERT_INVLIST_TRIM; 7987 7988 assert(SvTYPE(invlist) == SVt_INVLIST); 7989 7990 /* Change the length of the inversion list to how many entries it currently 7991 * has */ 7992 SvPV_shrink_to_cur((SV *) invlist); 7993 } 7994 7995 STATIC void 7996 S__append_range_to_invlist(pTHX_ SV* const invlist, 7997 const UV start, const UV end) 7998 { 7999 /* Subject to change or removal. Append the range from 'start' to 'end' at 8000 * the end of the inversion list. The range must be above any existing 8001 * ones. */ 8002 8003 UV* array; 8004 UV max = invlist_max(invlist); 8005 UV len = _invlist_len(invlist); 8006 bool offset; 8007 8008 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; 8009 8010 if (len == 0) { /* Empty lists must be initialized */ 8011 offset = start != 0; 8012 array = _invlist_array_init(invlist, ! offset); 8013 } 8014 else { 8015 /* Here, the existing list is non-empty. The current max entry in the 8016 * list is generally the first value not in the set, except when the 8017 * set extends to the end of permissible values, in which case it is 8018 * the first entry in that final set, and so this call is an attempt to 8019 * append out-of-order */ 8020 8021 UV final_element = len - 1; 8022 array = invlist_array(invlist); 8023 if (array[final_element] > start 8024 || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) 8025 { 8026 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", 8027 array[final_element], start, 8028 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); 8029 } 8030 8031 /* Here, it is a legal append. If the new range begins with the first 8032 * value not in the set, it is extending the set, so the new first 8033 * value not in the set is one greater than the newly extended range. 8034 * */ 8035 offset = *get_invlist_offset_addr(invlist); 8036 if (array[final_element] == start) { 8037 if (end != UV_MAX) { 8038 array[final_element] = end + 1; 8039 } 8040 else { 8041 /* But if the end is the maximum representable on the machine, 8042 * just let the range that this would extend to have no end */ 8043 invlist_set_len(invlist, len - 1, offset); 8044 } 8045 return; 8046 } 8047 } 8048 8049 /* Here the new range doesn't extend any existing set. Add it */ 8050 8051 len += 2; /* Includes an element each for the start and end of range */ 8052 8053 /* If wll overflow the existing space, extend, which may cause the array to 8054 * be moved */ 8055 if (max < len) { 8056 invlist_extend(invlist, len); 8057 8058 /* Have to set len here to avoid assert failure in invlist_array() */ 8059 invlist_set_len(invlist, len, offset); 8060 8061 array = invlist_array(invlist); 8062 } 8063 else { 8064 invlist_set_len(invlist, len, offset); 8065 } 8066 8067 /* The next item on the list starts the range, the one after that is 8068 * one past the new range. */ 8069 array[len - 2] = start; 8070 if (end != UV_MAX) { 8071 array[len - 1] = end + 1; 8072 } 8073 else { 8074 /* But if the end is the maximum representable on the machine, just let 8075 * the range have no end */ 8076 invlist_set_len(invlist, len - 1, offset); 8077 } 8078 } 8079 8080 #ifndef PERL_IN_XSUB_RE 8081 8082 IV 8083 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) 8084 { 8085 /* Searches the inversion list for the entry that contains the input code 8086 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the 8087 * return value is the index into the list's array of the range that 8088 * contains <cp> */ 8089 8090 IV low = 0; 8091 IV mid; 8092 IV high = _invlist_len(invlist); 8093 const IV highest_element = high - 1; 8094 const UV* array; 8095 8096 PERL_ARGS_ASSERT__INVLIST_SEARCH; 8097 8098 /* If list is empty, return failure. */ 8099 if (high == 0) { 8100 return -1; 8101 } 8102 8103 /* (We can't get the array unless we know the list is non-empty) */ 8104 array = invlist_array(invlist); 8105 8106 mid = invlist_previous_index(invlist); 8107 assert(mid >=0 && mid <= highest_element); 8108 8109 /* <mid> contains the cache of the result of the previous call to this 8110 * function (0 the first time). See if this call is for the same result, 8111 * or if it is for mid-1. This is under the theory that calls to this 8112 * function will often be for related code points that are near each other. 8113 * And benchmarks show that caching gives better results. We also test 8114 * here if the code point is within the bounds of the list. These tests 8115 * replace others that would have had to be made anyway to make sure that 8116 * the array bounds were not exceeded, and these give us extra information 8117 * at the same time */ 8118 if (cp >= array[mid]) { 8119 if (cp >= array[highest_element]) { 8120 return highest_element; 8121 } 8122 8123 /* Here, array[mid] <= cp < array[highest_element]. This means that 8124 * the final element is not the answer, so can exclude it; it also 8125 * means that <mid> is not the final element, so can refer to 'mid + 1' 8126 * safely */ 8127 if (cp < array[mid + 1]) { 8128 return mid; 8129 } 8130 high--; 8131 low = mid + 1; 8132 } 8133 else { /* cp < aray[mid] */ 8134 if (cp < array[0]) { /* Fail if outside the array */ 8135 return -1; 8136 } 8137 high = mid; 8138 if (cp >= array[mid - 1]) { 8139 goto found_entry; 8140 } 8141 } 8142 8143 /* Binary search. What we are looking for is <i> such that 8144 * array[i] <= cp < array[i+1] 8145 * The loop below converges on the i+1. Note that there may not be an 8146 * (i+1)th element in the array, and things work nonetheless */ 8147 while (low < high) { 8148 mid = (low + high) / 2; 8149 assert(mid <= highest_element); 8150 if (array[mid] <= cp) { /* cp >= array[mid] */ 8151 low = mid + 1; 8152 8153 /* We could do this extra test to exit the loop early. 8154 if (cp < array[low]) { 8155 return mid; 8156 } 8157 */ 8158 } 8159 else { /* cp < array[mid] */ 8160 high = mid; 8161 } 8162 } 8163 8164 found_entry: 8165 high--; 8166 invlist_set_previous_index(invlist, high); 8167 return high; 8168 } 8169 8170 void 8171 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, 8172 const UV start, const UV end, U8* swatch) 8173 { 8174 /* populates a swatch of a swash the same way swatch_get() does in utf8.c, 8175 * but is used when the swash has an inversion list. This makes this much 8176 * faster, as it uses a binary search instead of a linear one. This is 8177 * intimately tied to that function, and perhaps should be in utf8.c, 8178 * except it is intimately tied to inversion lists as well. It assumes 8179 * that <swatch> is all 0's on input */ 8180 8181 UV current = start; 8182 const IV len = _invlist_len(invlist); 8183 IV i; 8184 const UV * array; 8185 8186 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH; 8187 8188 if (len == 0) { /* Empty inversion list */ 8189 return; 8190 } 8191 8192 array = invlist_array(invlist); 8193 8194 /* Find which element it is */ 8195 i = _invlist_search(invlist, start); 8196 8197 /* We populate from <start> to <end> */ 8198 while (current < end) { 8199 UV upper; 8200 8201 /* The inversion list gives the results for every possible code point 8202 * after the first one in the list. Only those ranges whose index is 8203 * even are ones that the inversion list matches. For the odd ones, 8204 * and if the initial code point is not in the list, we have to skip 8205 * forward to the next element */ 8206 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) { 8207 i++; 8208 if (i >= len) { /* Finished if beyond the end of the array */ 8209 return; 8210 } 8211 current = array[i]; 8212 if (current >= end) { /* Finished if beyond the end of what we 8213 are populating */ 8214 if (LIKELY(end < UV_MAX)) { 8215 return; 8216 } 8217 8218 /* We get here when the upper bound is the maximum 8219 * representable on the machine, and we are looking for just 8220 * that code point. Have to special case it */ 8221 i = len; 8222 goto join_end_of_list; 8223 } 8224 } 8225 assert(current >= start); 8226 8227 /* The current range ends one below the next one, except don't go past 8228 * <end> */ 8229 i++; 8230 upper = (i < len && array[i] < end) ? array[i] : end; 8231 8232 /* Here we are in a range that matches. Populate a bit in the 3-bit U8 8233 * for each code point in it */ 8234 for (; current < upper; current++) { 8235 const STRLEN offset = (STRLEN)(current - start); 8236 swatch[offset >> 3] |= 1 << (offset & 7); 8237 } 8238 8239 join_end_of_list: 8240 8241 /* Quit if at the end of the list */ 8242 if (i >= len) { 8243 8244 /* But first, have to deal with the highest possible code point on 8245 * the platform. The previous code assumes that <end> is one 8246 * beyond where we want to populate, but that is impossible at the 8247 * platform's infinity, so have to handle it specially */ 8248 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1))) 8249 { 8250 const STRLEN offset = (STRLEN)(end - start); 8251 swatch[offset >> 3] |= 1 << (offset & 7); 8252 } 8253 return; 8254 } 8255 8256 /* Advance to the next range, which will be for code points not in the 8257 * inversion list */ 8258 current = array[i]; 8259 } 8260 8261 return; 8262 } 8263 8264 void 8265 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, 8266 const bool complement_b, SV** output) 8267 { 8268 /* Take the union of two inversion lists and point <output> to it. *output 8269 * SHOULD BE DEFINED upon input, and if it points to one of the two lists, 8270 * the reference count to that list will be decremented if not already a 8271 * temporary (mortal); otherwise *output will be made correspondingly 8272 * mortal. The first list, <a>, may be NULL, in which case a copy of the 8273 * second list is returned. If <complement_b> is TRUE, the union is taken 8274 * of the complement (inversion) of <b> instead of b itself. 8275 * 8276 * The basis for this comes from "Unicode Demystified" Chapter 13 by 8277 * Richard Gillam, published by Addison-Wesley, and explained at some 8278 * length there. The preface says to incorporate its examples into your 8279 * code at your own risk. 8280 * 8281 * The algorithm is like a merge sort. 8282 * 8283 * XXX A potential performance improvement is to keep track as we go along 8284 * if only one of the inputs contributes to the result, meaning the other 8285 * is a subset of that one. In that case, we can skip the final copy and 8286 * return the larger of the input lists, but then outside code might need 8287 * to keep track of whether to free the input list or not */ 8288 8289 const UV* array_a; /* a's array */ 8290 const UV* array_b; 8291 UV len_a; /* length of a's array */ 8292 UV len_b; 8293 8294 SV* u; /* the resulting union */ 8295 UV* array_u; 8296 UV len_u; 8297 8298 UV i_a = 0; /* current index into a's array */ 8299 UV i_b = 0; 8300 UV i_u = 0; 8301 8302 /* running count, as explained in the algorithm source book; items are 8303 * stopped accumulating and are output when the count changes to/from 0. 8304 * The count is incremented when we start a range that's in the set, and 8305 * decremented when we start a range that's not in the set. So its range 8306 * is 0 to 2. Only when the count is zero is something not in the set. 8307 */ 8308 UV count = 0; 8309 8310 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; 8311 assert(a != b); 8312 8313 /* If either one is empty, the union is the other one */ 8314 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { 8315 bool make_temp = FALSE; /* Should we mortalize the result? */ 8316 8317 if (*output == a) { 8318 if (a != NULL) { 8319 if (! (make_temp = cBOOL(SvTEMP(a)))) { 8320 SvREFCNT_dec_NN(a); 8321 } 8322 } 8323 } 8324 if (*output != b) { 8325 *output = invlist_clone(b); 8326 if (complement_b) { 8327 _invlist_invert(*output); 8328 } 8329 } /* else *output already = b; */ 8330 8331 if (make_temp) { 8332 sv_2mortal(*output); 8333 } 8334 return; 8335 } 8336 else if ((len_b = _invlist_len(b)) == 0) { 8337 bool make_temp = FALSE; 8338 if (*output == b) { 8339 if (! (make_temp = cBOOL(SvTEMP(b)))) { 8340 SvREFCNT_dec_NN(b); 8341 } 8342 } 8343 8344 /* The complement of an empty list is a list that has everything in it, 8345 * so the union with <a> includes everything too */ 8346 if (complement_b) { 8347 if (a == *output) { 8348 if (! (make_temp = cBOOL(SvTEMP(a)))) { 8349 SvREFCNT_dec_NN(a); 8350 } 8351 } 8352 *output = _new_invlist(1); 8353 _append_range_to_invlist(*output, 0, UV_MAX); 8354 } 8355 else if (*output != a) { 8356 *output = invlist_clone(a); 8357 } 8358 /* else *output already = a; */ 8359 8360 if (make_temp) { 8361 sv_2mortal(*output); 8362 } 8363 return; 8364 } 8365 8366 /* Here both lists exist and are non-empty */ 8367 array_a = invlist_array(a); 8368 array_b = invlist_array(b); 8369 8370 /* If are to take the union of 'a' with the complement of b, set it 8371 * up so are looking at b's complement. */ 8372 if (complement_b) { 8373 8374 /* To complement, we invert: if the first element is 0, remove it. To 8375 * do this, we just pretend the array starts one later */ 8376 if (array_b[0] == 0) { 8377 array_b++; 8378 len_b--; 8379 } 8380 else { 8381 8382 /* But if the first element is not zero, we pretend the list starts 8383 * at the 0 that is always stored immediately before the array. */ 8384 array_b--; 8385 len_b++; 8386 } 8387 } 8388 8389 /* Size the union for the worst case: that the sets are completely 8390 * disjoint */ 8391 u = _new_invlist(len_a + len_b); 8392 8393 /* Will contain U+0000 if either component does */ 8394 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0) 8395 || (len_b > 0 && array_b[0] == 0)); 8396 8397 /* Go through each list item by item, stopping when exhausted one of 8398 * them */ 8399 while (i_a < len_a && i_b < len_b) { 8400 UV cp; /* The element to potentially add to the union's array */ 8401 bool cp_in_set; /* is it in the the input list's set or not */ 8402 8403 /* We need to take one or the other of the two inputs for the union. 8404 * Since we are merging two sorted lists, we take the smaller of the 8405 * next items. In case of a tie, we take the one that is in its set 8406 * first. If we took one not in the set first, it would decrement the 8407 * count, possibly to 0 which would cause it to be output as ending the 8408 * range, and the next time through we would take the same number, and 8409 * output it again as beginning the next range. By doing it the 8410 * opposite way, there is no possibility that the count will be 8411 * momentarily decremented to 0, and thus the two adjoining ranges will 8412 * be seamlessly merged. (In a tie and both are in the set or both not 8413 * in the set, it doesn't matter which we take first.) */ 8414 if (array_a[i_a] < array_b[i_b] 8415 || (array_a[i_a] == array_b[i_b] 8416 && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) 8417 { 8418 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); 8419 cp= array_a[i_a++]; 8420 } 8421 else { 8422 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); 8423 cp = array_b[i_b++]; 8424 } 8425 8426 /* Here, have chosen which of the two inputs to look at. Only output 8427 * if the running count changes to/from 0, which marks the 8428 * beginning/end of a range in that's in the set */ 8429 if (cp_in_set) { 8430 if (count == 0) { 8431 array_u[i_u++] = cp; 8432 } 8433 count++; 8434 } 8435 else { 8436 count--; 8437 if (count == 0) { 8438 array_u[i_u++] = cp; 8439 } 8440 } 8441 } 8442 8443 /* Here, we are finished going through at least one of the lists, which 8444 * means there is something remaining in at most one. We check if the list 8445 * that hasn't been exhausted is positioned such that we are in the middle 8446 * of a range in its set or not. (i_a and i_b point to the element beyond 8447 * the one we care about.) If in the set, we decrement 'count'; if 0, there 8448 * is potentially more to output. 8449 * There are four cases: 8450 * 1) Both weren't in their sets, count is 0, and remains 0. What's left 8451 * in the union is entirely from the non-exhausted set. 8452 * 2) Both were in their sets, count is 2. Nothing further should 8453 * be output, as everything that remains will be in the exhausted 8454 * list's set, hence in the union; decrementing to 1 but not 0 insures 8455 * that 8456 * 3) the exhausted was in its set, non-exhausted isn't, count is 1. 8457 * Nothing further should be output because the union includes 8458 * everything from the exhausted set. Not decrementing ensures that. 8459 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1; 8460 * decrementing to 0 insures that we look at the remainder of the 8461 * non-exhausted set */ 8462 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) 8463 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) 8464 { 8465 count--; 8466 } 8467 8468 /* The final length is what we've output so far, plus what else is about to 8469 * be output. (If 'count' is non-zero, then the input list we exhausted 8470 * has everything remaining up to the machine's limit in its set, and hence 8471 * in the union, so there will be no further output. */ 8472 len_u = i_u; 8473 if (count == 0) { 8474 /* At most one of the subexpressions will be non-zero */ 8475 len_u += (len_a - i_a) + (len_b - i_b); 8476 } 8477 8478 /* Set result to final length, which can change the pointer to array_u, so 8479 * re-find it */ 8480 if (len_u != _invlist_len(u)) { 8481 invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); 8482 invlist_trim(u); 8483 array_u = invlist_array(u); 8484 } 8485 8486 /* When 'count' is 0, the list that was exhausted (if one was shorter than 8487 * the other) ended with everything above it not in its set. That means 8488 * that the remaining part of the union is precisely the same as the 8489 * non-exhausted list, so can just copy it unchanged. (If both list were 8490 * exhausted at the same time, then the operations below will be both 0.) 8491 */ 8492 if (count == 0) { 8493 IV copy_count; /* At most one will have a non-zero copy count */ 8494 if ((copy_count = len_a - i_a) > 0) { 8495 Copy(array_a + i_a, array_u + i_u, copy_count, UV); 8496 } 8497 else if ((copy_count = len_b - i_b) > 0) { 8498 Copy(array_b + i_b, array_u + i_u, copy_count, UV); 8499 } 8500 } 8501 8502 /* We may be removing a reference to one of the inputs. If so, the output 8503 * is made mortal if the input was. (Mortal SVs shouldn't have their ref 8504 * count decremented) */ 8505 if (a == *output || b == *output) { 8506 assert(! invlist_is_iterating(*output)); 8507 if ((SvTEMP(*output))) { 8508 sv_2mortal(u); 8509 } 8510 else { 8511 SvREFCNT_dec_NN(*output); 8512 } 8513 } 8514 8515 *output = u; 8516 8517 return; 8518 } 8519 8520 void 8521 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, 8522 const bool complement_b, SV** i) 8523 { 8524 /* Take the intersection of two inversion lists and point <i> to it. *i 8525 * SHOULD BE DEFINED upon input, and if it points to one of the two lists, 8526 * the reference count to that list will be decremented if not already a 8527 * temporary (mortal); otherwise *i will be made correspondingly mortal. 8528 * The first list, <a>, may be NULL, in which case an empty list is 8529 * returned. If <complement_b> is TRUE, the result will be the 8530 * intersection of <a> and the complement (or inversion) of <b> instead of 8531 * <b> directly. 8532 * 8533 * The basis for this comes from "Unicode Demystified" Chapter 13 by 8534 * Richard Gillam, published by Addison-Wesley, and explained at some 8535 * length there. The preface says to incorporate its examples into your 8536 * code at your own risk. In fact, it had bugs 8537 * 8538 * The algorithm is like a merge sort, and is essentially the same as the 8539 * union above 8540 */ 8541 8542 const UV* array_a; /* a's array */ 8543 const UV* array_b; 8544 UV len_a; /* length of a's array */ 8545 UV len_b; 8546 8547 SV* r; /* the resulting intersection */ 8548 UV* array_r; 8549 UV len_r; 8550 8551 UV i_a = 0; /* current index into a's array */ 8552 UV i_b = 0; 8553 UV i_r = 0; 8554 8555 /* running count, as explained in the algorithm source book; items are 8556 * stopped accumulating and are output when the count changes to/from 2. 8557 * The count is incremented when we start a range that's in the set, and 8558 * decremented when we start a range that's not in the set. So its range 8559 * is 0 to 2. Only when the count is 2 is something in the intersection. 8560 */ 8561 UV count = 0; 8562 8563 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; 8564 assert(a != b); 8565 8566 /* Special case if either one is empty */ 8567 len_a = (a == NULL) ? 0 : _invlist_len(a); 8568 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { 8569 bool make_temp = FALSE; 8570 8571 if (len_a != 0 && complement_b) { 8572 8573 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must 8574 * be empty. Here, also we are using 'b's complement, which hence 8575 * must be every possible code point. Thus the intersection is 8576 * simply 'a'. */ 8577 if (*i != a) { 8578 if (*i == b) { 8579 if (! (make_temp = cBOOL(SvTEMP(b)))) { 8580 SvREFCNT_dec_NN(b); 8581 } 8582 } 8583 8584 *i = invlist_clone(a); 8585 } 8586 /* else *i is already 'a' */ 8587 8588 if (make_temp) { 8589 sv_2mortal(*i); 8590 } 8591 return; 8592 } 8593 8594 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The 8595 * intersection must be empty */ 8596 if (*i == a) { 8597 if (! (make_temp = cBOOL(SvTEMP(a)))) { 8598 SvREFCNT_dec_NN(a); 8599 } 8600 } 8601 else if (*i == b) { 8602 if (! (make_temp = cBOOL(SvTEMP(b)))) { 8603 SvREFCNT_dec_NN(b); 8604 } 8605 } 8606 *i = _new_invlist(0); 8607 if (make_temp) { 8608 sv_2mortal(*i); 8609 } 8610 8611 return; 8612 } 8613 8614 /* Here both lists exist and are non-empty */ 8615 array_a = invlist_array(a); 8616 array_b = invlist_array(b); 8617 8618 /* If are to take the intersection of 'a' with the complement of b, set it 8619 * up so are looking at b's complement. */ 8620 if (complement_b) { 8621 8622 /* To complement, we invert: if the first element is 0, remove it. To 8623 * do this, we just pretend the array starts one later */ 8624 if (array_b[0] == 0) { 8625 array_b++; 8626 len_b--; 8627 } 8628 else { 8629 8630 /* But if the first element is not zero, we pretend the list starts 8631 * at the 0 that is always stored immediately before the array. */ 8632 array_b--; 8633 len_b++; 8634 } 8635 } 8636 8637 /* Size the intersection for the worst case: that the intersection ends up 8638 * fragmenting everything to be completely disjoint */ 8639 r= _new_invlist(len_a + len_b); 8640 8641 /* Will contain U+0000 iff both components do */ 8642 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 8643 && len_b > 0 && array_b[0] == 0); 8644 8645 /* Go through each list item by item, stopping when exhausted one of 8646 * them */ 8647 while (i_a < len_a && i_b < len_b) { 8648 UV cp; /* The element to potentially add to the intersection's 8649 array */ 8650 bool cp_in_set; /* Is it in the input list's set or not */ 8651 8652 /* We need to take one or the other of the two inputs for the 8653 * intersection. Since we are merging two sorted lists, we take the 8654 * smaller of the next items. In case of a tie, we take the one that 8655 * is not in its set first (a difference from the union algorithm). If 8656 * we took one in the set first, it would increment the count, possibly 8657 * to 2 which would cause it to be output as starting a range in the 8658 * intersection, and the next time through we would take that same 8659 * number, and output it again as ending the set. By doing it the 8660 * opposite of this, there is no possibility that the count will be 8661 * momentarily incremented to 2. (In a tie and both are in the set or 8662 * both not in the set, it doesn't matter which we take first.) */ 8663 if (array_a[i_a] < array_b[i_b] 8664 || (array_a[i_a] == array_b[i_b] 8665 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) 8666 { 8667 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); 8668 cp= array_a[i_a++]; 8669 } 8670 else { 8671 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); 8672 cp= array_b[i_b++]; 8673 } 8674 8675 /* Here, have chosen which of the two inputs to look at. Only output 8676 * if the running count changes to/from 2, which marks the 8677 * beginning/end of a range that's in the intersection */ 8678 if (cp_in_set) { 8679 count++; 8680 if (count == 2) { 8681 array_r[i_r++] = cp; 8682 } 8683 } 8684 else { 8685 if (count == 2) { 8686 array_r[i_r++] = cp; 8687 } 8688 count--; 8689 } 8690 } 8691 8692 /* Here, we are finished going through at least one of the lists, which 8693 * means there is something remaining in at most one. We check if the list 8694 * that has been exhausted is positioned such that we are in the middle 8695 * of a range in its set or not. (i_a and i_b point to elements 1 beyond 8696 * the ones we care about.) There are four cases: 8697 * 1) Both weren't in their sets, count is 0, and remains 0. There's 8698 * nothing left in the intersection. 8699 * 2) Both were in their sets, count is 2 and perhaps is incremented to 8700 * above 2. What should be output is exactly that which is in the 8701 * non-exhausted set, as everything it has is also in the intersection 8702 * set, and everything it doesn't have can't be in the intersection 8703 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and 8704 * gets incremented to 2. Like the previous case, the intersection is 8705 * everything that remains in the non-exhausted set. 8706 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and 8707 * remains 1. And the intersection has nothing more. */ 8708 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) 8709 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) 8710 { 8711 count++; 8712 } 8713 8714 /* The final length is what we've output so far plus what else is in the 8715 * intersection. At most one of the subexpressions below will be non-zero 8716 * */ 8717 len_r = i_r; 8718 if (count >= 2) { 8719 len_r += (len_a - i_a) + (len_b - i_b); 8720 } 8721 8722 /* Set result to final length, which can change the pointer to array_r, so 8723 * re-find it */ 8724 if (len_r != _invlist_len(r)) { 8725 invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); 8726 invlist_trim(r); 8727 array_r = invlist_array(r); 8728 } 8729 8730 /* Finish outputting any remaining */ 8731 if (count >= 2) { /* At most one will have a non-zero copy count */ 8732 IV copy_count; 8733 if ((copy_count = len_a - i_a) > 0) { 8734 Copy(array_a + i_a, array_r + i_r, copy_count, UV); 8735 } 8736 else if ((copy_count = len_b - i_b) > 0) { 8737 Copy(array_b + i_b, array_r + i_r, copy_count, UV); 8738 } 8739 } 8740 8741 /* We may be removing a reference to one of the inputs. If so, the output 8742 * is made mortal if the input was. (Mortal SVs shouldn't have their ref 8743 * count decremented) */ 8744 if (a == *i || b == *i) { 8745 assert(! invlist_is_iterating(*i)); 8746 if (SvTEMP(*i)) { 8747 sv_2mortal(r); 8748 } 8749 else { 8750 SvREFCNT_dec_NN(*i); 8751 } 8752 } 8753 8754 *i = r; 8755 8756 return; 8757 } 8758 8759 SV* 8760 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) 8761 { 8762 /* Add the range from 'start' to 'end' inclusive to the inversion list's 8763 * set. A pointer to the inversion list is returned. This may actually be 8764 * a new list, in which case the passed in one has been destroyed. The 8765 * passed in inversion list can be NULL, in which case a new one is created 8766 * with just the one range in it */ 8767 8768 SV* range_invlist; 8769 UV len; 8770 8771 if (invlist == NULL) { 8772 invlist = _new_invlist(2); 8773 len = 0; 8774 } 8775 else { 8776 len = _invlist_len(invlist); 8777 } 8778 8779 /* If comes after the final entry actually in the list, can just append it 8780 * to the end, */ 8781 if (len == 0 8782 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1) 8783 && start >= invlist_array(invlist)[len - 1])) 8784 { 8785 _append_range_to_invlist(invlist, start, end); 8786 return invlist; 8787 } 8788 8789 /* Here, can't just append things, create and return a new inversion list 8790 * which is the union of this range and the existing inversion list */ 8791 range_invlist = _new_invlist(2); 8792 _append_range_to_invlist(range_invlist, start, end); 8793 8794 _invlist_union(invlist, range_invlist, &invlist); 8795 8796 /* The temporary can be freed */ 8797 SvREFCNT_dec_NN(range_invlist); 8798 8799 return invlist; 8800 } 8801 8802 SV* 8803 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, 8804 UV** other_elements_ptr) 8805 { 8806 /* Create and return an inversion list whose contents are to be populated 8807 * by the caller. The caller gives the number of elements (in 'size') and 8808 * the very first element ('element0'). This function will set 8809 * '*other_elements_ptr' to an array of UVs, where the remaining elements 8810 * are to be placed. 8811 * 8812 * Obviously there is some trust involved that the caller will properly 8813 * fill in the other elements of the array. 8814 * 8815 * (The first element needs to be passed in, as the underlying code does 8816 * things differently depending on whether it is zero or non-zero) */ 8817 8818 SV* invlist = _new_invlist(size); 8819 bool offset; 8820 8821 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; 8822 8823 _append_range_to_invlist(invlist, element0, element0); 8824 offset = *get_invlist_offset_addr(invlist); 8825 8826 invlist_set_len(invlist, size, offset); 8827 *other_elements_ptr = invlist_array(invlist) + 1; 8828 return invlist; 8829 } 8830 8831 #endif 8832 8833 PERL_STATIC_INLINE SV* 8834 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { 8835 return _add_range_to_invlist(invlist, cp, cp); 8836 } 8837 8838 #ifndef PERL_IN_XSUB_RE 8839 void 8840 Perl__invlist_invert(pTHX_ SV* const invlist) 8841 { 8842 /* Complement the input inversion list. This adds a 0 if the list didn't 8843 * have a zero; removes it otherwise. As described above, the data 8844 * structure is set up so that this is very efficient */ 8845 8846 PERL_ARGS_ASSERT__INVLIST_INVERT; 8847 8848 assert(! invlist_is_iterating(invlist)); 8849 8850 /* The inverse of matching nothing is matching everything */ 8851 if (_invlist_len(invlist) == 0) { 8852 _append_range_to_invlist(invlist, 0, UV_MAX); 8853 return; 8854 } 8855 8856 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); 8857 } 8858 8859 #endif 8860 8861 PERL_STATIC_INLINE SV* 8862 S_invlist_clone(pTHX_ SV* const invlist) 8863 { 8864 8865 /* Return a new inversion list that is a copy of the input one, which is 8866 * unchanged. The new list will not be mortal even if the old one was. */ 8867 8868 /* Need to allocate extra space to accommodate Perl's addition of a 8869 * trailing NUL to SvPV's, since it thinks they are always strings */ 8870 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); 8871 STRLEN physical_length = SvCUR(invlist); 8872 bool offset = *(get_invlist_offset_addr(invlist)); 8873 8874 PERL_ARGS_ASSERT_INVLIST_CLONE; 8875 8876 *(get_invlist_offset_addr(new_invlist)) = offset; 8877 invlist_set_len(new_invlist, _invlist_len(invlist), offset); 8878 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); 8879 8880 return new_invlist; 8881 } 8882 8883 PERL_STATIC_INLINE STRLEN* 8884 S_get_invlist_iter_addr(pTHX_ SV* invlist) 8885 { 8886 /* Return the address of the UV that contains the current iteration 8887 * position */ 8888 8889 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; 8890 8891 assert(SvTYPE(invlist) == SVt_INVLIST); 8892 8893 return &(((XINVLIST*) SvANY(invlist))->iterator); 8894 } 8895 8896 PERL_STATIC_INLINE void 8897 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ 8898 { 8899 PERL_ARGS_ASSERT_INVLIST_ITERINIT; 8900 8901 *get_invlist_iter_addr(invlist) = 0; 8902 } 8903 8904 PERL_STATIC_INLINE void 8905 S_invlist_iterfinish(pTHX_ SV* invlist) 8906 { 8907 /* Terminate iterator for invlist. This is to catch development errors. 8908 * Any iteration that is interrupted before completed should call this 8909 * function. Functions that add code points anywhere else but to the end 8910 * of an inversion list assert that they are not in the middle of an 8911 * iteration. If they were, the addition would make the iteration 8912 * problematical: if the iteration hadn't reached the place where things 8913 * were being added, it would be ok */ 8914 8915 PERL_ARGS_ASSERT_INVLIST_ITERFINISH; 8916 8917 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; 8918 } 8919 8920 STATIC bool 8921 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) 8922 { 8923 /* An C<invlist_iterinit> call on <invlist> must be used to set this up. 8924 * This call sets in <*start> and <*end>, the next range in <invlist>. 8925 * Returns <TRUE> if successful and the next call will return the next 8926 * range; <FALSE> if was already at the end of the list. If the latter, 8927 * <*start> and <*end> are unchanged, and the next call to this function 8928 * will start over at the beginning of the list */ 8929 8930 STRLEN* pos = get_invlist_iter_addr(invlist); 8931 UV len = _invlist_len(invlist); 8932 UV *array; 8933 8934 PERL_ARGS_ASSERT_INVLIST_ITERNEXT; 8935 8936 if (*pos >= len) { 8937 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ 8938 return FALSE; 8939 } 8940 8941 array = invlist_array(invlist); 8942 8943 *start = array[(*pos)++]; 8944 8945 if (*pos >= len) { 8946 *end = UV_MAX; 8947 } 8948 else { 8949 *end = array[(*pos)++] - 1; 8950 } 8951 8952 return TRUE; 8953 } 8954 8955 PERL_STATIC_INLINE bool 8956 S_invlist_is_iterating(pTHX_ SV* const invlist) 8957 { 8958 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; 8959 8960 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; 8961 } 8962 8963 PERL_STATIC_INLINE UV 8964 S_invlist_highest(pTHX_ SV* const invlist) 8965 { 8966 /* Returns the highest code point that matches an inversion list. This API 8967 * has an ambiguity, as it returns 0 under either the highest is actually 8968 * 0, or if the list is empty. If this distinction matters to you, check 8969 * for emptiness before calling this function */ 8970 8971 UV len = _invlist_len(invlist); 8972 UV *array; 8973 8974 PERL_ARGS_ASSERT_INVLIST_HIGHEST; 8975 8976 if (len == 0) { 8977 return 0; 8978 } 8979 8980 array = invlist_array(invlist); 8981 8982 /* The last element in the array in the inversion list always starts a 8983 * range that goes to infinity. That range may be for code points that are 8984 * matched in the inversion list, or it may be for ones that aren't 8985 * matched. In the latter case, the highest code point in the set is one 8986 * less than the beginning of this range; otherwise it is the final element 8987 * of this range: infinity */ 8988 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) 8989 ? UV_MAX 8990 : array[len - 1] - 1; 8991 } 8992 8993 #ifndef PERL_IN_XSUB_RE 8994 SV * 8995 Perl__invlist_contents(pTHX_ SV* const invlist) 8996 { 8997 /* Get the contents of an inversion list into a string SV so that they can 8998 * be printed out. It uses the format traditionally done for debug tracing 8999 */ 9000 9001 UV start, end; 9002 SV* output = newSVpvs("\n"); 9003 9004 PERL_ARGS_ASSERT__INVLIST_CONTENTS; 9005 9006 assert(! invlist_is_iterating(invlist)); 9007 9008 invlist_iterinit(invlist); 9009 while (invlist_iternext(invlist, &start, &end)) { 9010 if (end == UV_MAX) { 9011 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start); 9012 } 9013 else if (end != start) { 9014 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n", 9015 start, end); 9016 } 9017 else { 9018 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start); 9019 } 9020 } 9021 9022 return output; 9023 } 9024 #endif 9025 9026 #ifndef PERL_IN_XSUB_RE 9027 void 9028 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, 9029 const char * const indent, SV* const invlist) 9030 { 9031 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the 9032 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by 9033 * the string 'indent'. The output looks like this: 9034 [0] 0x000A .. 0x000D 9035 [2] 0x0085 9036 [4] 0x2028 .. 0x2029 9037 [6] 0x3104 .. INFINITY 9038 * This means that the first range of code points matched by the list are 9039 * 0xA through 0xD; the second range contains only the single code point 9040 * 0x85, etc. An inversion list is an array of UVs. Two array elements 9041 * are used to define each range (except if the final range extends to 9042 * infinity, only a single element is needed). The array index of the 9043 * first element for the corresponding range is given in brackets. */ 9044 9045 UV start, end; 9046 STRLEN count = 0; 9047 9048 PERL_ARGS_ASSERT__INVLIST_DUMP; 9049 9050 if (invlist_is_iterating(invlist)) { 9051 Perl_dump_indent(aTHX_ level, file, 9052 "%sCan't dump inversion list because is in middle of iterating\n", 9053 indent); 9054 return; 9055 } 9056 9057 invlist_iterinit(invlist); 9058 while (invlist_iternext(invlist, &start, &end)) { 9059 if (end == UV_MAX) { 9060 Perl_dump_indent(aTHX_ level, file, 9061 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", 9062 indent, (UV)count, start); 9063 } 9064 else if (end != start) { 9065 Perl_dump_indent(aTHX_ level, file, 9066 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", 9067 indent, (UV)count, start, end); 9068 } 9069 else { 9070 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", 9071 indent, (UV)count, start); 9072 } 9073 count += 2; 9074 } 9075 } 9076 #endif 9077 9078 #ifdef PERL_ARGS_ASSERT__INVLISTEQ 9079 bool 9080 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) 9081 { 9082 /* Return a boolean as to if the two passed in inversion lists are 9083 * identical. The final argument, if TRUE, says to take the complement of 9084 * the second inversion list before doing the comparison */ 9085 9086 const UV* array_a = invlist_array(a); 9087 const UV* array_b = invlist_array(b); 9088 UV len_a = _invlist_len(a); 9089 UV len_b = _invlist_len(b); 9090 9091 UV i = 0; /* current index into the arrays */ 9092 bool retval = TRUE; /* Assume are identical until proven otherwise */ 9093 9094 PERL_ARGS_ASSERT__INVLISTEQ; 9095 9096 /* If are to compare 'a' with the complement of b, set it 9097 * up so are looking at b's complement. */ 9098 if (complement_b) { 9099 9100 /* The complement of nothing is everything, so <a> would have to have 9101 * just one element, starting at zero (ending at infinity) */ 9102 if (len_b == 0) { 9103 return (len_a == 1 && array_a[0] == 0); 9104 } 9105 else if (array_b[0] == 0) { 9106 9107 /* Otherwise, to complement, we invert. Here, the first element is 9108 * 0, just remove it. To do this, we just pretend the array starts 9109 * one later */ 9110 9111 array_b++; 9112 len_b--; 9113 } 9114 else { 9115 9116 /* But if the first element is not zero, we pretend the list starts 9117 * at the 0 that is always stored immediately before the array. */ 9118 array_b--; 9119 len_b++; 9120 } 9121 } 9122 9123 /* Make sure that the lengths are the same, as well as the final element 9124 * before looping through the remainder. (Thus we test the length, final, 9125 * and first elements right off the bat) */ 9126 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) { 9127 retval = FALSE; 9128 } 9129 else for (i = 0; i < len_a - 1; i++) { 9130 if (array_a[i] != array_b[i]) { 9131 retval = FALSE; 9132 break; 9133 } 9134 } 9135 9136 return retval; 9137 } 9138 #endif 9139 9140 #undef HEADER_LENGTH 9141 #undef TO_INTERNAL_SIZE 9142 #undef FROM_INTERNAL_SIZE 9143 #undef INVLIST_VERSION_ID 9144 9145 /* End of inversion list object */ 9146 9147 STATIC void 9148 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) 9149 { 9150 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' 9151 * constructs, and updates RExC_flags with them. On input, RExC_parse 9152 * should point to the first flag; it is updated on output to point to the 9153 * final ')' or ':'. There needs to be at least one flag, or this will 9154 * abort */ 9155 9156 /* for (?g), (?gc), and (?o) warnings; warning 9157 about (?c) will warn about (?g) -- japhy */ 9158 9159 #define WASTED_O 0x01 9160 #define WASTED_G 0x02 9161 #define WASTED_C 0x04 9162 #define WASTED_GC (WASTED_G|WASTED_C) 9163 I32 wastedflags = 0x00; 9164 U32 posflags = 0, negflags = 0; 9165 U32 *flagsp = &posflags; 9166 char has_charset_modifier = '\0'; 9167 regex_charset cs; 9168 bool has_use_defaults = FALSE; 9169 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */ 9170 9171 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS; 9172 9173 /* '^' as an initial flag sets certain defaults */ 9174 if (UCHARAT(RExC_parse) == '^') { 9175 RExC_parse++; 9176 has_use_defaults = TRUE; 9177 STD_PMMOD_FLAGS_CLEAR(&RExC_flags); 9178 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics) 9179 ? REGEX_UNICODE_CHARSET 9180 : REGEX_DEPENDS_CHARSET); 9181 } 9182 9183 cs = get_regex_charset(RExC_flags); 9184 if (cs == REGEX_DEPENDS_CHARSET 9185 && (RExC_utf8 || RExC_uni_semantics)) 9186 { 9187 cs = REGEX_UNICODE_CHARSET; 9188 } 9189 9190 while (*RExC_parse) { 9191 /* && strchr("iogcmsx", *RExC_parse) */ 9192 /* (?g), (?gc) and (?o) are useless here 9193 and must be globally applied -- japhy */ 9194 switch (*RExC_parse) { 9195 9196 /* Code for the imsx flags */ 9197 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); 9198 9199 case LOCALE_PAT_MOD: 9200 if (has_charset_modifier) { 9201 goto excess_modifier; 9202 } 9203 else if (flagsp == &negflags) { 9204 goto neg_modifier; 9205 } 9206 cs = REGEX_LOCALE_CHARSET; 9207 has_charset_modifier = LOCALE_PAT_MOD; 9208 break; 9209 case UNICODE_PAT_MOD: 9210 if (has_charset_modifier) { 9211 goto excess_modifier; 9212 } 9213 else if (flagsp == &negflags) { 9214 goto neg_modifier; 9215 } 9216 cs = REGEX_UNICODE_CHARSET; 9217 has_charset_modifier = UNICODE_PAT_MOD; 9218 break; 9219 case ASCII_RESTRICT_PAT_MOD: 9220 if (flagsp == &negflags) { 9221 goto neg_modifier; 9222 } 9223 if (has_charset_modifier) { 9224 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) { 9225 goto excess_modifier; 9226 } 9227 /* Doubled modifier implies more restricted */ 9228 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; 9229 } 9230 else { 9231 cs = REGEX_ASCII_RESTRICTED_CHARSET; 9232 } 9233 has_charset_modifier = ASCII_RESTRICT_PAT_MOD; 9234 break; 9235 case DEPENDS_PAT_MOD: 9236 if (has_use_defaults) { 9237 goto fail_modifiers; 9238 } 9239 else if (flagsp == &negflags) { 9240 goto neg_modifier; 9241 } 9242 else if (has_charset_modifier) { 9243 goto excess_modifier; 9244 } 9245 9246 /* The dual charset means unicode semantics if the 9247 * pattern (or target, not known until runtime) are 9248 * utf8, or something in the pattern indicates unicode 9249 * semantics */ 9250 cs = (RExC_utf8 || RExC_uni_semantics) 9251 ? REGEX_UNICODE_CHARSET 9252 : REGEX_DEPENDS_CHARSET; 9253 has_charset_modifier = DEPENDS_PAT_MOD; 9254 break; 9255 excess_modifier: 9256 RExC_parse++; 9257 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { 9258 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); 9259 } 9260 else if (has_charset_modifier == *(RExC_parse - 1)) { 9261 vFAIL2("Regexp modifier \"%c\" may not appear twice", 9262 *(RExC_parse - 1)); 9263 } 9264 else { 9265 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); 9266 } 9267 /*NOTREACHED*/ 9268 neg_modifier: 9269 RExC_parse++; 9270 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", 9271 *(RExC_parse - 1)); 9272 /*NOTREACHED*/ 9273 case ONCE_PAT_MOD: /* 'o' */ 9274 case GLOBAL_PAT_MOD: /* 'g' */ 9275 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 9276 const I32 wflagbit = *RExC_parse == 'o' 9277 ? WASTED_O 9278 : WASTED_G; 9279 if (! (wastedflags & wflagbit) ) { 9280 wastedflags |= wflagbit; 9281 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ 9282 vWARN5( 9283 RExC_parse + 1, 9284 "Useless (%s%c) - %suse /%c modifier", 9285 flagsp == &negflags ? "?-" : "?", 9286 *RExC_parse, 9287 flagsp == &negflags ? "don't " : "", 9288 *RExC_parse 9289 ); 9290 } 9291 } 9292 break; 9293 9294 case CONTINUE_PAT_MOD: /* 'c' */ 9295 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 9296 if (! (wastedflags & WASTED_C) ) { 9297 wastedflags |= WASTED_GC; 9298 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ 9299 vWARN3( 9300 RExC_parse + 1, 9301 "Useless (%sc) - %suse /gc modifier", 9302 flagsp == &negflags ? "?-" : "?", 9303 flagsp == &negflags ? "don't " : "" 9304 ); 9305 } 9306 } 9307 break; 9308 case KEEPCOPY_PAT_MOD: /* 'p' */ 9309 if (flagsp == &negflags) { 9310 if (SIZE_ONLY) 9311 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); 9312 } else { 9313 *flagsp |= RXf_PMf_KEEPCOPY; 9314 } 9315 break; 9316 case '-': 9317 /* A flag is a default iff it is following a minus, so 9318 * if there is a minus, it means will be trying to 9319 * re-specify a default which is an error */ 9320 if (has_use_defaults || flagsp == &negflags) { 9321 goto fail_modifiers; 9322 } 9323 flagsp = &negflags; 9324 wastedflags = 0; /* reset so (?g-c) warns twice */ 9325 break; 9326 case ':': 9327 case ')': 9328 RExC_flags |= posflags; 9329 RExC_flags &= ~negflags; 9330 set_regex_charset(&RExC_flags, cs); 9331 if (RExC_flags & RXf_PMf_FOLD) { 9332 RExC_contains_i = 1; 9333 } 9334 return; 9335 /*NOTREACHED*/ 9336 default: 9337 fail_modifiers: 9338 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 9339 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 9340 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", 9341 UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); 9342 /*NOTREACHED*/ 9343 } 9344 9345 ++RExC_parse; 9346 } 9347 } 9348 9349 /* 9350 - reg - regular expression, i.e. main body or parenthesized thing 9351 * 9352 * Caller must absorb opening parenthesis. 9353 * 9354 * Combining parenthesis handling with the base level of regular expression 9355 * is a trifle forced, but the need to tie the tails of the branches to what 9356 * follows makes it hard to avoid. 9357 */ 9358 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) 9359 #ifdef DEBUGGING 9360 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) 9361 #else 9362 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) 9363 #endif 9364 9365 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets 9366 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan 9367 needs to be restarted. 9368 Otherwise would only return NULL if regbranch() returns NULL, which 9369 cannot happen. */ 9370 STATIC regnode * 9371 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) 9372 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. 9373 * 2 is like 1, but indicates that nextchar() has been called to advance 9374 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and 9375 * this flag alerts us to the need to check for that */ 9376 { 9377 dVAR; 9378 regnode *ret; /* Will be the head of the group. */ 9379 regnode *br; 9380 regnode *lastbr; 9381 regnode *ender = NULL; 9382 I32 parno = 0; 9383 I32 flags; 9384 U32 oregflags = RExC_flags; 9385 bool have_branch = 0; 9386 bool is_open = 0; 9387 I32 freeze_paren = 0; 9388 I32 after_freeze = 0; 9389 9390 char * parse_start = RExC_parse; /* MJD */ 9391 char * const oregcomp_parse = RExC_parse; 9392 9393 GET_RE_DEBUG_FLAGS_DECL; 9394 9395 PERL_ARGS_ASSERT_REG; 9396 DEBUG_PARSE("reg "); 9397 9398 *flagp = 0; /* Tentatively. */ 9399 9400 9401 /* Make an OPEN node, if parenthesized. */ 9402 if (paren) { 9403 9404 /* Under /x, space and comments can be gobbled up between the '(' and 9405 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such 9406 * intervening space, as the sequence is a token, and a token should be 9407 * indivisible */ 9408 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '('; 9409 9410 if ( *RExC_parse == '*') { /* (*VERB:ARG) */ 9411 char *start_verb = RExC_parse; 9412 STRLEN verb_len = 0; 9413 char *start_arg = NULL; 9414 unsigned char op = 0; 9415 int argok = 1; 9416 int internal_argval = 0; /* internal_argval is only useful if 9417 !argok */ 9418 9419 if (has_intervening_patws && SIZE_ONLY) { 9420 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated"); 9421 } 9422 while ( *RExC_parse && *RExC_parse != ')' ) { 9423 if ( *RExC_parse == ':' ) { 9424 start_arg = RExC_parse + 1; 9425 break; 9426 } 9427 RExC_parse++; 9428 } 9429 ++start_verb; 9430 verb_len = RExC_parse - start_verb; 9431 if ( start_arg ) { 9432 RExC_parse++; 9433 while ( *RExC_parse && *RExC_parse != ')' ) 9434 RExC_parse++; 9435 if ( *RExC_parse != ')' ) 9436 vFAIL("Unterminated verb pattern argument"); 9437 if ( RExC_parse == start_arg ) 9438 start_arg = NULL; 9439 } else { 9440 if ( *RExC_parse != ')' ) 9441 vFAIL("Unterminated verb pattern"); 9442 } 9443 9444 switch ( *start_verb ) { 9445 case 'A': /* (*ACCEPT) */ 9446 if ( memEQs(start_verb,verb_len,"ACCEPT") ) { 9447 op = ACCEPT; 9448 internal_argval = RExC_nestroot; 9449 } 9450 break; 9451 case 'C': /* (*COMMIT) */ 9452 if ( memEQs(start_verb,verb_len,"COMMIT") ) 9453 op = COMMIT; 9454 break; 9455 case 'F': /* (*FAIL) */ 9456 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { 9457 op = OPFAIL; 9458 argok = 0; 9459 } 9460 break; 9461 case ':': /* (*:NAME) */ 9462 case 'M': /* (*MARK:NAME) */ 9463 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { 9464 op = MARKPOINT; 9465 argok = -1; 9466 } 9467 break; 9468 case 'P': /* (*PRUNE) */ 9469 if ( memEQs(start_verb,verb_len,"PRUNE") ) 9470 op = PRUNE; 9471 break; 9472 case 'S': /* (*SKIP) */ 9473 if ( memEQs(start_verb,verb_len,"SKIP") ) 9474 op = SKIP; 9475 break; 9476 case 'T': /* (*THEN) */ 9477 /* [19:06] <TimToady> :: is then */ 9478 if ( memEQs(start_verb,verb_len,"THEN") ) { 9479 op = CUTGROUP; 9480 RExC_seen |= REG_CUTGROUP_SEEN; 9481 } 9482 break; 9483 } 9484 if ( ! op ) { 9485 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 9486 vFAIL2utf8f( 9487 "Unknown verb pattern '%"UTF8f"'", 9488 UTF8fARG(UTF, verb_len, start_verb)); 9489 } 9490 if ( argok ) { 9491 if ( start_arg && internal_argval ) { 9492 vFAIL3("Verb pattern '%.*s' may not have an argument", 9493 verb_len, start_verb); 9494 } else if ( argok < 0 && !start_arg ) { 9495 vFAIL3("Verb pattern '%.*s' has a mandatory argument", 9496 verb_len, start_verb); 9497 } else { 9498 ret = reganode(pRExC_state, op, internal_argval); 9499 if ( ! internal_argval && ! SIZE_ONLY ) { 9500 if (start_arg) { 9501 SV *sv = newSVpvn( start_arg, 9502 RExC_parse - start_arg); 9503 ARG(ret) = add_data( pRExC_state, 9504 STR_WITH_LEN("S")); 9505 RExC_rxi->data->data[ARG(ret)]=(void*)sv; 9506 ret->flags = 0; 9507 } else { 9508 ret->flags = 1; 9509 } 9510 } 9511 } 9512 if (!internal_argval) 9513 RExC_seen |= REG_VERBARG_SEEN; 9514 } else if ( start_arg ) { 9515 vFAIL3("Verb pattern '%.*s' may not have an argument", 9516 verb_len, start_verb); 9517 } else { 9518 ret = reg_node(pRExC_state, op); 9519 } 9520 nextchar(pRExC_state); 9521 return ret; 9522 } 9523 else if (*RExC_parse == '?') { /* (?...) */ 9524 bool is_logical = 0; 9525 const char * const seqstart = RExC_parse; 9526 if (has_intervening_patws && SIZE_ONLY) { 9527 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated"); 9528 } 9529 9530 RExC_parse++; 9531 paren = *RExC_parse++; 9532 ret = NULL; /* For look-ahead/behind. */ 9533 switch (paren) { 9534 9535 case 'P': /* (?P...) variants for those used to PCRE/Python */ 9536 paren = *RExC_parse++; 9537 if ( paren == '<') /* (?P<...>) named capture */ 9538 goto named_capture; 9539 else if (paren == '>') { /* (?P>name) named recursion */ 9540 goto named_recursion; 9541 } 9542 else if (paren == '=') { /* (?P=...) named backref */ 9543 /* this pretty much dupes the code for \k<NAME> in 9544 * regatom(), if you change this make sure you change that 9545 * */ 9546 char* name_start = RExC_parse; 9547 U32 num = 0; 9548 SV *sv_dat = reg_scan_name(pRExC_state, 9549 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 9550 if (RExC_parse == name_start || *RExC_parse != ')') 9551 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ 9552 vFAIL2("Sequence %.3s... not terminated",parse_start); 9553 9554 if (!SIZE_ONLY) { 9555 num = add_data( pRExC_state, STR_WITH_LEN("S")); 9556 RExC_rxi->data->data[num]=(void*)sv_dat; 9557 SvREFCNT_inc_simple_void(sv_dat); 9558 } 9559 RExC_sawback = 1; 9560 ret = reganode(pRExC_state, 9561 ((! FOLD) 9562 ? NREF 9563 : (ASCII_FOLD_RESTRICTED) 9564 ? NREFFA 9565 : (AT_LEAST_UNI_SEMANTICS) 9566 ? NREFFU 9567 : (LOC) 9568 ? NREFFL 9569 : NREFF), 9570 num); 9571 *flagp |= HASWIDTH; 9572 9573 Set_Node_Offset(ret, parse_start+1); 9574 Set_Node_Cur_Length(ret, parse_start); 9575 9576 nextchar(pRExC_state); 9577 return ret; 9578 } 9579 RExC_parse++; 9580 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 9581 vFAIL3("Sequence (%.*s...) not recognized", 9582 RExC_parse-seqstart, seqstart); 9583 /*NOTREACHED*/ 9584 case '<': /* (?<...) */ 9585 if (*RExC_parse == '!') 9586 paren = ','; 9587 else if (*RExC_parse != '=') 9588 named_capture: 9589 { /* (?<...>) */ 9590 char *name_start; 9591 SV *svname; 9592 paren= '>'; 9593 case '\'': /* (?'...') */ 9594 name_start= RExC_parse; 9595 svname = reg_scan_name(pRExC_state, 9596 SIZE_ONLY /* reverse test from the others */ 9597 ? REG_RSN_RETURN_NAME 9598 : REG_RSN_RETURN_NULL); 9599 if (RExC_parse == name_start || *RExC_parse != paren) 9600 vFAIL2("Sequence (?%c... not terminated", 9601 paren=='>' ? '<' : paren); 9602 if (SIZE_ONLY) { 9603 HE *he_str; 9604 SV *sv_dat = NULL; 9605 if (!svname) /* shouldn't happen */ 9606 Perl_croak(aTHX_ 9607 "panic: reg_scan_name returned NULL"); 9608 if (!RExC_paren_names) { 9609 RExC_paren_names= newHV(); 9610 sv_2mortal(MUTABLE_SV(RExC_paren_names)); 9611 #ifdef DEBUGGING 9612 RExC_paren_name_list= newAV(); 9613 sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); 9614 #endif 9615 } 9616 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); 9617 if ( he_str ) 9618 sv_dat = HeVAL(he_str); 9619 if ( ! sv_dat ) { 9620 /* croak baby croak */ 9621 Perl_croak(aTHX_ 9622 "panic: paren_name hash element allocation failed"); 9623 } else if ( SvPOK(sv_dat) ) { 9624 /* (?|...) can mean we have dupes so scan to check 9625 its already been stored. Maybe a flag indicating 9626 we are inside such a construct would be useful, 9627 but the arrays are likely to be quite small, so 9628 for now we punt -- dmq */ 9629 IV count = SvIV(sv_dat); 9630 I32 *pv = (I32*)SvPVX(sv_dat); 9631 IV i; 9632 for ( i = 0 ; i < count ; i++ ) { 9633 if ( pv[i] == RExC_npar ) { 9634 count = 0; 9635 break; 9636 } 9637 } 9638 if ( count ) { 9639 pv = (I32*)SvGROW(sv_dat, 9640 SvCUR(sv_dat) + sizeof(I32)+1); 9641 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); 9642 pv[count] = RExC_npar; 9643 SvIV_set(sv_dat, SvIVX(sv_dat) + 1); 9644 } 9645 } else { 9646 (void)SvUPGRADE(sv_dat,SVt_PVNV); 9647 sv_setpvn(sv_dat, (char *)&(RExC_npar), 9648 sizeof(I32)); 9649 SvIOK_on(sv_dat); 9650 SvIV_set(sv_dat, 1); 9651 } 9652 #ifdef DEBUGGING 9653 /* Yes this does cause a memory leak in debugging Perls 9654 * */ 9655 if (!av_store(RExC_paren_name_list, 9656 RExC_npar, SvREFCNT_inc(svname))) 9657 SvREFCNT_dec_NN(svname); 9658 #endif 9659 9660 /*sv_dump(sv_dat);*/ 9661 } 9662 nextchar(pRExC_state); 9663 paren = 1; 9664 goto capturing_parens; 9665 } 9666 RExC_seen |= REG_LOOKBEHIND_SEEN; 9667 RExC_in_lookbehind++; 9668 RExC_parse++; 9669 case '=': /* (?=...) */ 9670 RExC_seen_zerolen++; 9671 break; 9672 case '!': /* (?!...) */ 9673 RExC_seen_zerolen++; 9674 if (*RExC_parse == ')') { 9675 ret=reg_node(pRExC_state, OPFAIL); 9676 nextchar(pRExC_state); 9677 return ret; 9678 } 9679 break; 9680 case '|': /* (?|...) */ 9681 /* branch reset, behave like a (?:...) except that 9682 buffers in alternations share the same numbers */ 9683 paren = ':'; 9684 after_freeze = freeze_paren = RExC_npar; 9685 break; 9686 case ':': /* (?:...) */ 9687 case '>': /* (?>...) */ 9688 break; 9689 case '$': /* (?$...) */ 9690 case '@': /* (?@...) */ 9691 vFAIL2("Sequence (?%c...) not implemented", (int)paren); 9692 break; 9693 case '#': /* (?#...) */ 9694 /* XXX As soon as we disallow separating the '?' and '*' (by 9695 * spaces or (?#...) comment), it is believed that this case 9696 * will be unreachable and can be removed. See 9697 * [perl #117327] */ 9698 while (*RExC_parse && *RExC_parse != ')') 9699 RExC_parse++; 9700 if (*RExC_parse != ')') 9701 FAIL("Sequence (?#... not terminated"); 9702 nextchar(pRExC_state); 9703 *flagp = TRYAGAIN; 9704 return NULL; 9705 case '0' : /* (?0) */ 9706 case 'R' : /* (?R) */ 9707 if (*RExC_parse != ')') 9708 FAIL("Sequence (?R) not terminated"); 9709 ret = reg_node(pRExC_state, GOSTART); 9710 RExC_seen |= REG_GOSTART_SEEN; 9711 *flagp |= POSTPONED; 9712 nextchar(pRExC_state); 9713 return ret; 9714 /*notreached*/ 9715 { /* named and numeric backreferences */ 9716 I32 num; 9717 case '&': /* (?&NAME) */ 9718 parse_start = RExC_parse - 1; 9719 named_recursion: 9720 { 9721 SV *sv_dat = reg_scan_name(pRExC_state, 9722 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 9723 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; 9724 } 9725 if (RExC_parse == RExC_end || *RExC_parse != ')') 9726 vFAIL("Sequence (?&... not terminated"); 9727 goto gen_recurse_regop; 9728 assert(0); /* NOT REACHED */ 9729 case '+': 9730 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { 9731 RExC_parse++; 9732 vFAIL("Illegal pattern"); 9733 } 9734 goto parse_recursion; 9735 /* NOT REACHED*/ 9736 case '-': /* (?-1) */ 9737 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { 9738 RExC_parse--; /* rewind to let it be handled later */ 9739 goto parse_flags; 9740 } 9741 /*FALLTHROUGH */ 9742 case '1': case '2': case '3': case '4': /* (?1) */ 9743 case '5': case '6': case '7': case '8': case '9': 9744 RExC_parse--; 9745 parse_recursion: 9746 num = atoi(RExC_parse); 9747 parse_start = RExC_parse - 1; /* MJD */ 9748 if (*RExC_parse == '-') 9749 RExC_parse++; 9750 while (isDIGIT(*RExC_parse)) 9751 RExC_parse++; 9752 if (*RExC_parse!=')') 9753 vFAIL("Expecting close bracket"); 9754 9755 gen_recurse_regop: 9756 if ( paren == '-' ) { 9757 /* 9758 Diagram of capture buffer numbering. 9759 Top line is the normal capture buffer numbers 9760 Bottom line is the negative indexing as from 9761 the X (the (?-2)) 9762 9763 + 1 2 3 4 5 X 6 7 9764 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ 9765 - 5 4 3 2 1 X x x 9766 9767 */ 9768 num = RExC_npar + num; 9769 if (num < 1) { 9770 RExC_parse++; 9771 vFAIL("Reference to nonexistent group"); 9772 } 9773 } else if ( paren == '+' ) { 9774 num = RExC_npar + num - 1; 9775 } 9776 9777 ret = reganode(pRExC_state, GOSUB, num); 9778 if (!SIZE_ONLY) { 9779 if (num > (I32)RExC_rx->nparens) { 9780 RExC_parse++; 9781 vFAIL("Reference to nonexistent group"); 9782 } 9783 ARG2L_SET( ret, RExC_recurse_count++); 9784 RExC_emit++; 9785 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, 9786 "Recurse #%"UVuf" to %"IVdf"\n", 9787 (UV)ARG(ret), (IV)ARG2L(ret))); 9788 } else { 9789 RExC_size++; 9790 } 9791 RExC_seen |= REG_RECURSE_SEEN; 9792 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ 9793 Set_Node_Offset(ret, parse_start); /* MJD */ 9794 9795 *flagp |= POSTPONED; 9796 nextchar(pRExC_state); 9797 return ret; 9798 } /* named and numeric backreferences */ 9799 assert(0); /* NOT REACHED */ 9800 9801 case '?': /* (??...) */ 9802 is_logical = 1; 9803 if (*RExC_parse != '{') { 9804 RExC_parse++; 9805 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 9806 vFAIL2utf8f( 9807 "Sequence (%"UTF8f"...) not recognized", 9808 UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); 9809 /*NOTREACHED*/ 9810 } 9811 *flagp |= POSTPONED; 9812 paren = *RExC_parse++; 9813 /* FALL THROUGH */ 9814 case '{': /* (?{...}) */ 9815 { 9816 U32 n = 0; 9817 struct reg_code_block *cb; 9818 9819 RExC_seen_zerolen++; 9820 9821 if ( !pRExC_state->num_code_blocks 9822 || pRExC_state->code_index >= pRExC_state->num_code_blocks 9823 || pRExC_state->code_blocks[pRExC_state->code_index].start 9824 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) 9825 - RExC_start) 9826 ) { 9827 if (RExC_pm_flags & PMf_USE_RE_EVAL) 9828 FAIL("panic: Sequence (?{...}): no code block found\n"); 9829 FAIL("Eval-group not allowed at runtime, use re 'eval'"); 9830 } 9831 /* this is a pre-compiled code block (?{...}) */ 9832 cb = &pRExC_state->code_blocks[pRExC_state->code_index]; 9833 RExC_parse = RExC_start + cb->end; 9834 if (!SIZE_ONLY) { 9835 OP *o = cb->block; 9836 if (cb->src_regex) { 9837 n = add_data(pRExC_state, STR_WITH_LEN("rl")); 9838 RExC_rxi->data->data[n] = 9839 (void*)SvREFCNT_inc((SV*)cb->src_regex); 9840 RExC_rxi->data->data[n+1] = (void*)o; 9841 } 9842 else { 9843 n = add_data(pRExC_state, 9844 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); 9845 RExC_rxi->data->data[n] = (void*)o; 9846 } 9847 } 9848 pRExC_state->code_index++; 9849 nextchar(pRExC_state); 9850 9851 if (is_logical) { 9852 regnode *eval; 9853 ret = reg_node(pRExC_state, LOGICAL); 9854 eval = reganode(pRExC_state, EVAL, n); 9855 if (!SIZE_ONLY) { 9856 ret->flags = 2; 9857 /* for later propagation into (??{}) return value */ 9858 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME); 9859 } 9860 REGTAIL(pRExC_state, ret, eval); 9861 /* deal with the length of this later - MJD */ 9862 return ret; 9863 } 9864 ret = reganode(pRExC_state, EVAL, n); 9865 Set_Node_Length(ret, RExC_parse - parse_start + 1); 9866 Set_Node_Offset(ret, parse_start); 9867 return ret; 9868 } 9869 case '(': /* (?(?{...})...) and (?(?=...)...) */ 9870 { 9871 int is_define= 0; 9872 if (RExC_parse[0] == '?') { /* (?(?...)) */ 9873 if (RExC_parse[1] == '=' || RExC_parse[1] == '!' 9874 || RExC_parse[1] == '<' 9875 || RExC_parse[1] == '{') { /* Lookahead or eval. */ 9876 I32 flag; 9877 regnode *tail; 9878 9879 ret = reg_node(pRExC_state, LOGICAL); 9880 if (!SIZE_ONLY) 9881 ret->flags = 1; 9882 9883 tail = reg(pRExC_state, 1, &flag, depth+1); 9884 if (flag & RESTART_UTF8) { 9885 *flagp = RESTART_UTF8; 9886 return NULL; 9887 } 9888 REGTAIL(pRExC_state, ret, tail); 9889 goto insert_if; 9890 } 9891 } 9892 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */ 9893 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ 9894 { 9895 char ch = RExC_parse[0] == '<' ? '>' : '\''; 9896 char *name_start= RExC_parse++; 9897 U32 num = 0; 9898 SV *sv_dat=reg_scan_name(pRExC_state, 9899 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 9900 if (RExC_parse == name_start || *RExC_parse != ch) 9901 vFAIL2("Sequence (?(%c... not terminated", 9902 (ch == '>' ? '<' : ch)); 9903 RExC_parse++; 9904 if (!SIZE_ONLY) { 9905 num = add_data( pRExC_state, STR_WITH_LEN("S")); 9906 RExC_rxi->data->data[num]=(void*)sv_dat; 9907 SvREFCNT_inc_simple_void(sv_dat); 9908 } 9909 ret = reganode(pRExC_state,NGROUPP,num); 9910 goto insert_if_check_paren; 9911 } 9912 else if (RExC_parse[0] == 'D' && 9913 RExC_parse[1] == 'E' && 9914 RExC_parse[2] == 'F' && 9915 RExC_parse[3] == 'I' && 9916 RExC_parse[4] == 'N' && 9917 RExC_parse[5] == 'E') 9918 { 9919 ret = reganode(pRExC_state,DEFINEP,0); 9920 RExC_parse +=6 ; 9921 is_define = 1; 9922 goto insert_if_check_paren; 9923 } 9924 else if (RExC_parse[0] == 'R') { 9925 RExC_parse++; 9926 parno = 0; 9927 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { 9928 parno = atoi(RExC_parse++); 9929 while (isDIGIT(*RExC_parse)) 9930 RExC_parse++; 9931 } else if (RExC_parse[0] == '&') { 9932 SV *sv_dat; 9933 RExC_parse++; 9934 sv_dat = reg_scan_name(pRExC_state, 9935 SIZE_ONLY 9936 ? REG_RSN_RETURN_NULL 9937 : REG_RSN_RETURN_DATA); 9938 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; 9939 } 9940 ret = reganode(pRExC_state,INSUBP,parno); 9941 goto insert_if_check_paren; 9942 } 9943 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { 9944 /* (?(1)...) */ 9945 char c; 9946 char *tmp; 9947 parno = atoi(RExC_parse++); 9948 9949 while (isDIGIT(*RExC_parse)) 9950 RExC_parse++; 9951 ret = reganode(pRExC_state, GROUPP, parno); 9952 9953 insert_if_check_paren: 9954 if (*(tmp = nextchar(pRExC_state)) != ')') { 9955 /* nextchar also skips comments, so undo its work 9956 * and skip over the the next character. 9957 */ 9958 RExC_parse = tmp; 9959 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 9960 vFAIL("Switch condition not recognized"); 9961 } 9962 insert_if: 9963 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); 9964 br = regbranch(pRExC_state, &flags, 1,depth+1); 9965 if (br == NULL) { 9966 if (flags & RESTART_UTF8) { 9967 *flagp = RESTART_UTF8; 9968 return NULL; 9969 } 9970 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", 9971 (UV) flags); 9972 } else 9973 REGTAIL(pRExC_state, br, reganode(pRExC_state, 9974 LONGJMP, 0)); 9975 c = *nextchar(pRExC_state); 9976 if (flags&HASWIDTH) 9977 *flagp |= HASWIDTH; 9978 if (c == '|') { 9979 if (is_define) 9980 vFAIL("(?(DEFINE)....) does not allow branches"); 9981 9982 /* Fake one for optimizer. */ 9983 lastbr = reganode(pRExC_state, IFTHEN, 0); 9984 9985 if (!regbranch(pRExC_state, &flags, 1,depth+1)) { 9986 if (flags & RESTART_UTF8) { 9987 *flagp = RESTART_UTF8; 9988 return NULL; 9989 } 9990 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", 9991 (UV) flags); 9992 } 9993 REGTAIL(pRExC_state, ret, lastbr); 9994 if (flags&HASWIDTH) 9995 *flagp |= HASWIDTH; 9996 c = *nextchar(pRExC_state); 9997 } 9998 else 9999 lastbr = NULL; 10000 if (c != ')') 10001 vFAIL("Switch (?(condition)... contains too many branches"); 10002 ender = reg_node(pRExC_state, TAIL); 10003 REGTAIL(pRExC_state, br, ender); 10004 if (lastbr) { 10005 REGTAIL(pRExC_state, lastbr, ender); 10006 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); 10007 } 10008 else 10009 REGTAIL(pRExC_state, ret, ender); 10010 RExC_size++; /* XXX WHY do we need this?!! 10011 For large programs it seems to be required 10012 but I can't figure out why. -- dmq*/ 10013 return ret; 10014 } 10015 else { 10016 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 10017 vFAIL("Unknown switch condition (?(...))"); 10018 } 10019 } 10020 case '[': /* (?[ ... ]) */ 10021 return handle_regex_sets(pRExC_state, NULL, flagp, depth, 10022 oregcomp_parse); 10023 case 0: 10024 RExC_parse--; /* for vFAIL to print correctly */ 10025 vFAIL("Sequence (? incomplete"); 10026 break; 10027 default: /* e.g., (?i) */ 10028 --RExC_parse; 10029 parse_flags: 10030 parse_lparen_question_flags(pRExC_state); 10031 if (UCHARAT(RExC_parse) != ':') { 10032 nextchar(pRExC_state); 10033 *flagp = TRYAGAIN; 10034 return NULL; 10035 } 10036 paren = ':'; 10037 nextchar(pRExC_state); 10038 ret = NULL; 10039 goto parse_rest; 10040 } /* end switch */ 10041 } 10042 else { /* (...) */ 10043 capturing_parens: 10044 parno = RExC_npar; 10045 RExC_npar++; 10046 10047 ret = reganode(pRExC_state, OPEN, parno); 10048 if (!SIZE_ONLY ){ 10049 if (!RExC_nestroot) 10050 RExC_nestroot = parno; 10051 if (RExC_seen & REG_RECURSE_SEEN 10052 && !RExC_open_parens[parno-1]) 10053 { 10054 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, 10055 "Setting open paren #%"IVdf" to %d\n", 10056 (IV)parno, REG_NODE_NUM(ret))); 10057 RExC_open_parens[parno-1]= ret; 10058 } 10059 } 10060 Set_Node_Length(ret, 1); /* MJD */ 10061 Set_Node_Offset(ret, RExC_parse); /* MJD */ 10062 is_open = 1; 10063 } 10064 } 10065 else /* ! paren */ 10066 ret = NULL; 10067 10068 parse_rest: 10069 /* Pick up the branches, linking them together. */ 10070 parse_start = RExC_parse; /* MJD */ 10071 br = regbranch(pRExC_state, &flags, 1,depth+1); 10072 10073 /* branch_len = (paren != 0); */ 10074 10075 if (br == NULL) { 10076 if (flags & RESTART_UTF8) { 10077 *flagp = RESTART_UTF8; 10078 return NULL; 10079 } 10080 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); 10081 } 10082 if (*RExC_parse == '|') { 10083 if (!SIZE_ONLY && RExC_extralen) { 10084 reginsert(pRExC_state, BRANCHJ, br, depth+1); 10085 } 10086 else { /* MJD */ 10087 reginsert(pRExC_state, BRANCH, br, depth+1); 10088 Set_Node_Length(br, paren != 0); 10089 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); 10090 } 10091 have_branch = 1; 10092 if (SIZE_ONLY) 10093 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ 10094 } 10095 else if (paren == ':') { 10096 *flagp |= flags&SIMPLE; 10097 } 10098 if (is_open) { /* Starts with OPEN. */ 10099 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */ 10100 } 10101 else if (paren != '?') /* Not Conditional */ 10102 ret = br; 10103 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); 10104 lastbr = br; 10105 while (*RExC_parse == '|') { 10106 if (!SIZE_ONLY && RExC_extralen) { 10107 ender = reganode(pRExC_state, LONGJMP,0); 10108 10109 /* Append to the previous. */ 10110 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); 10111 } 10112 if (SIZE_ONLY) 10113 RExC_extralen += 2; /* Account for LONGJMP. */ 10114 nextchar(pRExC_state); 10115 if (freeze_paren) { 10116 if (RExC_npar > after_freeze) 10117 after_freeze = RExC_npar; 10118 RExC_npar = freeze_paren; 10119 } 10120 br = regbranch(pRExC_state, &flags, 0, depth+1); 10121 10122 if (br == NULL) { 10123 if (flags & RESTART_UTF8) { 10124 *flagp = RESTART_UTF8; 10125 return NULL; 10126 } 10127 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); 10128 } 10129 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ 10130 lastbr = br; 10131 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); 10132 } 10133 10134 if (have_branch || paren != ':') { 10135 /* Make a closing node, and hook it on the end. */ 10136 switch (paren) { 10137 case ':': 10138 ender = reg_node(pRExC_state, TAIL); 10139 break; 10140 case 1: case 2: 10141 ender = reganode(pRExC_state, CLOSE, parno); 10142 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { 10143 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, 10144 "Setting close paren #%"IVdf" to %d\n", 10145 (IV)parno, REG_NODE_NUM(ender))); 10146 RExC_close_parens[parno-1]= ender; 10147 if (RExC_nestroot == parno) 10148 RExC_nestroot = 0; 10149 } 10150 Set_Node_Offset(ender,RExC_parse+1); /* MJD */ 10151 Set_Node_Length(ender,1); /* MJD */ 10152 break; 10153 case '<': 10154 case ',': 10155 case '=': 10156 case '!': 10157 *flagp &= ~HASWIDTH; 10158 /* FALL THROUGH */ 10159 case '>': 10160 ender = reg_node(pRExC_state, SUCCEED); 10161 break; 10162 case 0: 10163 ender = reg_node(pRExC_state, END); 10164 if (!SIZE_ONLY) { 10165 assert(!RExC_opend); /* there can only be one! */ 10166 RExC_opend = ender; 10167 } 10168 break; 10169 } 10170 DEBUG_PARSE_r(if (!SIZE_ONLY) { 10171 SV * const mysv_val1=sv_newmortal(); 10172 SV * const mysv_val2=sv_newmortal(); 10173 DEBUG_PARSE_MSG("lsbr"); 10174 regprop(RExC_rx, mysv_val1, lastbr, NULL); 10175 regprop(RExC_rx, mysv_val2, ender, NULL); 10176 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", 10177 SvPV_nolen_const(mysv_val1), 10178 (IV)REG_NODE_NUM(lastbr), 10179 SvPV_nolen_const(mysv_val2), 10180 (IV)REG_NODE_NUM(ender), 10181 (IV)(ender - lastbr) 10182 ); 10183 }); 10184 REGTAIL(pRExC_state, lastbr, ender); 10185 10186 if (have_branch && !SIZE_ONLY) { 10187 char is_nothing= 1; 10188 if (depth==1) 10189 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; 10190 10191 /* Hook the tails of the branches to the closing node. */ 10192 for (br = ret; br; br = regnext(br)) { 10193 const U8 op = PL_regkind[OP(br)]; 10194 if (op == BRANCH) { 10195 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); 10196 if ( OP(NEXTOPER(br)) != NOTHING 10197 || regnext(NEXTOPER(br)) != ender) 10198 is_nothing= 0; 10199 } 10200 else if (op == BRANCHJ) { 10201 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); 10202 /* for now we always disable this optimisation * / 10203 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING 10204 || regnext(NEXTOPER(NEXTOPER(br))) != ender) 10205 */ 10206 is_nothing= 0; 10207 } 10208 } 10209 if (is_nothing) { 10210 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret; 10211 DEBUG_PARSE_r(if (!SIZE_ONLY) { 10212 SV * const mysv_val1=sv_newmortal(); 10213 SV * const mysv_val2=sv_newmortal(); 10214 DEBUG_PARSE_MSG("NADA"); 10215 regprop(RExC_rx, mysv_val1, ret, NULL); 10216 regprop(RExC_rx, mysv_val2, ender, NULL); 10217 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", 10218 SvPV_nolen_const(mysv_val1), 10219 (IV)REG_NODE_NUM(ret), 10220 SvPV_nolen_const(mysv_val2), 10221 (IV)REG_NODE_NUM(ender), 10222 (IV)(ender - ret) 10223 ); 10224 }); 10225 OP(br)= NOTHING; 10226 if (OP(ender) == TAIL) { 10227 NEXT_OFF(br)= 0; 10228 RExC_emit= br + 1; 10229 } else { 10230 regnode *opt; 10231 for ( opt= br + 1; opt < ender ; opt++ ) 10232 OP(opt)= OPTIMIZED; 10233 NEXT_OFF(br)= ender - br; 10234 } 10235 } 10236 } 10237 } 10238 10239 { 10240 const char *p; 10241 static const char parens[] = "=!<,>"; 10242 10243 if (paren && (p = strchr(parens, paren))) { 10244 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; 10245 int flag = (p - parens) > 1; 10246 10247 if (paren == '>') 10248 node = SUSPEND, flag = 0; 10249 reginsert(pRExC_state, node,ret, depth+1); 10250 Set_Node_Cur_Length(ret, parse_start); 10251 Set_Node_Offset(ret, parse_start + 1); 10252 ret->flags = flag; 10253 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); 10254 } 10255 } 10256 10257 /* Check for proper termination. */ 10258 if (paren) { 10259 /* restore original flags, but keep (?p) */ 10260 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); 10261 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { 10262 RExC_parse = oregcomp_parse; 10263 vFAIL("Unmatched ("); 10264 } 10265 } 10266 else if (!paren && RExC_parse < RExC_end) { 10267 if (*RExC_parse == ')') { 10268 RExC_parse++; 10269 vFAIL("Unmatched )"); 10270 } 10271 else 10272 FAIL("Junk on end of regexp"); /* "Can't happen". */ 10273 assert(0); /* NOTREACHED */ 10274 } 10275 10276 if (RExC_in_lookbehind) { 10277 RExC_in_lookbehind--; 10278 } 10279 if (after_freeze > RExC_npar) 10280 RExC_npar = after_freeze; 10281 return(ret); 10282 } 10283 10284 /* 10285 - regbranch - one alternative of an | operator 10286 * 10287 * Implements the concatenation operator. 10288 * 10289 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be 10290 * restarted. 10291 */ 10292 STATIC regnode * 10293 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) 10294 { 10295 dVAR; 10296 regnode *ret; 10297 regnode *chain = NULL; 10298 regnode *latest; 10299 I32 flags = 0, c = 0; 10300 GET_RE_DEBUG_FLAGS_DECL; 10301 10302 PERL_ARGS_ASSERT_REGBRANCH; 10303 10304 DEBUG_PARSE("brnc"); 10305 10306 if (first) 10307 ret = NULL; 10308 else { 10309 if (!SIZE_ONLY && RExC_extralen) 10310 ret = reganode(pRExC_state, BRANCHJ,0); 10311 else { 10312 ret = reg_node(pRExC_state, BRANCH); 10313 Set_Node_Length(ret, 1); 10314 } 10315 } 10316 10317 if (!first && SIZE_ONLY) 10318 RExC_extralen += 1; /* BRANCHJ */ 10319 10320 *flagp = WORST; /* Tentatively. */ 10321 10322 RExC_parse--; 10323 nextchar(pRExC_state); 10324 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { 10325 flags &= ~TRYAGAIN; 10326 latest = regpiece(pRExC_state, &flags,depth+1); 10327 if (latest == NULL) { 10328 if (flags & TRYAGAIN) 10329 continue; 10330 if (flags & RESTART_UTF8) { 10331 *flagp = RESTART_UTF8; 10332 return NULL; 10333 } 10334 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); 10335 } 10336 else if (ret == NULL) 10337 ret = latest; 10338 *flagp |= flags&(HASWIDTH|POSTPONED); 10339 if (chain == NULL) /* First piece. */ 10340 *flagp |= flags&SPSTART; 10341 else { 10342 RExC_naughty++; 10343 REGTAIL(pRExC_state, chain, latest); 10344 } 10345 chain = latest; 10346 c++; 10347 } 10348 if (chain == NULL) { /* Loop ran zero times. */ 10349 chain = reg_node(pRExC_state, NOTHING); 10350 if (ret == NULL) 10351 ret = chain; 10352 } 10353 if (c == 1) { 10354 *flagp |= flags&SIMPLE; 10355 } 10356 10357 return ret; 10358 } 10359 10360 /* 10361 - regpiece - something followed by possible [*+?] 10362 * 10363 * Note that the branching code sequences used for ? and the general cases 10364 * of * and + are somewhat optimized: they use the same NOTHING node as 10365 * both the endmarker for their branch list and the body of the last branch. 10366 * It might seem that this node could be dispensed with entirely, but the 10367 * endmarker role is not redundant. 10368 * 10369 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with 10370 * TRYAGAIN. 10371 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be 10372 * restarted. 10373 */ 10374 STATIC regnode * 10375 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 10376 { 10377 dVAR; 10378 regnode *ret; 10379 char op; 10380 char *next; 10381 I32 flags; 10382 const char * const origparse = RExC_parse; 10383 I32 min; 10384 I32 max = REG_INFTY; 10385 #ifdef RE_TRACK_PATTERN_OFFSETS 10386 char *parse_start; 10387 #endif 10388 const char *maxpos = NULL; 10389 10390 /* Save the original in case we change the emitted regop to a FAIL. */ 10391 regnode * const orig_emit = RExC_emit; 10392 10393 GET_RE_DEBUG_FLAGS_DECL; 10394 10395 PERL_ARGS_ASSERT_REGPIECE; 10396 10397 DEBUG_PARSE("piec"); 10398 10399 ret = regatom(pRExC_state, &flags,depth+1); 10400 if (ret == NULL) { 10401 if (flags & (TRYAGAIN|RESTART_UTF8)) 10402 *flagp |= flags & (TRYAGAIN|RESTART_UTF8); 10403 else 10404 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); 10405 return(NULL); 10406 } 10407 10408 op = *RExC_parse; 10409 10410 if (op == '{' && regcurly(RExC_parse, FALSE)) { 10411 maxpos = NULL; 10412 #ifdef RE_TRACK_PATTERN_OFFSETS 10413 parse_start = RExC_parse; /* MJD */ 10414 #endif 10415 next = RExC_parse + 1; 10416 while (isDIGIT(*next) || *next == ',') { 10417 if (*next == ',') { 10418 if (maxpos) 10419 break; 10420 else 10421 maxpos = next; 10422 } 10423 next++; 10424 } 10425 if (*next == '}') { /* got one */ 10426 if (!maxpos) 10427 maxpos = next; 10428 RExC_parse++; 10429 min = atoi(RExC_parse); 10430 if (*maxpos == ',') 10431 maxpos++; 10432 else 10433 maxpos = RExC_parse; 10434 max = atoi(maxpos); 10435 if (!max && *maxpos != '0') 10436 max = REG_INFTY; /* meaning "infinity" */ 10437 else if (max >= REG_INFTY) 10438 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 10439 RExC_parse = next; 10440 nextchar(pRExC_state); 10441 if (max < min) { /* If can't match, warn and optimize to fail 10442 unconditionally */ 10443 if (SIZE_ONLY) { 10444 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); 10445 10446 /* We can't back off the size because we have to reserve 10447 * enough space for all the things we are about to throw 10448 * away, but we can shrink it by the ammount we are about 10449 * to re-use here */ 10450 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; 10451 } 10452 else { 10453 RExC_emit = orig_emit; 10454 } 10455 ret = reg_node(pRExC_state, OPFAIL); 10456 return ret; 10457 } 10458 else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?') 10459 { 10460 if (SIZE_ONLY) { 10461 ckWARN2reg(RExC_parse + 1, 10462 "Useless use of greediness modifier '%c'", 10463 *RExC_parse); 10464 } 10465 /* Absorb the modifier, so later code doesn't see nor use 10466 * it */ 10467 nextchar(pRExC_state); 10468 } 10469 10470 do_curly: 10471 if ((flags&SIMPLE)) { 10472 RExC_naughty += 2 + RExC_naughty / 2; 10473 reginsert(pRExC_state, CURLY, ret, depth+1); 10474 Set_Node_Offset(ret, parse_start+1); /* MJD */ 10475 Set_Node_Cur_Length(ret, parse_start); 10476 } 10477 else { 10478 regnode * const w = reg_node(pRExC_state, WHILEM); 10479 10480 w->flags = 0; 10481 REGTAIL(pRExC_state, ret, w); 10482 if (!SIZE_ONLY && RExC_extralen) { 10483 reginsert(pRExC_state, LONGJMP,ret, depth+1); 10484 reginsert(pRExC_state, NOTHING,ret, depth+1); 10485 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ 10486 } 10487 reginsert(pRExC_state, CURLYX,ret, depth+1); 10488 /* MJD hk */ 10489 Set_Node_Offset(ret, parse_start+1); 10490 Set_Node_Length(ret, 10491 op == '{' ? (RExC_parse - parse_start) : 1); 10492 10493 if (!SIZE_ONLY && RExC_extralen) 10494 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ 10495 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); 10496 if (SIZE_ONLY) 10497 RExC_whilem_seen++, RExC_extralen += 3; 10498 RExC_naughty += 4 + RExC_naughty; /* compound interest */ 10499 } 10500 ret->flags = 0; 10501 10502 if (min > 0) 10503 *flagp = WORST; 10504 if (max > 0) 10505 *flagp |= HASWIDTH; 10506 if (!SIZE_ONLY) { 10507 ARG1_SET(ret, (U16)min); 10508 ARG2_SET(ret, (U16)max); 10509 } 10510 if (max == REG_INFTY) 10511 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; 10512 10513 goto nest_check; 10514 } 10515 } 10516 10517 if (!ISMULT1(op)) { 10518 *flagp = flags; 10519 return(ret); 10520 } 10521 10522 #if 0 /* Now runtime fix should be reliable. */ 10523 10524 /* if this is reinstated, don't forget to put this back into perldiag: 10525 10526 =item Regexp *+ operand could be empty at {#} in regex m/%s/ 10527 10528 (F) The part of the regexp subject to either the * or + quantifier 10529 could match an empty string. The {#} shows in the regular 10530 expression about where the problem was discovered. 10531 10532 */ 10533 10534 if (!(flags&HASWIDTH) && op != '?') 10535 vFAIL("Regexp *+ operand could be empty"); 10536 #endif 10537 10538 #ifdef RE_TRACK_PATTERN_OFFSETS 10539 parse_start = RExC_parse; 10540 #endif 10541 nextchar(pRExC_state); 10542 10543 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); 10544 10545 if (op == '*' && (flags&SIMPLE)) { 10546 reginsert(pRExC_state, STAR, ret, depth+1); 10547 ret->flags = 0; 10548 RExC_naughty += 4; 10549 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; 10550 } 10551 else if (op == '*') { 10552 min = 0; 10553 goto do_curly; 10554 } 10555 else if (op == '+' && (flags&SIMPLE)) { 10556 reginsert(pRExC_state, PLUS, ret, depth+1); 10557 ret->flags = 0; 10558 RExC_naughty += 3; 10559 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; 10560 } 10561 else if (op == '+') { 10562 min = 1; 10563 goto do_curly; 10564 } 10565 else if (op == '?') { 10566 min = 0; max = 1; 10567 goto do_curly; 10568 } 10569 nest_check: 10570 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { 10571 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ 10572 ckWARN2reg(RExC_parse, 10573 "%"UTF8f" matches null string many times", 10574 UTF8fARG(UTF, (RExC_parse >= origparse 10575 ? RExC_parse - origparse 10576 : 0), 10577 origparse)); 10578 (void)ReREFCNT_inc(RExC_rx_sv); 10579 } 10580 10581 if (RExC_parse < RExC_end && *RExC_parse == '?') { 10582 nextchar(pRExC_state); 10583 reginsert(pRExC_state, MINMOD, ret, depth+1); 10584 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); 10585 } 10586 else 10587 if (RExC_parse < RExC_end && *RExC_parse == '+') { 10588 regnode *ender; 10589 nextchar(pRExC_state); 10590 ender = reg_node(pRExC_state, SUCCEED); 10591 REGTAIL(pRExC_state, ret, ender); 10592 reginsert(pRExC_state, SUSPEND, ret, depth+1); 10593 ret->flags = 0; 10594 ender = reg_node(pRExC_state, TAIL); 10595 REGTAIL(pRExC_state, ret, ender); 10596 } 10597 10598 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { 10599 RExC_parse++; 10600 vFAIL("Nested quantifiers"); 10601 } 10602 10603 return(ret); 10604 } 10605 10606 STATIC bool 10607 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, 10608 UV *valuep, I32 *flagp, U32 depth, bool in_char_class, 10609 const bool strict /* Apply stricter parsing rules? */ 10610 ) 10611 { 10612 10613 /* This is expected to be called by a parser routine that has recognized '\N' 10614 and needs to handle the rest. RExC_parse is expected to point at the first 10615 char following the N at the time of the call. On successful return, 10616 RExC_parse has been updated to point to just after the sequence identified 10617 by this routine, and <*flagp> has been updated. 10618 10619 The \N may be inside (indicated by the boolean <in_char_class>) or outside a 10620 character class. 10621 10622 \N may begin either a named sequence, or if outside a character class, mean 10623 to match a non-newline. For non single-quoted regexes, the tokenizer has 10624 attempted to decide which, and in the case of a named sequence, converted it 10625 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, 10626 where c1... are the characters in the sequence. For single-quoted regexes, 10627 the tokenizer passes the \N sequence through unchanged; this code will not 10628 attempt to determine this nor expand those, instead raising a syntax error. 10629 The net effect is that if the beginning of the passed-in pattern isn't '{U+' 10630 or there is no '}', it signals that this \N occurrence means to match a 10631 non-newline. 10632 10633 Only the \N{U+...} form should occur in a character class, for the same 10634 reason that '.' inside a character class means to just match a period: it 10635 just doesn't make sense. 10636 10637 The function raises an error (via vFAIL), and doesn't return for various 10638 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on 10639 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to 10640 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is 10641 only possible if node_p is non-NULL. 10642 10643 10644 If <valuep> is non-null, it means the caller can accept an input sequence 10645 consisting of a just a single code point; <*valuep> is set to that value 10646 if the input is such. 10647 10648 If <node_p> is non-null it signifies that the caller can accept any other 10649 legal sequence (i.e., one that isn't just a single code point). <*node_p> 10650 is set as follows: 10651 1) \N means not-a-NL: points to a newly created REG_ANY node; 10652 2) \N{}: points to a new NOTHING node; 10653 3) otherwise: points to a new EXACT node containing the resolved 10654 string. 10655 Note that FALSE is returned for single code point sequences if <valuep> is 10656 null. 10657 */ 10658 10659 char * endbrace; /* '}' following the name */ 10660 char* p; 10661 char *endchar; /* Points to '.' or '}' ending cur char in the input 10662 stream */ 10663 bool has_multiple_chars; /* true if the input stream contains a sequence of 10664 more than one character */ 10665 10666 GET_RE_DEBUG_FLAGS_DECL; 10667 10668 PERL_ARGS_ASSERT_GROK_BSLASH_N; 10669 10670 GET_RE_DEBUG_FLAGS; 10671 10672 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ 10673 10674 /* The [^\n] meaning of \N ignores spaces and comments under the /x 10675 * modifier. The other meaning does not, so use a temporary until we find 10676 * out which we are being called with */ 10677 p = (RExC_flags & RXf_PMf_EXTENDED) 10678 ? regwhite( pRExC_state, RExC_parse ) 10679 : RExC_parse; 10680 10681 /* Disambiguate between \N meaning a named character versus \N meaning 10682 * [^\n]. The former is assumed when it can't be the latter. */ 10683 if (*p != '{' || regcurly(p, FALSE)) { 10684 RExC_parse = p; 10685 if (! node_p) { 10686 /* no bare \N allowed in a charclass */ 10687 if (in_char_class) { 10688 vFAIL("\\N in a character class must be a named character: \\N{...}"); 10689 } 10690 return FALSE; 10691 } 10692 RExC_parse--; /* Need to back off so nextchar() doesn't skip the 10693 current char */ 10694 nextchar(pRExC_state); 10695 *node_p = reg_node(pRExC_state, REG_ANY); 10696 *flagp |= HASWIDTH|SIMPLE; 10697 RExC_naughty++; 10698 Set_Node_Length(*node_p, 1); /* MJD */ 10699 return TRUE; 10700 } 10701 10702 /* Here, we have decided it should be a named character or sequence */ 10703 10704 /* The test above made sure that the next real character is a '{', but 10705 * under the /x modifier, it could be separated by space (or a comment and 10706 * \n) and this is not allowed (for consistency with \x{...} and the 10707 * tokenizer handling of \N{NAME}). */ 10708 if (*RExC_parse != '{') { 10709 vFAIL("Missing braces on \\N{}"); 10710 } 10711 10712 RExC_parse++; /* Skip past the '{' */ 10713 10714 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ 10715 || ! (endbrace == RExC_parse /* nothing between the {} */ 10716 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below 10717 */ 10718 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) 10719 */ 10720 { 10721 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ 10722 vFAIL("\\N{NAME} must be resolved by the lexer"); 10723 } 10724 10725 if (endbrace == RExC_parse) { /* empty: \N{} */ 10726 bool ret = TRUE; 10727 if (node_p) { 10728 *node_p = reg_node(pRExC_state,NOTHING); 10729 } 10730 else if (in_char_class) { 10731 if (SIZE_ONLY && in_char_class) { 10732 if (strict) { 10733 RExC_parse++; /* Position after the "}" */ 10734 vFAIL("Zero length \\N{}"); 10735 } 10736 else { 10737 ckWARNreg(RExC_parse, 10738 "Ignoring zero length \\N{} in character class"); 10739 } 10740 } 10741 ret = FALSE; 10742 } 10743 else { 10744 return FALSE; 10745 } 10746 nextchar(pRExC_state); 10747 return ret; 10748 } 10749 10750 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ 10751 RExC_parse += 2; /* Skip past the 'U+' */ 10752 10753 endchar = RExC_parse + strcspn(RExC_parse, ".}"); 10754 10755 /* Code points are separated by dots. If none, there is only one code 10756 * point, and is terminated by the brace */ 10757 has_multiple_chars = (endchar < endbrace); 10758 10759 if (valuep && (! has_multiple_chars || in_char_class)) { 10760 /* We only pay attention to the first char of 10761 multichar strings being returned in char classes. I kinda wonder 10762 if this makes sense as it does change the behaviour 10763 from earlier versions, OTOH that behaviour was broken 10764 as well. XXX Solution is to recharacterize as 10765 [rest-of-class]|multi1|multi2... */ 10766 10767 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); 10768 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES 10769 | PERL_SCAN_DISALLOW_PREFIX 10770 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); 10771 10772 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); 10773 10774 /* The tokenizer should have guaranteed validity, but it's possible to 10775 * bypass it by using single quoting, so check */ 10776 if (length_of_hex == 0 10777 || length_of_hex != (STRLEN)(endchar - RExC_parse) ) 10778 { 10779 RExC_parse += length_of_hex; /* Includes all the valid */ 10780 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ 10781 ? UTF8SKIP(RExC_parse) 10782 : 1; 10783 /* Guard against malformed utf8 */ 10784 if (RExC_parse >= endchar) { 10785 RExC_parse = endchar; 10786 } 10787 vFAIL("Invalid hexadecimal number in \\N{U+...}"); 10788 } 10789 10790 if (in_char_class && has_multiple_chars) { 10791 if (strict) { 10792 RExC_parse = endbrace; 10793 vFAIL("\\N{} in character class restricted to one character"); 10794 } 10795 else { 10796 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); 10797 } 10798 } 10799 10800 RExC_parse = endbrace + 1; 10801 } 10802 else if (! node_p || ! has_multiple_chars) { 10803 10804 /* Here, the input is legal, but not according to the caller's 10805 * options. We fail without advancing the parse, so that the 10806 * caller can try again */ 10807 RExC_parse = p; 10808 return FALSE; 10809 } 10810 else { 10811 10812 /* What is done here is to convert this to a sub-pattern of the form 10813 * (?:\x{char1}\x{char2}...) 10814 * and then call reg recursively. That way, it retains its atomicness, 10815 * while not having to worry about special handling that some code 10816 * points may have. toke.c has converted the original Unicode values 10817 * to native, so that we can just pass on the hex values unchanged. We 10818 * do have to set a flag to keep recoding from happening in the 10819 * recursion */ 10820 10821 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); 10822 STRLEN len; 10823 char *orig_end = RExC_end; 10824 I32 flags; 10825 10826 while (RExC_parse < endbrace) { 10827 10828 /* Convert to notation the rest of the code understands */ 10829 sv_catpv(substitute_parse, "\\x{"); 10830 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); 10831 sv_catpv(substitute_parse, "}"); 10832 10833 /* Point to the beginning of the next character in the sequence. */ 10834 RExC_parse = endchar + 1; 10835 endchar = RExC_parse + strcspn(RExC_parse, ".}"); 10836 } 10837 sv_catpv(substitute_parse, ")"); 10838 10839 RExC_parse = SvPV(substitute_parse, len); 10840 10841 /* Don't allow empty number */ 10842 if (len < 8) { 10843 vFAIL("Invalid hexadecimal number in \\N{U+...}"); 10844 } 10845 RExC_end = RExC_parse + len; 10846 10847 /* The values are Unicode, and therefore not subject to recoding */ 10848 RExC_override_recoding = 1; 10849 10850 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { 10851 if (flags & RESTART_UTF8) { 10852 *flagp = RESTART_UTF8; 10853 return FALSE; 10854 } 10855 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", 10856 (UV) flags); 10857 } 10858 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); 10859 10860 RExC_parse = endbrace; 10861 RExC_end = orig_end; 10862 RExC_override_recoding = 0; 10863 10864 nextchar(pRExC_state); 10865 } 10866 10867 return TRUE; 10868 } 10869 10870 10871 /* 10872 * reg_recode 10873 * 10874 * It returns the code point in utf8 for the value in *encp. 10875 * value: a code value in the source encoding 10876 * encp: a pointer to an Encode object 10877 * 10878 * If the result from Encode is not a single character, 10879 * it returns U+FFFD (Replacement character) and sets *encp to NULL. 10880 */ 10881 STATIC UV 10882 S_reg_recode(pTHX_ const char value, SV **encp) 10883 { 10884 STRLEN numlen = 1; 10885 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); 10886 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); 10887 const STRLEN newlen = SvCUR(sv); 10888 UV uv = UNICODE_REPLACEMENT; 10889 10890 PERL_ARGS_ASSERT_REG_RECODE; 10891 10892 if (newlen) 10893 uv = SvUTF8(sv) 10894 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) 10895 : *(U8*)s; 10896 10897 if (!newlen || numlen != newlen) { 10898 uv = UNICODE_REPLACEMENT; 10899 *encp = NULL; 10900 } 10901 return uv; 10902 } 10903 10904 PERL_STATIC_INLINE U8 10905 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) 10906 { 10907 U8 op; 10908 10909 PERL_ARGS_ASSERT_COMPUTE_EXACTISH; 10910 10911 if (! FOLD) { 10912 return EXACT; 10913 } 10914 10915 op = get_regex_charset(RExC_flags); 10916 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { 10917 op--; /* /a is same as /u, and map /aa's offset to what /a's would have 10918 been, so there is no hole */ 10919 } 10920 10921 return op + EXACTF; 10922 } 10923 10924 PERL_STATIC_INLINE void 10925 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, 10926 regnode *node, I32* flagp, STRLEN len, UV code_point, 10927 bool downgradable) 10928 { 10929 /* This knows the details about sizing an EXACTish node, setting flags for 10930 * it (by setting <*flagp>, and potentially populating it with a single 10931 * character. 10932 * 10933 * If <len> (the length in bytes) is non-zero, this function assumes that 10934 * the node has already been populated, and just does the sizing. In this 10935 * case <code_point> should be the final code point that has already been 10936 * placed into the node. This value will be ignored except that under some 10937 * circumstances <*flagp> is set based on it. 10938 * 10939 * If <len> is zero, the function assumes that the node is to contain only 10940 * the single character given by <code_point> and calculates what <len> 10941 * should be. In pass 1, it sizes the node appropriately. In pass 2, it 10942 * additionally will populate the node's STRING with <code_point> or its 10943 * fold if folding. 10944 * 10945 * In both cases <*flagp> is appropriately set 10946 * 10947 * It knows that under FOLD, the Latin Sharp S and UTF characters above 10948 * 255, must be folded (the former only when the rules indicate it can 10949 * match 'ss') 10950 * 10951 * When it does the populating, it looks at the flag 'downgradable'. If 10952 * true with a node that folds, it checks if the single code point 10953 * participates in a fold, and if not downgrades the node to an EXACT. 10954 * This helps the optimizer */ 10955 10956 bool len_passed_in = cBOOL(len != 0); 10957 U8 character[UTF8_MAXBYTES_CASE+1]; 10958 10959 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; 10960 10961 /* Don't bother to check for downgrading in PASS1, as it doesn't make any 10962 * sizing difference, and is extra work that is thrown away */ 10963 if (downgradable && ! PASS2) { 10964 downgradable = FALSE; 10965 } 10966 10967 if (! len_passed_in) { 10968 if (UTF) { 10969 if (UNI_IS_INVARIANT(code_point)) { 10970 if (LOC || ! FOLD) { /* /l defers folding until runtime */ 10971 *character = (U8) code_point; 10972 } 10973 else { /* Here is /i and not /l (toFOLD() is defined on just 10974 ASCII, which isn't the same thing as INVARIANT on 10975 EBCDIC, but it works there, as the extra invariants 10976 fold to themselves) */ 10977 *character = toFOLD((U8) code_point); 10978 10979 /* We can downgrade to an EXACT node if this character 10980 * isn't a folding one. Note that this assumes that 10981 * nothing above Latin1 folds to some other invariant than 10982 * one of these alphabetics; otherwise we would also have 10983 * to check: 10984 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) 10985 * || ASCII_FOLD_RESTRICTED)) 10986 */ 10987 if (downgradable && PL_fold[code_point] == code_point) { 10988 OP(node) = EXACT; 10989 } 10990 } 10991 len = 1; 10992 } 10993 else if (FOLD && (! LOC 10994 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) 10995 { /* Folding, and ok to do so now */ 10996 UV folded = _to_uni_fold_flags( 10997 code_point, 10998 character, 10999 &len, 11000 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) 11001 ? FOLD_FLAGS_NOMIX_ASCII 11002 : 0)); 11003 if (downgradable 11004 && folded == code_point 11005 && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) 11006 { 11007 OP(node) = EXACT; 11008 } 11009 } 11010 else if (code_point <= MAX_UTF8_TWO_BYTE) { 11011 11012 /* Not folding this cp, and can output it directly */ 11013 *character = UTF8_TWO_BYTE_HI(code_point); 11014 *(character + 1) = UTF8_TWO_BYTE_LO(code_point); 11015 len = 2; 11016 } 11017 else { 11018 uvchr_to_utf8( character, code_point); 11019 len = UTF8SKIP(character); 11020 } 11021 } /* Else pattern isn't UTF8. */ 11022 else if (! FOLD) { 11023 *character = (U8) code_point; 11024 len = 1; 11025 } /* Else is folded non-UTF8 */ 11026 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { 11027 11028 /* We don't fold any non-UTF8 except possibly the Sharp s (see 11029 * comments at join_exact()); */ 11030 *character = (U8) code_point; 11031 len = 1; 11032 11033 /* Can turn into an EXACT node if we know the fold at compile time, 11034 * and it folds to itself and doesn't particpate in other folds */ 11035 if (downgradable 11036 && ! LOC 11037 && PL_fold_latin1[code_point] == code_point 11038 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) 11039 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) 11040 { 11041 OP(node) = EXACT; 11042 } 11043 } /* else is Sharp s. May need to fold it */ 11044 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { 11045 *character = 's'; 11046 *(character + 1) = 's'; 11047 len = 2; 11048 } 11049 else { 11050 *character = LATIN_SMALL_LETTER_SHARP_S; 11051 len = 1; 11052 } 11053 } 11054 11055 if (SIZE_ONLY) { 11056 RExC_size += STR_SZ(len); 11057 } 11058 else { 11059 RExC_emit += STR_SZ(len); 11060 STR_LEN(node) = len; 11061 if (! len_passed_in) { 11062 Copy((char *) character, STRING(node), len, char); 11063 } 11064 } 11065 11066 *flagp |= HASWIDTH; 11067 11068 /* A single character node is SIMPLE, except for the special-cased SHARP S 11069 * under /di. */ 11070 if ((len == 1 || (UTF && len == UNISKIP(code_point))) 11071 && (code_point != LATIN_SMALL_LETTER_SHARP_S 11072 || ! FOLD || ! DEPENDS_SEMANTICS)) 11073 { 11074 *flagp |= SIMPLE; 11075 } 11076 11077 /* The OP may not be well defined in PASS1 */ 11078 if (PASS2 && OP(node) == EXACTFL) { 11079 RExC_contains_locale = 1; 11080 } 11081 } 11082 11083 11084 /* return atoi(p), unless it's too big to sensibly be a backref, 11085 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ 11086 11087 static I32 11088 S_backref_value(char *p) 11089 { 11090 char *q = p; 11091 11092 for (;isDIGIT(*q); q++); /* calculate length of num */ 11093 if (q - p == 0 || q - p > 9) 11094 return I32_MAX; 11095 return atoi(p); 11096 } 11097 11098 11099 /* 11100 - regatom - the lowest level 11101 11102 Try to identify anything special at the start of the pattern. If there 11103 is, then handle it as required. This may involve generating a single regop, 11104 such as for an assertion; or it may involve recursing, such as to 11105 handle a () structure. 11106 11107 If the string doesn't start with something special then we gobble up 11108 as much literal text as we can. 11109 11110 Once we have been able to handle whatever type of thing started the 11111 sequence, we return. 11112 11113 Note: we have to be careful with escapes, as they can be both literal 11114 and special, and in the case of \10 and friends, context determines which. 11115 11116 A summary of the code structure is: 11117 11118 switch (first_byte) { 11119 cases for each special: 11120 handle this special; 11121 break; 11122 case '\\': 11123 switch (2nd byte) { 11124 cases for each unambiguous special: 11125 handle this special; 11126 break; 11127 cases for each ambigous special/literal: 11128 disambiguate; 11129 if (special) handle here 11130 else goto defchar; 11131 default: // unambiguously literal: 11132 goto defchar; 11133 } 11134 default: // is a literal char 11135 // FALL THROUGH 11136 defchar: 11137 create EXACTish node for literal; 11138 while (more input and node isn't full) { 11139 switch (input_byte) { 11140 cases for each special; 11141 make sure parse pointer is set so that the next call to 11142 regatom will see this special first 11143 goto loopdone; // EXACTish node terminated by prev. char 11144 default: 11145 append char to EXACTISH node; 11146 } 11147 get next input byte; 11148 } 11149 loopdone: 11150 } 11151 return the generated node; 11152 11153 Specifically there are two separate switches for handling 11154 escape sequences, with the one for handling literal escapes requiring 11155 a dummy entry for all of the special escapes that are actually handled 11156 by the other. 11157 11158 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with 11159 TRYAGAIN. 11160 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be 11161 restarted. 11162 Otherwise does not return NULL. 11163 */ 11164 11165 STATIC regnode * 11166 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 11167 { 11168 dVAR; 11169 regnode *ret = NULL; 11170 I32 flags = 0; 11171 char *parse_start = RExC_parse; 11172 U8 op; 11173 int invert = 0; 11174 11175 GET_RE_DEBUG_FLAGS_DECL; 11176 11177 *flagp = WORST; /* Tentatively. */ 11178 11179 DEBUG_PARSE("atom"); 11180 11181 PERL_ARGS_ASSERT_REGATOM; 11182 11183 tryagain: 11184 switch ((U8)*RExC_parse) { 11185 case '^': 11186 RExC_seen_zerolen++; 11187 nextchar(pRExC_state); 11188 if (RExC_flags & RXf_PMf_MULTILINE) 11189 ret = reg_node(pRExC_state, MBOL); 11190 else if (RExC_flags & RXf_PMf_SINGLELINE) 11191 ret = reg_node(pRExC_state, SBOL); 11192 else 11193 ret = reg_node(pRExC_state, BOL); 11194 Set_Node_Length(ret, 1); /* MJD */ 11195 break; 11196 case '$': 11197 nextchar(pRExC_state); 11198 if (*RExC_parse) 11199 RExC_seen_zerolen++; 11200 if (RExC_flags & RXf_PMf_MULTILINE) 11201 ret = reg_node(pRExC_state, MEOL); 11202 else if (RExC_flags & RXf_PMf_SINGLELINE) 11203 ret = reg_node(pRExC_state, SEOL); 11204 else 11205 ret = reg_node(pRExC_state, EOL); 11206 Set_Node_Length(ret, 1); /* MJD */ 11207 break; 11208 case '.': 11209 nextchar(pRExC_state); 11210 if (RExC_flags & RXf_PMf_SINGLELINE) 11211 ret = reg_node(pRExC_state, SANY); 11212 else 11213 ret = reg_node(pRExC_state, REG_ANY); 11214 *flagp |= HASWIDTH|SIMPLE; 11215 RExC_naughty++; 11216 Set_Node_Length(ret, 1); /* MJD */ 11217 break; 11218 case '[': 11219 { 11220 char * const oregcomp_parse = ++RExC_parse; 11221 ret = regclass(pRExC_state, flagp,depth+1, 11222 FALSE, /* means parse the whole char class */ 11223 TRUE, /* allow multi-char folds */ 11224 FALSE, /* don't silence non-portable warnings. */ 11225 NULL); 11226 if (*RExC_parse != ']') { 11227 RExC_parse = oregcomp_parse; 11228 vFAIL("Unmatched ["); 11229 } 11230 if (ret == NULL) { 11231 if (*flagp & RESTART_UTF8) 11232 return NULL; 11233 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", 11234 (UV) *flagp); 11235 } 11236 nextchar(pRExC_state); 11237 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ 11238 break; 11239 } 11240 case '(': 11241 nextchar(pRExC_state); 11242 ret = reg(pRExC_state, 2, &flags,depth+1); 11243 if (ret == NULL) { 11244 if (flags & TRYAGAIN) { 11245 if (RExC_parse == RExC_end) { 11246 /* Make parent create an empty node if needed. */ 11247 *flagp |= TRYAGAIN; 11248 return(NULL); 11249 } 11250 goto tryagain; 11251 } 11252 if (flags & RESTART_UTF8) { 11253 *flagp = RESTART_UTF8; 11254 return NULL; 11255 } 11256 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", 11257 (UV) flags); 11258 } 11259 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); 11260 break; 11261 case '|': 11262 case ')': 11263 if (flags & TRYAGAIN) { 11264 *flagp |= TRYAGAIN; 11265 return NULL; 11266 } 11267 vFAIL("Internal urp"); 11268 /* Supposed to be caught earlier. */ 11269 break; 11270 case '{': 11271 if (!regcurly(RExC_parse, FALSE)) { 11272 RExC_parse++; 11273 goto defchar; 11274 } 11275 /* FALL THROUGH */ 11276 case '?': 11277 case '+': 11278 case '*': 11279 RExC_parse++; 11280 vFAIL("Quantifier follows nothing"); 11281 break; 11282 case '\\': 11283 /* Special Escapes 11284 11285 This switch handles escape sequences that resolve to some kind 11286 of special regop and not to literal text. Escape sequnces that 11287 resolve to literal text are handled below in the switch marked 11288 "Literal Escapes". 11289 11290 Every entry in this switch *must* have a corresponding entry 11291 in the literal escape switch. However, the opposite is not 11292 required, as the default for this switch is to jump to the 11293 literal text handling code. 11294 */ 11295 switch ((U8)*++RExC_parse) { 11296 U8 arg; 11297 /* Special Escapes */ 11298 case 'A': 11299 RExC_seen_zerolen++; 11300 ret = reg_node(pRExC_state, SBOL); 11301 *flagp |= SIMPLE; 11302 goto finish_meta_pat; 11303 case 'G': 11304 ret = reg_node(pRExC_state, GPOS); 11305 RExC_seen |= REG_GPOS_SEEN; 11306 *flagp |= SIMPLE; 11307 goto finish_meta_pat; 11308 case 'K': 11309 RExC_seen_zerolen++; 11310 ret = reg_node(pRExC_state, KEEPS); 11311 *flagp |= SIMPLE; 11312 /* XXX:dmq : disabling in-place substitution seems to 11313 * be necessary here to avoid cases of memory corruption, as 11314 * with: C<$_="x" x 80; s/x\K/y/> -- rgs 11315 */ 11316 RExC_seen |= REG_LOOKBEHIND_SEEN; 11317 goto finish_meta_pat; 11318 case 'Z': 11319 ret = reg_node(pRExC_state, SEOL); 11320 *flagp |= SIMPLE; 11321 RExC_seen_zerolen++; /* Do not optimize RE away */ 11322 goto finish_meta_pat; 11323 case 'z': 11324 ret = reg_node(pRExC_state, EOS); 11325 *flagp |= SIMPLE; 11326 RExC_seen_zerolen++; /* Do not optimize RE away */ 11327 goto finish_meta_pat; 11328 case 'C': 11329 ret = reg_node(pRExC_state, CANY); 11330 RExC_seen |= REG_CANY_SEEN; 11331 *flagp |= HASWIDTH|SIMPLE; 11332 goto finish_meta_pat; 11333 case 'X': 11334 ret = reg_node(pRExC_state, CLUMP); 11335 *flagp |= HASWIDTH; 11336 goto finish_meta_pat; 11337 11338 case 'W': 11339 invert = 1; 11340 /* FALLTHROUGH */ 11341 case 'w': 11342 arg = ANYOF_WORDCHAR; 11343 goto join_posix; 11344 11345 case 'b': 11346 RExC_seen_zerolen++; 11347 RExC_seen |= REG_LOOKBEHIND_SEEN; 11348 op = BOUND + get_regex_charset(RExC_flags); 11349 if (op > BOUNDA) { /* /aa is same as /a */ 11350 op = BOUNDA; 11351 } 11352 else if (op == BOUNDL) { 11353 RExC_contains_locale = 1; 11354 } 11355 ret = reg_node(pRExC_state, op); 11356 FLAGS(ret) = get_regex_charset(RExC_flags); 11357 *flagp |= SIMPLE; 11358 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { 11359 /* diag_listed_as: Use "%s" instead of "%s" */ 11360 vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); 11361 } 11362 goto finish_meta_pat; 11363 case 'B': 11364 RExC_seen_zerolen++; 11365 RExC_seen |= REG_LOOKBEHIND_SEEN; 11366 op = NBOUND + get_regex_charset(RExC_flags); 11367 if (op > NBOUNDA) { /* /aa is same as /a */ 11368 op = NBOUNDA; 11369 } 11370 else if (op == NBOUNDL) { 11371 RExC_contains_locale = 1; 11372 } 11373 ret = reg_node(pRExC_state, op); 11374 FLAGS(ret) = get_regex_charset(RExC_flags); 11375 *flagp |= SIMPLE; 11376 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { 11377 /* diag_listed_as: Use "%s" instead of "%s" */ 11378 vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); 11379 } 11380 goto finish_meta_pat; 11381 11382 case 'D': 11383 invert = 1; 11384 /* FALLTHROUGH */ 11385 case 'd': 11386 arg = ANYOF_DIGIT; 11387 goto join_posix; 11388 11389 case 'R': 11390 ret = reg_node(pRExC_state, LNBREAK); 11391 *flagp |= HASWIDTH|SIMPLE; 11392 goto finish_meta_pat; 11393 11394 case 'H': 11395 invert = 1; 11396 /* FALLTHROUGH */ 11397 case 'h': 11398 arg = ANYOF_BLANK; 11399 op = POSIXU; 11400 goto join_posix_op_known; 11401 11402 case 'V': 11403 invert = 1; 11404 /* FALLTHROUGH */ 11405 case 'v': 11406 arg = ANYOF_VERTWS; 11407 op = POSIXU; 11408 goto join_posix_op_known; 11409 11410 case 'S': 11411 invert = 1; 11412 /* FALLTHROUGH */ 11413 case 's': 11414 arg = ANYOF_SPACE; 11415 11416 join_posix: 11417 11418 op = POSIXD + get_regex_charset(RExC_flags); 11419 if (op > POSIXA) { /* /aa is same as /a */ 11420 op = POSIXA; 11421 } 11422 else if (op == POSIXL) { 11423 RExC_contains_locale = 1; 11424 } 11425 11426 join_posix_op_known: 11427 11428 if (invert) { 11429 op += NPOSIXD - POSIXD; 11430 } 11431 11432 ret = reg_node(pRExC_state, op); 11433 if (! SIZE_ONLY) { 11434 FLAGS(ret) = namedclass_to_classnum(arg); 11435 } 11436 11437 *flagp |= HASWIDTH|SIMPLE; 11438 /* FALL THROUGH */ 11439 11440 finish_meta_pat: 11441 nextchar(pRExC_state); 11442 Set_Node_Length(ret, 2); /* MJD */ 11443 break; 11444 case 'p': 11445 case 'P': 11446 { 11447 #ifdef DEBUGGING 11448 char* parse_start = RExC_parse - 2; 11449 #endif 11450 11451 RExC_parse--; 11452 11453 ret = regclass(pRExC_state, flagp,depth+1, 11454 TRUE, /* means just parse this element */ 11455 FALSE, /* don't allow multi-char folds */ 11456 FALSE, /* don't silence non-portable warnings. 11457 It would be a bug if these returned 11458 non-portables */ 11459 NULL); 11460 /* regclass() can only return RESTART_UTF8 if multi-char folds 11461 are allowed. */ 11462 if (!ret) 11463 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", 11464 (UV) *flagp); 11465 11466 RExC_parse--; 11467 11468 Set_Node_Offset(ret, parse_start + 2); 11469 Set_Node_Cur_Length(ret, parse_start); 11470 nextchar(pRExC_state); 11471 } 11472 break; 11473 case 'N': 11474 /* Handle \N and \N{NAME} with multiple code points here and not 11475 * below because it can be multicharacter. join_exact() will join 11476 * them up later on. Also this makes sure that things like 11477 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq. 11478 * The options to the grok function call causes it to fail if the 11479 * sequence is just a single code point. We then go treat it as 11480 * just another character in the current EXACT node, and hence it 11481 * gets uniform treatment with all the other characters. The 11482 * special treatment for quantifiers is not needed for such single 11483 * character sequences */ 11484 ++RExC_parse; 11485 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE, 11486 FALSE /* not strict */ )) { 11487 if (*flagp & RESTART_UTF8) 11488 return NULL; 11489 RExC_parse--; 11490 goto defchar; 11491 } 11492 break; 11493 case 'k': /* Handle \k<NAME> and \k'NAME' */ 11494 parse_named_seq: 11495 { 11496 char ch= RExC_parse[1]; 11497 if (ch != '<' && ch != '\'' && ch != '{') { 11498 RExC_parse++; 11499 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ 11500 vFAIL2("Sequence %.2s... not terminated",parse_start); 11501 } else { 11502 /* this pretty much dupes the code for (?P=...) in reg(), if 11503 you change this make sure you change that */ 11504 char* name_start = (RExC_parse += 2); 11505 U32 num = 0; 11506 SV *sv_dat = reg_scan_name(pRExC_state, 11507 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 11508 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; 11509 if (RExC_parse == name_start || *RExC_parse != ch) 11510 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ 11511 vFAIL2("Sequence %.3s... not terminated",parse_start); 11512 11513 if (!SIZE_ONLY) { 11514 num = add_data( pRExC_state, STR_WITH_LEN("S")); 11515 RExC_rxi->data->data[num]=(void*)sv_dat; 11516 SvREFCNT_inc_simple_void(sv_dat); 11517 } 11518 11519 RExC_sawback = 1; 11520 ret = reganode(pRExC_state, 11521 ((! FOLD) 11522 ? NREF 11523 : (ASCII_FOLD_RESTRICTED) 11524 ? NREFFA 11525 : (AT_LEAST_UNI_SEMANTICS) 11526 ? NREFFU 11527 : (LOC) 11528 ? NREFFL 11529 : NREFF), 11530 num); 11531 *flagp |= HASWIDTH; 11532 11533 /* override incorrect value set in reganode MJD */ 11534 Set_Node_Offset(ret, parse_start+1); 11535 Set_Node_Cur_Length(ret, parse_start); 11536 nextchar(pRExC_state); 11537 11538 } 11539 break; 11540 } 11541 case 'g': 11542 case '1': case '2': case '3': case '4': 11543 case '5': case '6': case '7': case '8': case '9': 11544 { 11545 I32 num; 11546 bool hasbrace = 0; 11547 11548 if (*RExC_parse == 'g') { 11549 bool isrel = 0; 11550 11551 RExC_parse++; 11552 if (*RExC_parse == '{') { 11553 RExC_parse++; 11554 hasbrace = 1; 11555 } 11556 if (*RExC_parse == '-') { 11557 RExC_parse++; 11558 isrel = 1; 11559 } 11560 if (hasbrace && !isDIGIT(*RExC_parse)) { 11561 if (isrel) RExC_parse--; 11562 RExC_parse -= 2; 11563 goto parse_named_seq; 11564 } 11565 11566 num = S_backref_value(RExC_parse); 11567 if (num == 0) 11568 vFAIL("Reference to invalid group 0"); 11569 else if (num == I32_MAX) { 11570 if (isDIGIT(*RExC_parse)) 11571 vFAIL("Reference to nonexistent group"); 11572 else 11573 vFAIL("Unterminated \\g... pattern"); 11574 } 11575 11576 if (isrel) { 11577 num = RExC_npar - num; 11578 if (num < 1) 11579 vFAIL("Reference to nonexistent or unclosed group"); 11580 } 11581 } 11582 else { 11583 num = S_backref_value(RExC_parse); 11584 /* bare \NNN might be backref or octal - if it is larger than or equal 11585 * RExC_npar then it is assumed to be and octal escape. 11586 * Note RExC_npar is +1 from the actual number of parens*/ 11587 if (num == I32_MAX || (num > 9 && num >= RExC_npar 11588 && *RExC_parse != '8' && *RExC_parse != '9')) 11589 { 11590 /* Probably a character specified in octal, e.g. \35 */ 11591 goto defchar; 11592 } 11593 } 11594 11595 /* at this point RExC_parse definitely points to a backref 11596 * number */ 11597 { 11598 #ifdef RE_TRACK_PATTERN_OFFSETS 11599 char * const parse_start = RExC_parse - 1; /* MJD */ 11600 #endif 11601 while (isDIGIT(*RExC_parse)) 11602 RExC_parse++; 11603 if (hasbrace) { 11604 if (*RExC_parse != '}') 11605 vFAIL("Unterminated \\g{...} pattern"); 11606 RExC_parse++; 11607 } 11608 if (!SIZE_ONLY) { 11609 if (num > (I32)RExC_rx->nparens) 11610 vFAIL("Reference to nonexistent group"); 11611 } 11612 RExC_sawback = 1; 11613 ret = reganode(pRExC_state, 11614 ((! FOLD) 11615 ? REF 11616 : (ASCII_FOLD_RESTRICTED) 11617 ? REFFA 11618 : (AT_LEAST_UNI_SEMANTICS) 11619 ? REFFU 11620 : (LOC) 11621 ? REFFL 11622 : REFF), 11623 num); 11624 *flagp |= HASWIDTH; 11625 11626 /* override incorrect value set in reganode MJD */ 11627 Set_Node_Offset(ret, parse_start+1); 11628 Set_Node_Cur_Length(ret, parse_start); 11629 RExC_parse--; 11630 nextchar(pRExC_state); 11631 } 11632 } 11633 break; 11634 case '\0': 11635 if (RExC_parse >= RExC_end) 11636 FAIL("Trailing \\"); 11637 /* FALL THROUGH */ 11638 default: 11639 /* Do not generate "unrecognized" warnings here, we fall 11640 back into the quick-grab loop below */ 11641 parse_start--; 11642 goto defchar; 11643 } 11644 break; 11645 11646 case '#': 11647 if (RExC_flags & RXf_PMf_EXTENDED) { 11648 if ( reg_skipcomment( pRExC_state ) ) 11649 goto tryagain; 11650 } 11651 /* FALL THROUGH */ 11652 11653 default: 11654 11655 parse_start = RExC_parse - 1; 11656 11657 RExC_parse++; 11658 11659 defchar: { 11660 STRLEN len = 0; 11661 UV ender = 0; 11662 char *p; 11663 char *s; 11664 #define MAX_NODE_STRING_SIZE 127 11665 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; 11666 char *s0; 11667 U8 upper_parse = MAX_NODE_STRING_SIZE; 11668 U8 node_type = compute_EXACTish(pRExC_state); 11669 bool next_is_quantifier; 11670 char * oldp = NULL; 11671 11672 /* We can convert EXACTF nodes to EXACTFU if they contain only 11673 * characters that match identically regardless of the target 11674 * string's UTF8ness. The reason to do this is that EXACTF is not 11675 * trie-able, EXACTFU is. 11676 * 11677 * Similarly, we can convert EXACTFL nodes to EXACTFU if they 11678 * contain only above-Latin1 characters (hence must be in UTF8), 11679 * which don't participate in folds with Latin1-range characters, 11680 * as the latter's folds aren't known until runtime. (We don't 11681 * need to figure this out until pass 2) */ 11682 bool maybe_exactfu = PASS2 11683 && (node_type == EXACTF || node_type == EXACTFL); 11684 11685 /* If a folding node contains only code points that don't 11686 * participate in folds, it can be changed into an EXACT node, 11687 * which allows the optimizer more things to look for */ 11688 bool maybe_exact; 11689 11690 ret = reg_node(pRExC_state, node_type); 11691 11692 /* In pass1, folded, we use a temporary buffer instead of the 11693 * actual node, as the node doesn't exist yet */ 11694 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret); 11695 11696 s0 = s; 11697 11698 reparse: 11699 11700 /* We do the EXACTFish to EXACT node only if folding. (And we 11701 * don't need to figure this out until pass 2) */ 11702 maybe_exact = FOLD && PASS2; 11703 11704 /* XXX The node can hold up to 255 bytes, yet this only goes to 11705 * 127. I (khw) do not know why. Keeping it somewhat less than 11706 * 255 allows us to not have to worry about overflow due to 11707 * converting to utf8 and fold expansion, but that value is 11708 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes 11709 * split up by this limit into a single one using the real max of 11710 * 255. Even at 127, this breaks under rare circumstances. If 11711 * folding, we do not want to split a node at a character that is a 11712 * non-final in a multi-char fold, as an input string could just 11713 * happen to want to match across the node boundary. The join 11714 * would solve that problem if the join actually happens. But a 11715 * series of more than two nodes in a row each of 127 would cause 11716 * the first join to succeed to get to 254, but then there wouldn't 11717 * be room for the next one, which could at be one of those split 11718 * multi-char folds. I don't know of any fool-proof solution. One 11719 * could back off to end with only a code point that isn't such a 11720 * non-final, but it is possible for there not to be any in the 11721 * entire node. */ 11722 for (p = RExC_parse - 1; 11723 len < upper_parse && p < RExC_end; 11724 len++) 11725 { 11726 oldp = p; 11727 11728 if (RExC_flags & RXf_PMf_EXTENDED) 11729 p = regwhite( pRExC_state, p ); 11730 switch ((U8)*p) { 11731 case '^': 11732 case '$': 11733 case '.': 11734 case '[': 11735 case '(': 11736 case ')': 11737 case '|': 11738 goto loopdone; 11739 case '\\': 11740 /* Literal Escapes Switch 11741 11742 This switch is meant to handle escape sequences that 11743 resolve to a literal character. 11744 11745 Every escape sequence that represents something 11746 else, like an assertion or a char class, is handled 11747 in the switch marked 'Special Escapes' above in this 11748 routine, but also has an entry here as anything that 11749 isn't explicitly mentioned here will be treated as 11750 an unescaped equivalent literal. 11751 */ 11752 11753 switch ((U8)*++p) { 11754 /* These are all the special escapes. */ 11755 case 'A': /* Start assertion */ 11756 case 'b': case 'B': /* Word-boundary assertion*/ 11757 case 'C': /* Single char !DANGEROUS! */ 11758 case 'd': case 'D': /* digit class */ 11759 case 'g': case 'G': /* generic-backref, pos assertion */ 11760 case 'h': case 'H': /* HORIZWS */ 11761 case 'k': case 'K': /* named backref, keep marker */ 11762 case 'p': case 'P': /* Unicode property */ 11763 case 'R': /* LNBREAK */ 11764 case 's': case 'S': /* space class */ 11765 case 'v': case 'V': /* VERTWS */ 11766 case 'w': case 'W': /* word class */ 11767 case 'X': /* eXtended Unicode "combining 11768 character sequence" */ 11769 case 'z': case 'Z': /* End of line/string assertion */ 11770 --p; 11771 goto loopdone; 11772 11773 /* Anything after here is an escape that resolves to a 11774 literal. (Except digits, which may or may not) 11775 */ 11776 case 'n': 11777 ender = '\n'; 11778 p++; 11779 break; 11780 case 'N': /* Handle a single-code point named character. */ 11781 /* The options cause it to fail if a multiple code 11782 * point sequence. Handle those in the switch() above 11783 * */ 11784 RExC_parse = p + 1; 11785 if (! grok_bslash_N(pRExC_state, NULL, &ender, 11786 flagp, depth, FALSE, 11787 FALSE /* not strict */ )) 11788 { 11789 if (*flagp & RESTART_UTF8) 11790 FAIL("panic: grok_bslash_N set RESTART_UTF8"); 11791 RExC_parse = p = oldp; 11792 goto loopdone; 11793 } 11794 p = RExC_parse; 11795 if (ender > 0xff) { 11796 REQUIRE_UTF8; 11797 } 11798 break; 11799 case 'r': 11800 ender = '\r'; 11801 p++; 11802 break; 11803 case 't': 11804 ender = '\t'; 11805 p++; 11806 break; 11807 case 'f': 11808 ender = '\f'; 11809 p++; 11810 break; 11811 case 'e': 11812 ender = ASCII_TO_NATIVE('\033'); 11813 p++; 11814 break; 11815 case 'a': 11816 ender = '\a'; 11817 p++; 11818 break; 11819 case 'o': 11820 { 11821 UV result; 11822 const char* error_msg; 11823 11824 bool valid = grok_bslash_o(&p, 11825 &result, 11826 &error_msg, 11827 TRUE, /* out warnings */ 11828 FALSE, /* not strict */ 11829 TRUE, /* Output warnings 11830 for non- 11831 portables */ 11832 UTF); 11833 if (! valid) { 11834 RExC_parse = p; /* going to die anyway; point 11835 to exact spot of failure */ 11836 vFAIL(error_msg); 11837 } 11838 ender = result; 11839 if (PL_encoding && ender < 0x100) { 11840 goto recode_encoding; 11841 } 11842 if (ender > 0xff) { 11843 REQUIRE_UTF8; 11844 } 11845 break; 11846 } 11847 case 'x': 11848 { 11849 UV result = UV_MAX; /* initialize to erroneous 11850 value */ 11851 const char* error_msg; 11852 11853 bool valid = grok_bslash_x(&p, 11854 &result, 11855 &error_msg, 11856 TRUE, /* out warnings */ 11857 FALSE, /* not strict */ 11858 TRUE, /* Output warnings 11859 for non- 11860 portables */ 11861 UTF); 11862 if (! valid) { 11863 RExC_parse = p; /* going to die anyway; point 11864 to exact spot of failure */ 11865 vFAIL(error_msg); 11866 } 11867 ender = result; 11868 11869 if (PL_encoding && ender < 0x100) { 11870 goto recode_encoding; 11871 } 11872 if (ender > 0xff) { 11873 REQUIRE_UTF8; 11874 } 11875 break; 11876 } 11877 case 'c': 11878 p++; 11879 ender = grok_bslash_c(*p++, SIZE_ONLY); 11880 break; 11881 case '8': case '9': /* must be a backreference */ 11882 --p; 11883 goto loopdone; 11884 case '1': case '2': case '3':case '4': 11885 case '5': case '6': case '7': 11886 /* When we parse backslash escapes there is ambiguity 11887 * between backreferences and octal escapes. Any escape 11888 * from \1 - \9 is a backreference, any multi-digit 11889 * escape which does not start with 0 and which when 11890 * evaluated as decimal could refer to an already 11891 * parsed capture buffer is a backslash. Anything else 11892 * is octal. 11893 * 11894 * Note this implies that \118 could be interpreted as 11895 * 118 OR as "\11" . "8" depending on whether there 11896 * were 118 capture buffers defined already in the 11897 * pattern. */ 11898 11899 /* NOTE, RExC_npar is 1 more than the actual number of 11900 * parens we have seen so far, hence the < RExC_npar below. */ 11901 11902 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) 11903 { /* Not to be treated as an octal constant, go 11904 find backref */ 11905 --p; 11906 goto loopdone; 11907 } 11908 case '0': 11909 { 11910 I32 flags = PERL_SCAN_SILENT_ILLDIGIT; 11911 STRLEN numlen = 3; 11912 ender = grok_oct(p, &numlen, &flags, NULL); 11913 if (ender > 0xff) { 11914 REQUIRE_UTF8; 11915 } 11916 p += numlen; 11917 if (SIZE_ONLY /* like \08, \178 */ 11918 && numlen < 3 11919 && p < RExC_end 11920 && isDIGIT(*p) && ckWARN(WARN_REGEXP)) 11921 { 11922 reg_warn_non_literal_string( 11923 p + 1, 11924 form_short_octal_warning(p, numlen)); 11925 } 11926 } 11927 if (PL_encoding && ender < 0x100) 11928 goto recode_encoding; 11929 break; 11930 recode_encoding: 11931 if (! RExC_override_recoding) { 11932 SV* enc = PL_encoding; 11933 ender = reg_recode((const char)(U8)ender, &enc); 11934 if (!enc && SIZE_ONLY) 11935 ckWARNreg(p, "Invalid escape in the specified encoding"); 11936 REQUIRE_UTF8; 11937 } 11938 break; 11939 case '\0': 11940 if (p >= RExC_end) 11941 FAIL("Trailing \\"); 11942 /* FALL THROUGH */ 11943 default: 11944 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { 11945 /* Include any { following the alpha to emphasize 11946 * that it could be part of an escape at some point 11947 * in the future */ 11948 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1; 11949 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p); 11950 } 11951 goto normal_default; 11952 } /* End of switch on '\' */ 11953 break; 11954 default: /* A literal character */ 11955 11956 if (! SIZE_ONLY 11957 && RExC_flags & RXf_PMf_EXTENDED 11958 && ckWARN_d(WARN_DEPRECATED) 11959 && is_PATWS_non_low_safe(p, RExC_end, UTF)) 11960 { 11961 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), 11962 "Escape literal pattern white space under /x"); 11963 } 11964 11965 normal_default: 11966 if (UTF8_IS_START(*p) && UTF) { 11967 STRLEN numlen; 11968 ender = utf8n_to_uvchr((U8*)p, RExC_end - p, 11969 &numlen, UTF8_ALLOW_DEFAULT); 11970 p += numlen; 11971 } 11972 else 11973 ender = (U8) *p++; 11974 break; 11975 } /* End of switch on the literal */ 11976 11977 /* Here, have looked at the literal character and <ender> 11978 * contains its ordinal, <p> points to the character after it 11979 */ 11980 11981 if ( RExC_flags & RXf_PMf_EXTENDED) 11982 p = regwhite( pRExC_state, p ); 11983 11984 /* If the next thing is a quantifier, it applies to this 11985 * character only, which means that this character has to be in 11986 * its own node and can't just be appended to the string in an 11987 * existing node, so if there are already other characters in 11988 * the node, close the node with just them, and set up to do 11989 * this character again next time through, when it will be the 11990 * only thing in its new node */ 11991 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len) 11992 { 11993 p = oldp; 11994 goto loopdone; 11995 } 11996 11997 if (! FOLD /* The simple case, just append the literal */ 11998 || (LOC /* Also don't fold for tricky chars under /l */ 11999 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) 12000 { 12001 if (UTF) { 12002 12003 /* Normally, we don't need the representation of the 12004 * character in the sizing pass--just its size, but if 12005 * folding, we have to actually put the character out 12006 * even in the sizing pass, because the size could 12007 * change as we juggle things at the end of this loop 12008 * to avoid splitting a too-full node in the middle of 12009 * a potential multi-char fold [perl #123539] */ 12010 const STRLEN unilen = (SIZE_ONLY && ! FOLD) 12011 ? UNISKIP(ender) 12012 : (uvchr_to_utf8((U8*)s, ender) - (U8*)s); 12013 if (unilen > 0) { 12014 s += unilen; 12015 len += unilen; 12016 } 12017 12018 /* The loop increments <len> each time, as all but this 12019 * path (and one other) through it add a single byte to 12020 * the EXACTish node. But this one has changed len to 12021 * be the correct final value, so subtract one to 12022 * cancel out the increment that follows */ 12023 len--; 12024 } 12025 else if (FOLD) { 12026 /* See comment above for [perl #123539] */ 12027 *(s++) = (char) ender; 12028 } 12029 else { 12030 REGC((char)ender, s++); 12031 } 12032 12033 /* Can get here if folding only if is one of the /l 12034 * characters whose fold depends on the locale. The 12035 * occurrence of any of these indicate that we can't 12036 * simplify things */ 12037 if (FOLD) { 12038 maybe_exact = FALSE; 12039 maybe_exactfu = FALSE; 12040 } 12041 } 12042 else /* FOLD */ 12043 if (! ( UTF 12044 /* See comments for join_exact() as to why we fold this 12045 * non-UTF at compile time */ 12046 || (node_type == EXACTFU 12047 && ender == LATIN_SMALL_LETTER_SHARP_S))) 12048 { 12049 /* Here, are folding and are not UTF-8 encoded; therefore 12050 * the character must be in the range 0-255, and is not /l 12051 * (Not /l because we already handled these under /l in 12052 * is_PROBLEMATIC_LOCALE_FOLD_cp */ 12053 if (IS_IN_SOME_FOLD_L1(ender)) { 12054 maybe_exact = FALSE; 12055 12056 /* See if the character's fold differs between /d and 12057 * /u. This includes the multi-char fold SHARP S to 12058 * 'ss' */ 12059 if (maybe_exactfu 12060 && (PL_fold[ender] != PL_fold_latin1[ender] 12061 || ender == LATIN_SMALL_LETTER_SHARP_S 12062 || (len > 0 12063 && isARG2_lower_or_UPPER_ARG1('s', ender) 12064 && isARG2_lower_or_UPPER_ARG1('s', 12065 *(s-1))))) 12066 { 12067 maybe_exactfu = FALSE; 12068 } 12069 } 12070 12071 /* Even when folding, we store just the input character, as 12072 * we have an array that finds its fold quickly */ 12073 *(s++) = (char) ender; 12074 } 12075 else { /* FOLD and UTF */ 12076 /* Unlike the non-fold case, we do actually have to 12077 * calculate the results here in pass 1. This is for two 12078 * reasons, the folded length may be longer than the 12079 * unfolded, and we have to calculate how many EXACTish 12080 * nodes it will take; and we may run out of room in a node 12081 * in the middle of a potential multi-char fold, and have 12082 * to back off accordingly. (Hence we can't use REGC for 12083 * the simple case just below.) */ 12084 12085 UV folded; 12086 if (isASCII(ender)) { 12087 folded = toFOLD(ender); 12088 *(s)++ = (U8) folded; 12089 } 12090 else { 12091 STRLEN foldlen; 12092 12093 folded = _to_uni_fold_flags( 12094 ender, 12095 (U8 *) s, 12096 &foldlen, 12097 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) 12098 ? FOLD_FLAGS_NOMIX_ASCII 12099 : 0)); 12100 s += foldlen; 12101 12102 /* The loop increments <len> each time, as all but this 12103 * path (and one other) through it add a single byte to 12104 * the EXACTish node. But this one has changed len to 12105 * be the correct final value, so subtract one to 12106 * cancel out the increment that follows */ 12107 len += foldlen - 1; 12108 } 12109 /* If this node only contains non-folding code points so 12110 * far, see if this new one is also non-folding */ 12111 if (maybe_exact) { 12112 if (folded != ender) { 12113 maybe_exact = FALSE; 12114 } 12115 else { 12116 /* Here the fold is the original; we have to check 12117 * further to see if anything folds to it */ 12118 if (_invlist_contains_cp(PL_utf8_foldable, 12119 ender)) 12120 { 12121 maybe_exact = FALSE; 12122 } 12123 } 12124 } 12125 ender = folded; 12126 } 12127 12128 if (next_is_quantifier) { 12129 12130 /* Here, the next input is a quantifier, and to get here, 12131 * the current character is the only one in the node. 12132 * Also, here <len> doesn't include the final byte for this 12133 * character */ 12134 len++; 12135 goto loopdone; 12136 } 12137 12138 } /* End of loop through literal characters */ 12139 12140 /* Here we have either exhausted the input or ran out of room in 12141 * the node. (If we encountered a character that can't be in the 12142 * node, transfer is made directly to <loopdone>, and so we 12143 * wouldn't have fallen off the end of the loop.) In the latter 12144 * case, we artificially have to split the node into two, because 12145 * we just don't have enough space to hold everything. This 12146 * creates a problem if the final character participates in a 12147 * multi-character fold in the non-final position, as a match that 12148 * should have occurred won't, due to the way nodes are matched, 12149 * and our artificial boundary. So back off until we find a non- 12150 * problematic character -- one that isn't at the beginning or 12151 * middle of such a fold. (Either it doesn't participate in any 12152 * folds, or appears only in the final position of all the folds it 12153 * does participate in.) A better solution with far fewer false 12154 * positives, and that would fill the nodes more completely, would 12155 * be to actually have available all the multi-character folds to 12156 * test against, and to back-off only far enough to be sure that 12157 * this node isn't ending with a partial one. <upper_parse> is set 12158 * further below (if we need to reparse the node) to include just 12159 * up through that final non-problematic character that this code 12160 * identifies, so when it is set to less than the full node, we can 12161 * skip the rest of this */ 12162 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { 12163 12164 const STRLEN full_len = len; 12165 12166 assert(len >= MAX_NODE_STRING_SIZE); 12167 12168 /* Here, <s> points to the final byte of the final character. 12169 * Look backwards through the string until find a non- 12170 * problematic character */ 12171 12172 if (! UTF) { 12173 12174 /* This has no multi-char folds to non-UTF characters */ 12175 if (ASCII_FOLD_RESTRICTED) { 12176 goto loopdone; 12177 } 12178 12179 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { } 12180 len = s - s0 + 1; 12181 } 12182 else { 12183 if (! PL_NonL1NonFinalFold) { 12184 PL_NonL1NonFinalFold = _new_invlist_C_array( 12185 NonL1_Perl_Non_Final_Folds_invlist); 12186 } 12187 12188 /* Point to the first byte of the final character */ 12189 s = (char *) utf8_hop((U8 *) s, -1); 12190 12191 while (s >= s0) { /* Search backwards until find 12192 non-problematic char */ 12193 if (UTF8_IS_INVARIANT(*s)) { 12194 12195 /* There are no ascii characters that participate 12196 * in multi-char folds under /aa. In EBCDIC, the 12197 * non-ascii invariants are all control characters, 12198 * so don't ever participate in any folds. */ 12199 if (ASCII_FOLD_RESTRICTED 12200 || ! IS_NON_FINAL_FOLD(*s)) 12201 { 12202 break; 12203 } 12204 } 12205 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { 12206 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( 12207 *s, *(s+1)))) 12208 { 12209 break; 12210 } 12211 } 12212 else if (! _invlist_contains_cp( 12213 PL_NonL1NonFinalFold, 12214 valid_utf8_to_uvchr((U8 *) s, NULL))) 12215 { 12216 break; 12217 } 12218 12219 /* Here, the current character is problematic in that 12220 * it does occur in the non-final position of some 12221 * fold, so try the character before it, but have to 12222 * special case the very first byte in the string, so 12223 * we don't read outside the string */ 12224 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); 12225 } /* End of loop backwards through the string */ 12226 12227 /* If there were only problematic characters in the string, 12228 * <s> will point to before s0, in which case the length 12229 * should be 0, otherwise include the length of the 12230 * non-problematic character just found */ 12231 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); 12232 } 12233 12234 /* Here, have found the final character, if any, that is 12235 * non-problematic as far as ending the node without splitting 12236 * it across a potential multi-char fold. <len> contains the 12237 * number of bytes in the node up-to and including that 12238 * character, or is 0 if there is no such character, meaning 12239 * the whole node contains only problematic characters. In 12240 * this case, give up and just take the node as-is. We can't 12241 * do any better */ 12242 if (len == 0) { 12243 len = full_len; 12244 12245 /* If the node ends in an 's' we make sure it stays EXACTF, 12246 * as if it turns into an EXACTFU, it could later get 12247 * joined with another 's' that would then wrongly match 12248 * the sharp s */ 12249 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) 12250 { 12251 maybe_exactfu = FALSE; 12252 } 12253 } else { 12254 12255 /* Here, the node does contain some characters that aren't 12256 * problematic. If one such is the final character in the 12257 * node, we are done */ 12258 if (len == full_len) { 12259 goto loopdone; 12260 } 12261 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) { 12262 12263 /* If the final character is problematic, but the 12264 * penultimate is not, back-off that last character to 12265 * later start a new node with it */ 12266 p = oldp; 12267 goto loopdone; 12268 } 12269 12270 /* Here, the final non-problematic character is earlier 12271 * in the input than the penultimate character. What we do 12272 * is reparse from the beginning, going up only as far as 12273 * this final ok one, thus guaranteeing that the node ends 12274 * in an acceptable character. The reason we reparse is 12275 * that we know how far in the character is, but we don't 12276 * know how to correlate its position with the input parse. 12277 * An alternate implementation would be to build that 12278 * correlation as we go along during the original parse, 12279 * but that would entail extra work for every node, whereas 12280 * this code gets executed only when the string is too 12281 * large for the node, and the final two characters are 12282 * problematic, an infrequent occurrence. Yet another 12283 * possible strategy would be to save the tail of the 12284 * string, and the next time regatom is called, initialize 12285 * with that. The problem with this is that unless you 12286 * back off one more character, you won't be guaranteed 12287 * regatom will get called again, unless regbranch, 12288 * regpiece ... are also changed. If you do back off that 12289 * extra character, so that there is input guaranteed to 12290 * force calling regatom, you can't handle the case where 12291 * just the first character in the node is acceptable. I 12292 * (khw) decided to try this method which doesn't have that 12293 * pitfall; if performance issues are found, we can do a 12294 * combination of the current approach plus that one */ 12295 upper_parse = len; 12296 len = 0; 12297 s = s0; 12298 goto reparse; 12299 } 12300 } /* End of verifying node ends with an appropriate char */ 12301 12302 loopdone: /* Jumped to when encounters something that shouldn't be in 12303 the node */ 12304 12305 /* I (khw) don't know if you can get here with zero length, but the 12306 * old code handled this situation by creating a zero-length EXACT 12307 * node. Might as well be NOTHING instead */ 12308 if (len == 0) { 12309 OP(ret) = NOTHING; 12310 } 12311 else { 12312 if (FOLD) { 12313 /* If 'maybe_exact' is still set here, means there are no 12314 * code points in the node that participate in folds; 12315 * similarly for 'maybe_exactfu' and code points that match 12316 * differently depending on UTF8ness of the target string 12317 * (for /u), or depending on locale for /l */ 12318 if (maybe_exact) { 12319 OP(ret) = EXACT; 12320 } 12321 else if (maybe_exactfu) { 12322 OP(ret) = EXACTFU; 12323 } 12324 } 12325 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, 12326 FALSE /* Don't look to see if could 12327 be turned into an EXACT 12328 node, as we have already 12329 computed that */ 12330 ); 12331 } 12332 12333 RExC_parse = p - 1; 12334 Set_Node_Cur_Length(ret, parse_start); 12335 nextchar(pRExC_state); 12336 { 12337 /* len is STRLEN which is unsigned, need to copy to signed */ 12338 IV iv = len; 12339 if (iv < 0) 12340 vFAIL("Internal disaster"); 12341 } 12342 12343 } /* End of label 'defchar:' */ 12344 break; 12345 } /* End of giant switch on input character */ 12346 12347 return(ret); 12348 } 12349 12350 STATIC char * 12351 S_regwhite( RExC_state_t *pRExC_state, char *p ) 12352 { 12353 const char *e = RExC_end; 12354 12355 PERL_ARGS_ASSERT_REGWHITE; 12356 12357 while (p < e) { 12358 if (isSPACE(*p)) 12359 ++p; 12360 else if (*p == '#') { 12361 bool ended = 0; 12362 do { 12363 if (*p++ == '\n') { 12364 ended = 1; 12365 break; 12366 } 12367 } while (p < e); 12368 if (!ended) 12369 RExC_seen |= REG_RUN_ON_COMMENT_SEEN; 12370 } 12371 else 12372 break; 12373 } 12374 return p; 12375 } 12376 12377 STATIC char * 12378 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) 12379 { 12380 /* Returns the next non-pattern-white space, non-comment character (the 12381 * latter only if 'recognize_comment is true) in the string p, which is 12382 * ended by RExC_end. If there is no line break ending a comment, 12383 * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */ 12384 const char *e = RExC_end; 12385 12386 PERL_ARGS_ASSERT_REGPATWS; 12387 12388 while (p < e) { 12389 STRLEN len; 12390 if ((len = is_PATWS_safe(p, e, UTF))) { 12391 p += len; 12392 } 12393 else if (recognize_comment && *p == '#') { 12394 bool ended = 0; 12395 do { 12396 p++; 12397 if (is_LNBREAK_safe(p, e, UTF)) { 12398 ended = 1; 12399 break; 12400 } 12401 } while (p < e); 12402 if (!ended) 12403 RExC_seen |= REG_RUN_ON_COMMENT_SEEN; 12404 } 12405 else 12406 break; 12407 } 12408 return p; 12409 } 12410 12411 STATIC void 12412 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) 12413 { 12414 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It 12415 * sets up the bitmap and any flags, removing those code points from the 12416 * inversion list, setting it to NULL should it become completely empty */ 12417 12418 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; 12419 assert(PL_regkind[OP(node)] == ANYOF); 12420 12421 ANYOF_BITMAP_ZERO(node); 12422 if (*invlist_ptr) { 12423 12424 /* This gets set if we actually need to modify things */ 12425 bool change_invlist = FALSE; 12426 12427 UV start, end; 12428 12429 /* Start looking through *invlist_ptr */ 12430 invlist_iterinit(*invlist_ptr); 12431 while (invlist_iternext(*invlist_ptr, &start, &end)) { 12432 UV high; 12433 int i; 12434 12435 if (end == UV_MAX && start <= 256) { 12436 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; 12437 } 12438 else if (end >= 256) { 12439 ANYOF_FLAGS(node) |= ANYOF_UTF8; 12440 } 12441 12442 /* Quit if are above what we should change */ 12443 if (start > 255) { 12444 break; 12445 } 12446 12447 change_invlist = TRUE; 12448 12449 /* Set all the bits in the range, up to the max that we are doing */ 12450 high = (end < 255) ? end : 255; 12451 for (i = start; i <= (int) high; i++) { 12452 if (! ANYOF_BITMAP_TEST(node, i)) { 12453 ANYOF_BITMAP_SET(node, i); 12454 } 12455 } 12456 } 12457 invlist_iterfinish(*invlist_ptr); 12458 12459 /* Done with loop; remove any code points that are in the bitmap from 12460 * *invlist_ptr; similarly for code points above latin1 if we have a 12461 * flag to match all of them anyways */ 12462 if (change_invlist) { 12463 _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); 12464 } 12465 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { 12466 _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); 12467 } 12468 12469 /* If have completely emptied it, remove it completely */ 12470 if (_invlist_len(*invlist_ptr) == 0) { 12471 SvREFCNT_dec_NN(*invlist_ptr); 12472 *invlist_ptr = NULL; 12473 } 12474 } 12475 } 12476 12477 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. 12478 Character classes ([:foo:]) can also be negated ([:^foo:]). 12479 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. 12480 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, 12481 but trigger failures because they are currently unimplemented. */ 12482 12483 #define POSIXCC_DONE(c) ((c) == ':') 12484 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') 12485 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) 12486 12487 PERL_STATIC_INLINE I32 12488 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) 12489 { 12490 dVAR; 12491 I32 namedclass = OOB_NAMEDCLASS; 12492 12493 PERL_ARGS_ASSERT_REGPPOSIXCC; 12494 12495 if (value == '[' && RExC_parse + 1 < RExC_end && 12496 /* I smell either [: or [= or [. -- POSIX has been here, right? */ 12497 POSIXCC(UCHARAT(RExC_parse))) 12498 { 12499 const char c = UCHARAT(RExC_parse); 12500 char* const s = RExC_parse++; 12501 12502 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) 12503 RExC_parse++; 12504 if (RExC_parse == RExC_end) { 12505 if (strict) { 12506 12507 /* Try to give a better location for the error (than the end of 12508 * the string) by looking for the matching ']' */ 12509 RExC_parse = s; 12510 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { 12511 RExC_parse++; 12512 } 12513 vFAIL2("Unmatched '%c' in POSIX class", c); 12514 } 12515 /* Grandfather lone [:, [=, [. */ 12516 RExC_parse = s; 12517 } 12518 else { 12519 const char* const t = RExC_parse++; /* skip over the c */ 12520 assert(*t == c); 12521 12522 if (UCHARAT(RExC_parse) == ']') { 12523 const char *posixcc = s + 1; 12524 RExC_parse++; /* skip over the ending ] */ 12525 12526 if (*s == ':') { 12527 const I32 complement = *posixcc == '^' ? *posixcc++ : 0; 12528 const I32 skip = t - posixcc; 12529 12530 /* Initially switch on the length of the name. */ 12531 switch (skip) { 12532 case 4: 12533 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, 12534 this is the Perl \w 12535 */ 12536 namedclass = ANYOF_WORDCHAR; 12537 break; 12538 case 5: 12539 /* Names all of length 5. */ 12540 /* alnum alpha ascii blank cntrl digit graph lower 12541 print punct space upper */ 12542 /* Offset 4 gives the best switch position. */ 12543 switch (posixcc[4]) { 12544 case 'a': 12545 if (memEQ(posixcc, "alph", 4)) /* alpha */ 12546 namedclass = ANYOF_ALPHA; 12547 break; 12548 case 'e': 12549 if (memEQ(posixcc, "spac", 4)) /* space */ 12550 namedclass = ANYOF_PSXSPC; 12551 break; 12552 case 'h': 12553 if (memEQ(posixcc, "grap", 4)) /* graph */ 12554 namedclass = ANYOF_GRAPH; 12555 break; 12556 case 'i': 12557 if (memEQ(posixcc, "asci", 4)) /* ascii */ 12558 namedclass = ANYOF_ASCII; 12559 break; 12560 case 'k': 12561 if (memEQ(posixcc, "blan", 4)) /* blank */ 12562 namedclass = ANYOF_BLANK; 12563 break; 12564 case 'l': 12565 if (memEQ(posixcc, "cntr", 4)) /* cntrl */ 12566 namedclass = ANYOF_CNTRL; 12567 break; 12568 case 'm': 12569 if (memEQ(posixcc, "alnu", 4)) /* alnum */ 12570 namedclass = ANYOF_ALPHANUMERIC; 12571 break; 12572 case 'r': 12573 if (memEQ(posixcc, "lowe", 4)) /* lower */ 12574 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; 12575 else if (memEQ(posixcc, "uppe", 4)) /* upper */ 12576 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; 12577 break; 12578 case 't': 12579 if (memEQ(posixcc, "digi", 4)) /* digit */ 12580 namedclass = ANYOF_DIGIT; 12581 else if (memEQ(posixcc, "prin", 4)) /* print */ 12582 namedclass = ANYOF_PRINT; 12583 else if (memEQ(posixcc, "punc", 4)) /* punct */ 12584 namedclass = ANYOF_PUNCT; 12585 break; 12586 } 12587 break; 12588 case 6: 12589 if (memEQ(posixcc, "xdigit", 6)) 12590 namedclass = ANYOF_XDIGIT; 12591 break; 12592 } 12593 12594 if (namedclass == OOB_NAMEDCLASS) 12595 vFAIL2utf8f( 12596 "POSIX class [:%"UTF8f":] unknown", 12597 UTF8fARG(UTF, t - s - 1, s + 1)); 12598 12599 /* The #defines are structured so each complement is +1 to 12600 * the normal one */ 12601 if (complement) { 12602 namedclass++; 12603 } 12604 assert (posixcc[skip] == ':'); 12605 assert (posixcc[skip+1] == ']'); 12606 } else if (!SIZE_ONLY) { 12607 /* [[=foo=]] and [[.foo.]] are still future. */ 12608 12609 /* adjust RExC_parse so the warning shows after 12610 the class closes */ 12611 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') 12612 RExC_parse++; 12613 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 12614 } 12615 } else { 12616 /* Maternal grandfather: 12617 * "[:" ending in ":" but not in ":]" */ 12618 if (strict) { 12619 vFAIL("Unmatched '[' in POSIX class"); 12620 } 12621 12622 /* Grandfather lone [:, [=, [. */ 12623 RExC_parse = s; 12624 } 12625 } 12626 } 12627 12628 return namedclass; 12629 } 12630 12631 STATIC bool 12632 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) 12633 { 12634 /* This applies some heuristics at the current parse position (which should 12635 * be at a '[') to see if what follows might be intended to be a [:posix:] 12636 * class. It returns true if it really is a posix class, of course, but it 12637 * also can return true if it thinks that what was intended was a posix 12638 * class that didn't quite make it. 12639 * 12640 * It will return true for 12641 * [:alphanumerics: 12642 * [:alphanumerics] (as long as the ] isn't followed immediately by a 12643 * ')' indicating the end of the (?[ 12644 * [:any garbage including %^&$ punctuation:] 12645 * 12646 * This is designed to be called only from S_handle_regex_sets; it could be 12647 * easily adapted to be called from the spot at the beginning of regclass() 12648 * that checks to see in a normal bracketed class if the surrounding [] 12649 * have been omitted ([:word:] instead of [[:word:]]). But doing so would 12650 * change long-standing behavior, so I (khw) didn't do that */ 12651 char* p = RExC_parse + 1; 12652 char first_char = *p; 12653 12654 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS; 12655 12656 assert(*(p - 1) == '['); 12657 12658 if (! POSIXCC(first_char)) { 12659 return FALSE; 12660 } 12661 12662 p++; 12663 while (p < RExC_end && isWORDCHAR(*p)) p++; 12664 12665 if (p >= RExC_end) { 12666 return FALSE; 12667 } 12668 12669 if (p - RExC_parse > 2 /* Got at least 1 word character */ 12670 && (*p == first_char 12671 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')'))) 12672 { 12673 return TRUE; 12674 } 12675 12676 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse); 12677 12678 return (p 12679 && p - RExC_parse > 2 /* [:] evaluates to colon; 12680 [::] is a bad posix class. */ 12681 && first_char == *(p - 1)); 12682 } 12683 12684 STATIC regnode * 12685 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, 12686 I32 *flagp, U32 depth, 12687 char * const oregcomp_parse) 12688 { 12689 /* Handle the (?[...]) construct to do set operations */ 12690 12691 U8 curchar; 12692 UV start, end; /* End points of code point ranges */ 12693 SV* result_string; 12694 char *save_end, *save_parse; 12695 SV* final; 12696 STRLEN len; 12697 regnode* node; 12698 AV* stack; 12699 const bool save_fold = FOLD; 12700 12701 GET_RE_DEBUG_FLAGS_DECL; 12702 12703 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; 12704 12705 if (LOC) { 12706 vFAIL("(?[...]) not valid in locale"); 12707 } 12708 RExC_uni_semantics = 1; 12709 12710 /* This will return only an ANYOF regnode, or (unlikely) something smaller 12711 * (such as EXACT). Thus we can skip most everything if just sizing. We 12712 * call regclass to handle '[]' so as to not have to reinvent its parsing 12713 * rules here (throwing away the size it computes each time). And, we exit 12714 * upon an unescaped ']' that isn't one ending a regclass. To do both 12715 * these things, we need to realize that something preceded by a backslash 12716 * is escaped, so we have to keep track of backslashes */ 12717 if (SIZE_ONLY) { 12718 UV depth = 0; /* how many nested (?[...]) constructs */ 12719 12720 Perl_ck_warner_d(aTHX_ 12721 packWARN(WARN_EXPERIMENTAL__REGEX_SETS), 12722 "The regex_sets feature is experimental" REPORT_LOCATION, 12723 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), 12724 UTF8fARG(UTF, 12725 RExC_end - RExC_start - (RExC_parse - RExC_precomp), 12726 RExC_precomp + (RExC_parse - RExC_precomp))); 12727 12728 while (RExC_parse < RExC_end) { 12729 SV* current = NULL; 12730 RExC_parse = regpatws(pRExC_state, RExC_parse, 12731 TRUE); /* means recognize comments */ 12732 switch (*RExC_parse) { 12733 case '?': 12734 if (RExC_parse[1] == '[') depth++, RExC_parse++; 12735 /* FALL THROUGH */ 12736 default: 12737 break; 12738 case '\\': 12739 /* Skip the next byte (which could cause us to end up in 12740 * the middle of a UTF-8 character, but since none of those 12741 * are confusable with anything we currently handle in this 12742 * switch (invariants all), it's safe. We'll just hit the 12743 * default: case next time and keep on incrementing until 12744 * we find one of the invariants we do handle. */ 12745 RExC_parse++; 12746 break; 12747 case '[': 12748 { 12749 /* If this looks like it is a [:posix:] class, leave the 12750 * parse pointer at the '[' to fool regclass() into 12751 * thinking it is part of a '[[:posix:]]'. That function 12752 * will use strict checking to force a syntax error if it 12753 * doesn't work out to a legitimate class */ 12754 bool is_posix_class 12755 = could_it_be_a_POSIX_class(pRExC_state); 12756 if (! is_posix_class) { 12757 RExC_parse++; 12758 } 12759 12760 /* regclass() can only return RESTART_UTF8 if multi-char 12761 folds are allowed. */ 12762 if (!regclass(pRExC_state, flagp,depth+1, 12763 is_posix_class, /* parse the whole char 12764 class only if not a 12765 posix class */ 12766 FALSE, /* don't allow multi-char folds */ 12767 TRUE, /* silence non-portable warnings. */ 12768 ¤t)) 12769 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", 12770 (UV) *flagp); 12771 12772 /* function call leaves parse pointing to the ']', except 12773 * if we faked it */ 12774 if (is_posix_class) { 12775 RExC_parse--; 12776 } 12777 12778 SvREFCNT_dec(current); /* In case it returned something */ 12779 break; 12780 } 12781 12782 case ']': 12783 if (depth--) break; 12784 RExC_parse++; 12785 if (RExC_parse < RExC_end 12786 && *RExC_parse == ')') 12787 { 12788 node = reganode(pRExC_state, ANYOF, 0); 12789 RExC_size += ANYOF_SKIP; 12790 nextchar(pRExC_state); 12791 Set_Node_Length(node, 12792 RExC_parse - oregcomp_parse + 1); /* MJD */ 12793 return node; 12794 } 12795 goto no_close; 12796 } 12797 RExC_parse++; 12798 } 12799 12800 no_close: 12801 FAIL("Syntax error in (?[...])"); 12802 } 12803 12804 /* Pass 2 only after this. Everything in this construct is a 12805 * metacharacter. Operands begin with either a '\' (for an escape 12806 * sequence), or a '[' for a bracketed character class. Any other 12807 * character should be an operator, or parenthesis for grouping. Both 12808 * types of operands are handled by calling regclass() to parse them. It 12809 * is called with a parameter to indicate to return the computed inversion 12810 * list. The parsing here is implemented via a stack. Each entry on the 12811 * stack is a single character representing one of the operators, or the 12812 * '('; or else a pointer to an operand inversion list. */ 12813 12814 #define IS_OPERAND(a) (! SvIOK(a)) 12815 12816 /* The stack starts empty. It is a syntax error if the first thing parsed 12817 * is a binary operator; everything else is pushed on the stack. When an 12818 * operand is parsed, the top of the stack is examined. If it is a binary 12819 * operator, the item before it should be an operand, and both are replaced 12820 * by the result of doing that operation on the new operand and the one on 12821 * the stack. Thus a sequence of binary operands is reduced to a single 12822 * one before the next one is parsed. 12823 * 12824 * A unary operator may immediately follow a binary in the input, for 12825 * example 12826 * [a] + ! [b] 12827 * When an operand is parsed and the top of the stack is a unary operator, 12828 * the operation is performed, and then the stack is rechecked to see if 12829 * this new operand is part of a binary operation; if so, it is handled as 12830 * above. 12831 * 12832 * A '(' is simply pushed on the stack; it is valid only if the stack is 12833 * empty, or the top element of the stack is an operator or another '(' 12834 * (for which the parenthesized expression will become an operand). By the 12835 * time the corresponding ')' is parsed everything in between should have 12836 * been parsed and evaluated to a single operand (or else is a syntax 12837 * error), and is handled as a regular operand */ 12838 12839 sv_2mortal((SV *)(stack = newAV())); 12840 12841 while (RExC_parse < RExC_end) { 12842 I32 top_index = av_tindex(stack); 12843 SV** top_ptr; 12844 SV* current = NULL; 12845 12846 /* Skip white space */ 12847 RExC_parse = regpatws(pRExC_state, RExC_parse, 12848 TRUE); /* means recognize comments */ 12849 if (RExC_parse >= RExC_end) { 12850 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); 12851 } 12852 if ((curchar = UCHARAT(RExC_parse)) == ']') { 12853 break; 12854 } 12855 12856 switch (curchar) { 12857 12858 case '?': 12859 if (av_tindex(stack) >= 0 /* This makes sure that we can 12860 safely subtract 1 from 12861 RExC_parse in the next clause. 12862 If we have something on the 12863 stack, we have parsed something 12864 */ 12865 && UCHARAT(RExC_parse - 1) == '(' 12866 && RExC_parse < RExC_end) 12867 { 12868 /* If is a '(?', could be an embedded '(?flags:(?[...])'. 12869 * This happens when we have some thing like 12870 * 12871 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; 12872 * ... 12873 * qr/(?[ \p{Digit} & $thai_or_lao ])/; 12874 * 12875 * Here we would be handling the interpolated 12876 * '$thai_or_lao'. We handle this by a recursive call to 12877 * ourselves which returns the inversion list the 12878 * interpolated expression evaluates to. We use the flags 12879 * from the interpolated pattern. */ 12880 U32 save_flags = RExC_flags; 12881 const char * const save_parse = ++RExC_parse; 12882 12883 parse_lparen_question_flags(pRExC_state); 12884 12885 if (RExC_parse == save_parse /* Makes sure there was at 12886 least one flag (or this 12887 embedding wasn't compiled) 12888 */ 12889 || RExC_parse >= RExC_end - 4 12890 || UCHARAT(RExC_parse) != ':' 12891 || UCHARAT(++RExC_parse) != '(' 12892 || UCHARAT(++RExC_parse) != '?' 12893 || UCHARAT(++RExC_parse) != '[') 12894 { 12895 12896 /* In combination with the above, this moves the 12897 * pointer to the point just after the first erroneous 12898 * character (or if there are no flags, to where they 12899 * should have been) */ 12900 if (RExC_parse >= RExC_end - 4) { 12901 RExC_parse = RExC_end; 12902 } 12903 else if (RExC_parse != save_parse) { 12904 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; 12905 } 12906 vFAIL("Expecting '(?flags:(?[...'"); 12907 } 12908 RExC_parse++; 12909 (void) handle_regex_sets(pRExC_state, ¤t, flagp, 12910 depth+1, oregcomp_parse); 12911 12912 /* Here, 'current' contains the embedded expression's 12913 * inversion list, and RExC_parse points to the trailing 12914 * ']'; the next character should be the ')' which will be 12915 * paired with the '(' that has been put on the stack, so 12916 * the whole embedded expression reduces to '(operand)' */ 12917 RExC_parse++; 12918 12919 RExC_flags = save_flags; 12920 goto handle_operand; 12921 } 12922 /* FALL THROUGH */ 12923 12924 default: 12925 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; 12926 vFAIL("Unexpected character"); 12927 12928 case '\\': 12929 /* regclass() can only return RESTART_UTF8 if multi-char 12930 folds are allowed. */ 12931 if (!regclass(pRExC_state, flagp,depth+1, 12932 TRUE, /* means parse just the next thing */ 12933 FALSE, /* don't allow multi-char folds */ 12934 FALSE, /* don't silence non-portable warnings. */ 12935 ¤t)) 12936 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", 12937 (UV) *flagp); 12938 /* regclass() will return with parsing just the \ sequence, 12939 * leaving the parse pointer at the next thing to parse */ 12940 RExC_parse--; 12941 goto handle_operand; 12942 12943 case '[': /* Is a bracketed character class */ 12944 { 12945 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state); 12946 12947 if (! is_posix_class) { 12948 RExC_parse++; 12949 } 12950 12951 /* regclass() can only return RESTART_UTF8 if multi-char 12952 folds are allowed. */ 12953 if(!regclass(pRExC_state, flagp,depth+1, 12954 is_posix_class, /* parse the whole char class 12955 only if not a posix class */ 12956 FALSE, /* don't allow multi-char folds */ 12957 FALSE, /* don't silence non-portable warnings. */ 12958 ¤t)) 12959 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", 12960 (UV) *flagp); 12961 /* function call leaves parse pointing to the ']', except if we 12962 * faked it */ 12963 if (is_posix_class) { 12964 RExC_parse--; 12965 } 12966 12967 goto handle_operand; 12968 } 12969 12970 case '&': 12971 case '|': 12972 case '+': 12973 case '-': 12974 case '^': 12975 if (top_index < 0 12976 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) 12977 || ! IS_OPERAND(*top_ptr)) 12978 { 12979 RExC_parse++; 12980 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar); 12981 } 12982 av_push(stack, newSVuv(curchar)); 12983 break; 12984 12985 case '!': 12986 av_push(stack, newSVuv(curchar)); 12987 break; 12988 12989 case '(': 12990 if (top_index >= 0) { 12991 top_ptr = av_fetch(stack, top_index, FALSE); 12992 assert(top_ptr); 12993 if (IS_OPERAND(*top_ptr)) { 12994 RExC_parse++; 12995 vFAIL("Unexpected '(' with no preceding operator"); 12996 } 12997 } 12998 av_push(stack, newSVuv(curchar)); 12999 break; 13000 13001 case ')': 13002 { 13003 SV* lparen; 13004 if (top_index < 1 13005 || ! (current = av_pop(stack)) 13006 || ! IS_OPERAND(current) 13007 || ! (lparen = av_pop(stack)) 13008 || IS_OPERAND(lparen) 13009 || SvUV(lparen) != '(') 13010 { 13011 SvREFCNT_dec(current); 13012 RExC_parse++; 13013 vFAIL("Unexpected ')'"); 13014 } 13015 top_index -= 2; 13016 SvREFCNT_dec_NN(lparen); 13017 13018 /* FALL THROUGH */ 13019 } 13020 13021 handle_operand: 13022 13023 /* Here, we have an operand to process, in 'current' */ 13024 13025 if (top_index < 0) { /* Just push if stack is empty */ 13026 av_push(stack, current); 13027 } 13028 else { 13029 SV* top = av_pop(stack); 13030 SV *prev = NULL; 13031 char current_operator; 13032 13033 if (IS_OPERAND(top)) { 13034 SvREFCNT_dec_NN(top); 13035 SvREFCNT_dec_NN(current); 13036 vFAIL("Operand with no preceding operator"); 13037 } 13038 current_operator = (char) SvUV(top); 13039 switch (current_operator) { 13040 case '(': /* Push the '(' back on followed by the new 13041 operand */ 13042 av_push(stack, top); 13043 av_push(stack, current); 13044 SvREFCNT_inc(top); /* Counters the '_dec' done 13045 just after the 'break', so 13046 it doesn't get wrongly freed 13047 */ 13048 break; 13049 13050 case '!': 13051 _invlist_invert(current); 13052 13053 /* Unlike binary operators, the top of the stack, 13054 * now that this unary one has been popped off, may 13055 * legally be an operator, and we now have operand 13056 * for it. */ 13057 top_index--; 13058 SvREFCNT_dec_NN(top); 13059 goto handle_operand; 13060 13061 case '&': 13062 prev = av_pop(stack); 13063 _invlist_intersection(prev, 13064 current, 13065 ¤t); 13066 av_push(stack, current); 13067 break; 13068 13069 case '|': 13070 case '+': 13071 prev = av_pop(stack); 13072 _invlist_union(prev, current, ¤t); 13073 av_push(stack, current); 13074 break; 13075 13076 case '-': 13077 prev = av_pop(stack);; 13078 _invlist_subtract(prev, current, ¤t); 13079 av_push(stack, current); 13080 break; 13081 13082 case '^': /* The union minus the intersection */ 13083 { 13084 SV* i = NULL; 13085 SV* u = NULL; 13086 SV* element; 13087 13088 prev = av_pop(stack); 13089 _invlist_union(prev, current, &u); 13090 _invlist_intersection(prev, current, &i); 13091 /* _invlist_subtract will overwrite current 13092 without freeing what it already contains */ 13093 element = current; 13094 _invlist_subtract(u, i, ¤t); 13095 av_push(stack, current); 13096 SvREFCNT_dec_NN(i); 13097 SvREFCNT_dec_NN(u); 13098 SvREFCNT_dec_NN(element); 13099 break; 13100 } 13101 13102 default: 13103 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); 13104 } 13105 SvREFCNT_dec_NN(top); 13106 SvREFCNT_dec(prev); 13107 } 13108 } 13109 13110 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; 13111 } 13112 13113 if (av_tindex(stack) < 0 /* Was empty */ 13114 || ((final = av_pop(stack)) == NULL) 13115 || ! IS_OPERAND(final) 13116 || av_tindex(stack) >= 0) /* More left on stack */ 13117 { 13118 vFAIL("Incomplete expression within '(?[ ])'"); 13119 } 13120 13121 /* Here, 'final' is the resultant inversion list from evaluating the 13122 * expression. Return it if so requested */ 13123 if (return_invlist) { 13124 *return_invlist = final; 13125 return END; 13126 } 13127 13128 /* Otherwise generate a resultant node, based on 'final'. regclass() is 13129 * expecting a string of ranges and individual code points */ 13130 invlist_iterinit(final); 13131 result_string = newSVpvs(""); 13132 while (invlist_iternext(final, &start, &end)) { 13133 if (start == end) { 13134 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start); 13135 } 13136 else { 13137 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}", 13138 start, end); 13139 } 13140 } 13141 13142 save_parse = RExC_parse; 13143 RExC_parse = SvPV(result_string, len); 13144 save_end = RExC_end; 13145 RExC_end = RExC_parse + len; 13146 13147 /* We turn off folding around the call, as the class we have constructed 13148 * already has all folding taken into consideration, and we don't want 13149 * regclass() to add to that */ 13150 RExC_flags &= ~RXf_PMf_FOLD; 13151 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. 13152 */ 13153 node = regclass(pRExC_state, flagp,depth+1, 13154 FALSE, /* means parse the whole char class */ 13155 FALSE, /* don't allow multi-char folds */ 13156 TRUE, /* silence non-portable warnings. The above may very 13157 well have generated non-portable code points, but 13158 they're valid on this machine */ 13159 NULL); 13160 if (!node) 13161 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, 13162 PTR2UV(flagp)); 13163 if (save_fold) { 13164 RExC_flags |= RXf_PMf_FOLD; 13165 } 13166 RExC_parse = save_parse + 1; 13167 RExC_end = save_end; 13168 SvREFCNT_dec_NN(final); 13169 SvREFCNT_dec_NN(result_string); 13170 13171 nextchar(pRExC_state); 13172 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ 13173 return node; 13174 } 13175 #undef IS_OPERAND 13176 13177 /* The names of properties whose definitions are not known at compile time are 13178 * stored in this SV, after a constant heading. So if the length has been 13179 * changed since initialization, then there is a run-time definition. */ 13180 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ 13181 (SvCUR(listsv) != initial_listsv_len) 13182 13183 STATIC regnode * 13184 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, 13185 const bool stop_at_1, /* Just parse the next thing, don't 13186 look for a full character class */ 13187 bool allow_multi_folds, 13188 const bool silence_non_portable, /* Don't output warnings 13189 about too large 13190 characters */ 13191 SV** ret_invlist) /* Return an inversion list, not a node */ 13192 { 13193 /* parse a bracketed class specification. Most of these will produce an 13194 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an 13195 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex 13196 * under /i with multi-character folds: it will be rewritten following the 13197 * paradigm of this example, where the <multi-fold>s are characters which 13198 * fold to multiple character sequences: 13199 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i 13200 * gets effectively rewritten as: 13201 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i 13202 * reg() gets called (recursively) on the rewritten version, and this 13203 * function will return what it constructs. (Actually the <multi-fold>s 13204 * aren't physically removed from the [abcdefghi], it's just that they are 13205 * ignored in the recursion by means of a flag: 13206 * <RExC_in_multi_char_class>.) 13207 * 13208 * ANYOF nodes contain a bit map for the first 256 characters, with the 13209 * corresponding bit set if that character is in the list. For characters 13210 * above 255, a range list or swash is used. There are extra bits for \w, 13211 * etc. in locale ANYOFs, as what these match is not determinable at 13212 * compile time 13213 * 13214 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs 13215 * to be restarted. This can only happen if ret_invlist is non-NULL. 13216 */ 13217 13218 dVAR; 13219 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; 13220 IV range = 0; 13221 UV value = OOB_UNICODE, save_value = OOB_UNICODE; 13222 regnode *ret; 13223 STRLEN numlen; 13224 IV namedclass = OOB_NAMEDCLASS; 13225 char *rangebegin = NULL; 13226 bool need_class = 0; 13227 SV *listsv = NULL; 13228 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more 13229 than just initialized. */ 13230 SV* properties = NULL; /* Code points that match \p{} \P{} */ 13231 SV* posixes = NULL; /* Code points that match classes like [:word:], 13232 extended beyond the Latin1 range. These have to 13233 be kept separate from other code points for much 13234 of this function because their handling is 13235 different under /i, and for most classes under 13236 /d as well */ 13237 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept 13238 separate for a while from the non-complemented 13239 versions because of complications with /d 13240 matching */ 13241 UV element_count = 0; /* Number of distinct elements in the class. 13242 Optimizations may be possible if this is tiny */ 13243 AV * multi_char_matches = NULL; /* Code points that fold to more than one 13244 character; used under /i */ 13245 UV n; 13246 char * stop_ptr = RExC_end; /* where to stop parsing */ 13247 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white 13248 space? */ 13249 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */ 13250 13251 /* Unicode properties are stored in a swash; this holds the current one 13252 * being parsed. If this swash is the only above-latin1 component of the 13253 * character class, an optimization is to pass it directly on to the 13254 * execution engine. Otherwise, it is set to NULL to indicate that there 13255 * are other things in the class that have to be dealt with at execution 13256 * time */ 13257 SV* swash = NULL; /* Code points that match \p{} \P{} */ 13258 13259 /* Set if a component of this character class is user-defined; just passed 13260 * on to the engine */ 13261 bool has_user_defined_property = FALSE; 13262 13263 /* inversion list of code points this node matches only when the target 13264 * string is in UTF-8. (Because is under /d) */ 13265 SV* depends_list = NULL; 13266 13267 /* Inversion list of code points this node matches regardless of things 13268 * like locale, folding, utf8ness of the target string */ 13269 SV* cp_list = NULL; 13270 13271 /* Like cp_list, but code points on this list need to be checked for things 13272 * that fold to/from them under /i */ 13273 SV* cp_foldable_list = NULL; 13274 13275 /* Like cp_list, but code points on this list are valid only when the 13276 * runtime locale is UTF-8 */ 13277 SV* only_utf8_locale_list = NULL; 13278 13279 #ifdef EBCDIC 13280 /* In a range, counts how many 0-2 of the ends of it came from literals, 13281 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ 13282 UV literal_endpoint = 0; 13283 #endif 13284 bool invert = FALSE; /* Is this class to be complemented */ 13285 13286 bool warn_super = ALWAYS_WARN_SUPER; 13287 13288 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in 13289 case we need to change the emitted regop to an EXACT. */ 13290 const char * orig_parse = RExC_parse; 13291 const SSize_t orig_size = RExC_size; 13292 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ 13293 GET_RE_DEBUG_FLAGS_DECL; 13294 13295 PERL_ARGS_ASSERT_REGCLASS; 13296 #ifndef DEBUGGING 13297 PERL_UNUSED_ARG(depth); 13298 #endif 13299 13300 DEBUG_PARSE("clas"); 13301 13302 /* Assume we are going to generate an ANYOF node. */ 13303 ret = reganode(pRExC_state, ANYOF, 0); 13304 13305 if (SIZE_ONLY) { 13306 RExC_size += ANYOF_SKIP; 13307 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ 13308 } 13309 else { 13310 ANYOF_FLAGS(ret) = 0; 13311 13312 RExC_emit += ANYOF_SKIP; 13313 listsv = newSVpvs_flags("# comment\n", SVs_TEMP); 13314 initial_listsv_len = SvCUR(listsv); 13315 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ 13316 } 13317 13318 if (skip_white) { 13319 RExC_parse = regpatws(pRExC_state, RExC_parse, 13320 FALSE /* means don't recognize comments */); 13321 } 13322 13323 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ 13324 RExC_parse++; 13325 invert = TRUE; 13326 allow_multi_folds = FALSE; 13327 RExC_naughty++; 13328 if (skip_white) { 13329 RExC_parse = regpatws(pRExC_state, RExC_parse, 13330 FALSE /* means don't recognize comments */); 13331 } 13332 } 13333 13334 /* Check that they didn't say [:posix:] instead of [[:posix:]] */ 13335 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) { 13336 const char *s = RExC_parse; 13337 const char c = *s++; 13338 13339 while (isWORDCHAR(*s)) 13340 s++; 13341 if (*s && c == *s && s[1] == ']') { 13342 SAVEFREESV(RExC_rx_sv); 13343 ckWARN3reg(s+2, 13344 "POSIX syntax [%c %c] belongs inside character classes", 13345 c, c); 13346 (void)ReREFCNT_inc(RExC_rx_sv); 13347 } 13348 } 13349 13350 /* If the caller wants us to just parse a single element, accomplish this 13351 * by faking the loop ending condition */ 13352 if (stop_at_1 && RExC_end > RExC_parse) { 13353 stop_ptr = RExC_parse + 1; 13354 } 13355 13356 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */ 13357 if (UCHARAT(RExC_parse) == ']') 13358 goto charclassloop; 13359 13360 parseit: 13361 while (1) { 13362 if (RExC_parse >= stop_ptr) { 13363 break; 13364 } 13365 13366 if (skip_white) { 13367 RExC_parse = regpatws(pRExC_state, RExC_parse, 13368 FALSE /* means don't recognize comments */); 13369 } 13370 13371 if (UCHARAT(RExC_parse) == ']') { 13372 break; 13373 } 13374 13375 charclassloop: 13376 13377 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ 13378 save_value = value; 13379 save_prevvalue = prevvalue; 13380 13381 if (!range) { 13382 rangebegin = RExC_parse; 13383 element_count++; 13384 } 13385 if (UTF) { 13386 value = utf8n_to_uvchr((U8*)RExC_parse, 13387 RExC_end - RExC_parse, 13388 &numlen, UTF8_ALLOW_DEFAULT); 13389 RExC_parse += numlen; 13390 } 13391 else 13392 value = UCHARAT(RExC_parse++); 13393 13394 if (value == '[' 13395 && RExC_parse < RExC_end 13396 && POSIXCC(UCHARAT(RExC_parse))) 13397 { 13398 namedclass = regpposixcc(pRExC_state, value, strict); 13399 } 13400 else if (value == '\\') { 13401 if (UTF) { 13402 value = utf8n_to_uvchr((U8*)RExC_parse, 13403 RExC_end - RExC_parse, 13404 &numlen, UTF8_ALLOW_DEFAULT); 13405 RExC_parse += numlen; 13406 } 13407 else 13408 value = UCHARAT(RExC_parse++); 13409 13410 /* Some compilers cannot handle switching on 64-bit integer 13411 * values, therefore value cannot be an UV. Yes, this will 13412 * be a problem later if we want switch on Unicode. 13413 * A similar issue a little bit later when switching on 13414 * namedclass. --jhi */ 13415 13416 /* If the \ is escaping white space when white space is being 13417 * skipped, it means that that white space is wanted literally, and 13418 * is already in 'value'. Otherwise, need to translate the escape 13419 * into what it signifies. */ 13420 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) { 13421 13422 case 'w': namedclass = ANYOF_WORDCHAR; break; 13423 case 'W': namedclass = ANYOF_NWORDCHAR; break; 13424 case 's': namedclass = ANYOF_SPACE; break; 13425 case 'S': namedclass = ANYOF_NSPACE; break; 13426 case 'd': namedclass = ANYOF_DIGIT; break; 13427 case 'D': namedclass = ANYOF_NDIGIT; break; 13428 case 'v': namedclass = ANYOF_VERTWS; break; 13429 case 'V': namedclass = ANYOF_NVERTWS; break; 13430 case 'h': namedclass = ANYOF_HORIZWS; break; 13431 case 'H': namedclass = ANYOF_NHORIZWS; break; 13432 case 'N': /* Handle \N{NAME} in class */ 13433 { 13434 /* We only pay attention to the first char of 13435 multichar strings being returned. I kinda wonder 13436 if this makes sense as it does change the behaviour 13437 from earlier versions, OTOH that behaviour was broken 13438 as well. */ 13439 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, 13440 TRUE, /* => charclass */ 13441 strict)) 13442 { 13443 if (*flagp & RESTART_UTF8) 13444 FAIL("panic: grok_bslash_N set RESTART_UTF8"); 13445 goto parseit; 13446 } 13447 } 13448 break; 13449 case 'p': 13450 case 'P': 13451 { 13452 char *e; 13453 13454 /* We will handle any undefined properties ourselves */ 13455 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF 13456 /* And we actually would prefer to get 13457 * the straight inversion list of the 13458 * swash, since we will be accessing it 13459 * anyway, to save a little time */ 13460 |_CORE_SWASH_INIT_ACCEPT_INVLIST; 13461 13462 if (RExC_parse >= RExC_end) 13463 vFAIL2("Empty \\%c{}", (U8)value); 13464 if (*RExC_parse == '{') { 13465 const U8 c = (U8)value; 13466 e = strchr(RExC_parse++, '}'); 13467 if (!e) 13468 vFAIL2("Missing right brace on \\%c{}", c); 13469 while (isSPACE(UCHARAT(RExC_parse))) 13470 RExC_parse++; 13471 if (e == RExC_parse) 13472 vFAIL2("Empty \\%c{}", c); 13473 n = e - RExC_parse; 13474 while (isSPACE(UCHARAT(RExC_parse + n - 1))) 13475 n--; 13476 } 13477 else { 13478 e = RExC_parse; 13479 n = 1; 13480 } 13481 if (!SIZE_ONLY) { 13482 SV* invlist; 13483 char* formatted; 13484 char* name; 13485 13486 if (UCHARAT(RExC_parse) == '^') { 13487 RExC_parse++; 13488 n--; 13489 /* toggle. (The rhs xor gets the single bit that 13490 * differs between P and p; the other xor inverts just 13491 * that bit) */ 13492 value ^= 'P' ^ 'p'; 13493 13494 while (isSPACE(UCHARAT(RExC_parse))) { 13495 RExC_parse++; 13496 n--; 13497 } 13498 } 13499 /* Try to get the definition of the property into 13500 * <invlist>. If /i is in effect, the effective property 13501 * will have its name be <__NAME_i>. The design is 13502 * discussed in commit 13503 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ 13504 formatted = Perl_form(aTHX_ 13505 "%s%.*s%s\n", 13506 (FOLD) ? "__" : "", 13507 (int)n, 13508 RExC_parse, 13509 (FOLD) ? "_i" : "" 13510 ); 13511 name = savepvn(formatted, strlen(formatted)); 13512 13513 /* Look up the property name, and get its swash and 13514 * inversion list, if the property is found */ 13515 if (swash) { 13516 SvREFCNT_dec_NN(swash); 13517 } 13518 swash = _core_swash_init("utf8", name, &PL_sv_undef, 13519 1, /* binary */ 13520 0, /* not tr/// */ 13521 NULL, /* No inversion list */ 13522 &swash_init_flags 13523 ); 13524 if (! swash || ! (invlist = _get_swash_invlist(swash))) { 13525 if (swash) { 13526 SvREFCNT_dec_NN(swash); 13527 swash = NULL; 13528 } 13529 13530 /* Here didn't find it. It could be a user-defined 13531 * property that will be available at run-time. If we 13532 * accept only compile-time properties, is an error; 13533 * otherwise add it to the list for run-time look up */ 13534 if (ret_invlist) { 13535 RExC_parse = e + 1; 13536 vFAIL2utf8f( 13537 "Property '%"UTF8f"' is unknown", 13538 UTF8fARG(UTF, n, name)); 13539 } 13540 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", 13541 (value == 'p' ? '+' : '!'), 13542 UTF8fARG(UTF, n, name)); 13543 has_user_defined_property = TRUE; 13544 13545 /* We don't know yet, so have to assume that the 13546 * property could match something in the Latin1 range, 13547 * hence something that isn't utf8. Note that this 13548 * would cause things in <depends_list> to match 13549 * inappropriately, except that any \p{}, including 13550 * this one forces Unicode semantics, which means there 13551 * is no <depends_list> */ 13552 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; 13553 } 13554 else { 13555 13556 /* Here, did get the swash and its inversion list. If 13557 * the swash is from a user-defined property, then this 13558 * whole character class should be regarded as such */ 13559 if (swash_init_flags 13560 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) 13561 { 13562 has_user_defined_property = TRUE; 13563 } 13564 else if 13565 /* We warn on matching an above-Unicode code point 13566 * if the match would return true, except don't 13567 * warn for \p{All}, which has exactly one element 13568 * = 0 */ 13569 (_invlist_contains_cp(invlist, 0x110000) 13570 && (! (_invlist_len(invlist) == 1 13571 && *invlist_array(invlist) == 0))) 13572 { 13573 warn_super = TRUE; 13574 } 13575 13576 13577 /* Invert if asking for the complement */ 13578 if (value == 'P') { 13579 _invlist_union_complement_2nd(properties, 13580 invlist, 13581 &properties); 13582 13583 /* The swash can't be used as-is, because we've 13584 * inverted things; delay removing it to here after 13585 * have copied its invlist above */ 13586 SvREFCNT_dec_NN(swash); 13587 swash = NULL; 13588 } 13589 else { 13590 _invlist_union(properties, invlist, &properties); 13591 } 13592 } 13593 Safefree(name); 13594 } 13595 RExC_parse = e + 1; 13596 namedclass = ANYOF_UNIPROP; /* no official name, but it's 13597 named */ 13598 13599 /* \p means they want Unicode semantics */ 13600 RExC_uni_semantics = 1; 13601 } 13602 break; 13603 case 'n': value = '\n'; break; 13604 case 'r': value = '\r'; break; 13605 case 't': value = '\t'; break; 13606 case 'f': value = '\f'; break; 13607 case 'b': value = '\b'; break; 13608 case 'e': value = ASCII_TO_NATIVE('\033');break; 13609 case 'a': value = '\a'; break; 13610 case 'o': 13611 RExC_parse--; /* function expects to be pointed at the 'o' */ 13612 { 13613 const char* error_msg; 13614 bool valid = grok_bslash_o(&RExC_parse, 13615 &value, 13616 &error_msg, 13617 SIZE_ONLY, /* warnings in pass 13618 1 only */ 13619 strict, 13620 silence_non_portable, 13621 UTF); 13622 if (! valid) { 13623 vFAIL(error_msg); 13624 } 13625 } 13626 if (PL_encoding && value < 0x100) { 13627 goto recode_encoding; 13628 } 13629 break; 13630 case 'x': 13631 RExC_parse--; /* function expects to be pointed at the 'x' */ 13632 { 13633 const char* error_msg; 13634 bool valid = grok_bslash_x(&RExC_parse, 13635 &value, 13636 &error_msg, 13637 TRUE, /* Output warnings */ 13638 strict, 13639 silence_non_portable, 13640 UTF); 13641 if (! valid) { 13642 vFAIL(error_msg); 13643 } 13644 } 13645 if (PL_encoding && value < 0x100) 13646 goto recode_encoding; 13647 break; 13648 case 'c': 13649 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); 13650 break; 13651 case '0': case '1': case '2': case '3': case '4': 13652 case '5': case '6': case '7': 13653 { 13654 /* Take 1-3 octal digits */ 13655 I32 flags = PERL_SCAN_SILENT_ILLDIGIT; 13656 numlen = (strict) ? 4 : 3; 13657 value = grok_oct(--RExC_parse, &numlen, &flags, NULL); 13658 RExC_parse += numlen; 13659 if (numlen != 3) { 13660 if (strict) { 13661 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; 13662 vFAIL("Need exactly 3 octal digits"); 13663 } 13664 else if (! SIZE_ONLY /* like \08, \178 */ 13665 && numlen < 3 13666 && RExC_parse < RExC_end 13667 && isDIGIT(*RExC_parse) 13668 && ckWARN(WARN_REGEXP)) 13669 { 13670 SAVEFREESV(RExC_rx_sv); 13671 reg_warn_non_literal_string( 13672 RExC_parse + 1, 13673 form_short_octal_warning(RExC_parse, numlen)); 13674 (void)ReREFCNT_inc(RExC_rx_sv); 13675 } 13676 } 13677 if (PL_encoding && value < 0x100) 13678 goto recode_encoding; 13679 break; 13680 } 13681 recode_encoding: 13682 if (! RExC_override_recoding) { 13683 SV* enc = PL_encoding; 13684 value = reg_recode((const char)(U8)value, &enc); 13685 if (!enc) { 13686 if (strict) { 13687 vFAIL("Invalid escape in the specified encoding"); 13688 } 13689 else if (SIZE_ONLY) { 13690 ckWARNreg(RExC_parse, 13691 "Invalid escape in the specified encoding"); 13692 } 13693 } 13694 break; 13695 } 13696 default: 13697 /* Allow \_ to not give an error */ 13698 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') { 13699 if (strict) { 13700 vFAIL2("Unrecognized escape \\%c in character class", 13701 (int)value); 13702 } 13703 else { 13704 SAVEFREESV(RExC_rx_sv); 13705 ckWARN2reg(RExC_parse, 13706 "Unrecognized escape \\%c in character class passed through", 13707 (int)value); 13708 (void)ReREFCNT_inc(RExC_rx_sv); 13709 } 13710 } 13711 break; 13712 } /* End of switch on char following backslash */ 13713 } /* end of handling backslash escape sequences */ 13714 #ifdef EBCDIC 13715 else 13716 literal_endpoint++; 13717 #endif 13718 13719 /* Here, we have the current token in 'value' */ 13720 13721 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ 13722 U8 classnum; 13723 13724 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a 13725 * literal, as is the character that began the false range, i.e. 13726 * the 'a' in the examples */ 13727 if (range) { 13728 if (!SIZE_ONLY) { 13729 const int w = (RExC_parse >= rangebegin) 13730 ? RExC_parse - rangebegin 13731 : 0; 13732 if (strict) { 13733 vFAIL2utf8f( 13734 "False [] range \"%"UTF8f"\"", 13735 UTF8fARG(UTF, w, rangebegin)); 13736 } 13737 else { 13738 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ 13739 ckWARN2reg(RExC_parse, 13740 "False [] range \"%"UTF8f"\"", 13741 UTF8fARG(UTF, w, rangebegin)); 13742 (void)ReREFCNT_inc(RExC_rx_sv); 13743 cp_list = add_cp_to_invlist(cp_list, '-'); 13744 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, 13745 prevvalue); 13746 } 13747 } 13748 13749 range = 0; /* this was not a true range */ 13750 element_count += 2; /* So counts for three values */ 13751 } 13752 13753 classnum = namedclass_to_classnum(namedclass); 13754 13755 if (LOC && namedclass < ANYOF_POSIXL_MAX 13756 #ifndef HAS_ISASCII 13757 && classnum != _CC_ASCII 13758 #endif 13759 ) { 13760 /* What the Posix classes (like \w, [:space:]) match in locale 13761 * isn't knowable under locale until actual match time. Room 13762 * must be reserved (one time per outer bracketed class) to 13763 * store such classes. The space will contain a bit for each 13764 * named class that is to be matched against. This isn't 13765 * needed for \p{} and pseudo-classes, as they are not affected 13766 * by locale, and hence are dealt with separately */ 13767 if (! need_class) { 13768 need_class = 1; 13769 if (SIZE_ONLY) { 13770 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; 13771 } 13772 else { 13773 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; 13774 } 13775 ANYOF_FLAGS(ret) |= ANYOF_POSIXL; 13776 ANYOF_POSIXL_ZERO(ret); 13777 } 13778 13779 /* See if it already matches the complement of this POSIX 13780 * class */ 13781 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) 13782 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) 13783 ? -1 13784 : 1))) 13785 { 13786 posixl_matches_all = TRUE; 13787 break; /* No need to continue. Since it matches both 13788 e.g., \w and \W, it matches everything, and the 13789 bracketed class can be optimized into qr/./s */ 13790 } 13791 13792 /* Add this class to those that should be checked at runtime */ 13793 ANYOF_POSIXL_SET(ret, namedclass); 13794 13795 /* The above-Latin1 characters are not subject to locale rules. 13796 * Just add them, in the second pass, to the 13797 * unconditionally-matched list */ 13798 if (! SIZE_ONLY) { 13799 SV* scratch_list = NULL; 13800 13801 /* Get the list of the above-Latin1 code points this 13802 * matches */ 13803 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, 13804 PL_XPosix_ptrs[classnum], 13805 13806 /* Odd numbers are complements, like 13807 * NDIGIT, NASCII, ... */ 13808 namedclass % 2 != 0, 13809 &scratch_list); 13810 /* Checking if 'cp_list' is NULL first saves an extra 13811 * clone. Its reference count will be decremented at the 13812 * next union, etc, or if this is the only instance, at the 13813 * end of the routine */ 13814 if (! cp_list) { 13815 cp_list = scratch_list; 13816 } 13817 else { 13818 _invlist_union(cp_list, scratch_list, &cp_list); 13819 SvREFCNT_dec_NN(scratch_list); 13820 } 13821 continue; /* Go get next character */ 13822 } 13823 } 13824 else if (! SIZE_ONLY) { 13825 13826 /* Here, not in pass1 (in that pass we skip calculating the 13827 * contents of this class), and is /l, or is a POSIX class for 13828 * which /l doesn't matter (or is a Unicode property, which is 13829 * skipped here). */ 13830 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ 13831 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ 13832 13833 /* Here, should be \h, \H, \v, or \V. None of /d, /i 13834 * nor /l make a difference in what these match, 13835 * therefore we just add what they match to cp_list. */ 13836 if (classnum != _CC_VERTSPACE) { 13837 assert( namedclass == ANYOF_HORIZWS 13838 || namedclass == ANYOF_NHORIZWS); 13839 13840 /* It turns out that \h is just a synonym for 13841 * XPosixBlank */ 13842 classnum = _CC_BLANK; 13843 } 13844 13845 _invlist_union_maybe_complement_2nd( 13846 cp_list, 13847 PL_XPosix_ptrs[classnum], 13848 namedclass % 2 != 0, /* Complement if odd 13849 (NHORIZWS, NVERTWS) 13850 */ 13851 &cp_list); 13852 } 13853 } 13854 else { /* Garden variety class. If is NASCII, NDIGIT, ... 13855 complement and use nposixes */ 13856 SV** posixes_ptr = namedclass % 2 == 0 13857 ? &posixes 13858 : &nposixes; 13859 SV** source_ptr = &PL_XPosix_ptrs[classnum]; 13860 _invlist_union_maybe_complement_2nd( 13861 *posixes_ptr, 13862 *source_ptr, 13863 namedclass % 2 != 0, 13864 posixes_ptr); 13865 } 13866 continue; /* Go get next character */ 13867 } 13868 } /* end of namedclass \blah */ 13869 13870 /* Here, we have a single value. If 'range' is set, it is the ending 13871 * of a range--check its validity. Later, we will handle each 13872 * individual code point in the range. If 'range' isn't set, this 13873 * could be the beginning of a range, so check for that by looking 13874 * ahead to see if the next real character to be processed is the range 13875 * indicator--the minus sign */ 13876 13877 if (skip_white) { 13878 RExC_parse = regpatws(pRExC_state, RExC_parse, 13879 FALSE /* means don't recognize comments */); 13880 } 13881 13882 if (range) { 13883 if (prevvalue > value) /* b-a */ { 13884 const int w = RExC_parse - rangebegin; 13885 vFAIL2utf8f( 13886 "Invalid [] range \"%"UTF8f"\"", 13887 UTF8fARG(UTF, w, rangebegin)); 13888 range = 0; /* not a valid range */ 13889 } 13890 } 13891 else { 13892 prevvalue = value; /* save the beginning of the potential range */ 13893 if (! stop_at_1 /* Can't be a range if parsing just one thing */ 13894 && *RExC_parse == '-') 13895 { 13896 char* next_char_ptr = RExC_parse + 1; 13897 if (skip_white) { /* Get the next real char after the '-' */ 13898 next_char_ptr = regpatws(pRExC_state, 13899 RExC_parse + 1, 13900 FALSE); /* means don't recognize 13901 comments */ 13902 } 13903 13904 /* If the '-' is at the end of the class (just before the ']', 13905 * it is a literal minus; otherwise it is a range */ 13906 if (next_char_ptr < RExC_end && *next_char_ptr != ']') { 13907 RExC_parse = next_char_ptr; 13908 13909 /* a bad range like \w-, [:word:]- ? */ 13910 if (namedclass > OOB_NAMEDCLASS) { 13911 if (strict || ckWARN(WARN_REGEXP)) { 13912 const int w = 13913 RExC_parse >= rangebegin ? 13914 RExC_parse - rangebegin : 0; 13915 if (strict) { 13916 vFAIL4("False [] range \"%*.*s\"", 13917 w, w, rangebegin); 13918 } 13919 else { 13920 vWARN4(RExC_parse, 13921 "False [] range \"%*.*s\"", 13922 w, w, rangebegin); 13923 } 13924 } 13925 if (!SIZE_ONLY) { 13926 cp_list = add_cp_to_invlist(cp_list, '-'); 13927 } 13928 element_count++; 13929 } else 13930 range = 1; /* yeah, it's a range! */ 13931 continue; /* but do it the next time */ 13932 } 13933 } 13934 } 13935 13936 /* Here, <prevvalue> is the beginning of the range, if any; or <value> 13937 * if not */ 13938 13939 /* non-Latin1 code point implies unicode semantics. Must be set in 13940 * pass1 so is there for the whole of pass 2 */ 13941 if (value > 255) { 13942 RExC_uni_semantics = 1; 13943 } 13944 13945 /* Ready to process either the single value, or the completed range. 13946 * For single-valued non-inverted ranges, we consider the possibility 13947 * of multi-char folds. (We made a conscious decision to not do this 13948 * for the other cases because it can often lead to non-intuitive 13949 * results. For example, you have the peculiar case that: 13950 * "s s" =~ /^[^\xDF]+$/i => Y 13951 * "ss" =~ /^[^\xDF]+$/i => N 13952 * 13953 * See [perl #89750] */ 13954 if (FOLD && allow_multi_folds && value == prevvalue) { 13955 if (value == LATIN_SMALL_LETTER_SHARP_S 13956 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, 13957 value))) 13958 { 13959 /* Here <value> is indeed a multi-char fold. Get what it is */ 13960 13961 U8 foldbuf[UTF8_MAXBYTES_CASE]; 13962 STRLEN foldlen; 13963 13964 UV folded = _to_uni_fold_flags( 13965 value, 13966 foldbuf, 13967 &foldlen, 13968 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED 13969 ? FOLD_FLAGS_NOMIX_ASCII 13970 : 0) 13971 ); 13972 13973 /* Here, <folded> should be the first character of the 13974 * multi-char fold of <value>, with <foldbuf> containing the 13975 * whole thing. But, if this fold is not allowed (because of 13976 * the flags), <fold> will be the same as <value>, and should 13977 * be processed like any other character, so skip the special 13978 * handling */ 13979 if (folded != value) { 13980 13981 /* Skip if we are recursed, currently parsing the class 13982 * again. Otherwise add this character to the list of 13983 * multi-char folds. */ 13984 if (! RExC_in_multi_char_class) { 13985 AV** this_array_ptr; 13986 AV* this_array; 13987 STRLEN cp_count = utf8_length(foldbuf, 13988 foldbuf + foldlen); 13989 SV* multi_fold = sv_2mortal(newSVpvn("", 0)); 13990 13991 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); 13992 13993 13994 if (! multi_char_matches) { 13995 multi_char_matches = newAV(); 13996 } 13997 13998 /* <multi_char_matches> is actually an array of arrays. 13999 * There will be one or two top-level elements: [2], 14000 * and/or [3]. The [2] element is an array, each 14001 * element thereof is a character which folds to TWO 14002 * characters; [3] is for folds to THREE characters. 14003 * (Unicode guarantees a maximum of 3 characters in any 14004 * fold.) When we rewrite the character class below, 14005 * we will do so such that the longest folds are 14006 * written first, so that it prefers the longest 14007 * matching strings first. This is done even if it 14008 * turns out that any quantifier is non-greedy, out of 14009 * programmer laziness. Tom Christiansen has agreed 14010 * that this is ok. This makes the test for the 14011 * ligature 'ffi' come before the test for 'ff' */ 14012 if (av_exists(multi_char_matches, cp_count)) { 14013 this_array_ptr = (AV**) av_fetch(multi_char_matches, 14014 cp_count, FALSE); 14015 this_array = *this_array_ptr; 14016 } 14017 else { 14018 this_array = newAV(); 14019 av_store(multi_char_matches, cp_count, 14020 (SV*) this_array); 14021 } 14022 av_push(this_array, multi_fold); 14023 } 14024 14025 /* This element should not be processed further in this 14026 * class */ 14027 element_count--; 14028 value = save_value; 14029 prevvalue = save_prevvalue; 14030 continue; 14031 } 14032 } 14033 } 14034 14035 /* Deal with this element of the class */ 14036 if (! SIZE_ONLY) { 14037 #ifndef EBCDIC 14038 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 14039 prevvalue, value); 14040 #else 14041 SV* this_range = _new_invlist(1); 14042 _append_range_to_invlist(this_range, prevvalue, value); 14043 14044 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous. 14045 * If this range was specified using something like 'i-j', we want 14046 * to include only the 'i' and the 'j', and not anything in 14047 * between, so exclude non-ASCII, non-alphabetics from it. 14048 * However, if the range was specified with something like 14049 * [\x89-\x91] or [\x89-j], all code points within it should be 14050 * included. literal_endpoint==2 means both ends of the range used 14051 * a literal character, not \x{foo} */ 14052 if (literal_endpoint == 2 14053 && ((prevvalue >= 'a' && value <= 'z') 14054 || (prevvalue >= 'A' && value <= 'Z'))) 14055 { 14056 _invlist_intersection(this_range, PL_ASCII, 14057 &this_range); 14058 14059 /* Since this above only contains ascii, the intersection of it 14060 * with anything will still yield only ascii */ 14061 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], 14062 &this_range); 14063 } 14064 _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); 14065 literal_endpoint = 0; 14066 #endif 14067 } 14068 14069 range = 0; /* this range (if it was one) is done now */ 14070 } /* End of loop through all the text within the brackets */ 14071 14072 /* If anything in the class expands to more than one character, we have to 14073 * deal with them by building up a substitute parse string, and recursively 14074 * calling reg() on it, instead of proceeding */ 14075 if (multi_char_matches) { 14076 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP); 14077 I32 cp_count; 14078 STRLEN len; 14079 char *save_end = RExC_end; 14080 char *save_parse = RExC_parse; 14081 bool first_time = TRUE; /* First multi-char occurrence doesn't get 14082 a "|" */ 14083 I32 reg_flags; 14084 14085 assert(! invert); 14086 #if 0 /* Have decided not to deal with multi-char folds in inverted classes, 14087 because too confusing */ 14088 if (invert) { 14089 sv_catpv(substitute_parse, "(?:"); 14090 } 14091 #endif 14092 14093 /* Look at the longest folds first */ 14094 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { 14095 14096 if (av_exists(multi_char_matches, cp_count)) { 14097 AV** this_array_ptr; 14098 SV* this_sequence; 14099 14100 this_array_ptr = (AV**) av_fetch(multi_char_matches, 14101 cp_count, FALSE); 14102 while ((this_sequence = av_pop(*this_array_ptr)) != 14103 &PL_sv_undef) 14104 { 14105 if (! first_time) { 14106 sv_catpv(substitute_parse, "|"); 14107 } 14108 first_time = FALSE; 14109 14110 sv_catpv(substitute_parse, SvPVX(this_sequence)); 14111 } 14112 } 14113 } 14114 14115 /* If the character class contains anything else besides these 14116 * multi-character folds, have to include it in recursive parsing */ 14117 if (element_count) { 14118 sv_catpv(substitute_parse, "|["); 14119 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); 14120 sv_catpv(substitute_parse, "]"); 14121 } 14122 14123 sv_catpv(substitute_parse, ")"); 14124 #if 0 14125 if (invert) { 14126 /* This is a way to get the parse to skip forward a whole named 14127 * sequence instead of matching the 2nd character when it fails the 14128 * first */ 14129 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)"); 14130 } 14131 #endif 14132 14133 RExC_parse = SvPV(substitute_parse, len); 14134 RExC_end = RExC_parse + len; 14135 RExC_in_multi_char_class = 1; 14136 RExC_emit = (regnode *)orig_emit; 14137 14138 ret = reg(pRExC_state, 1, ®_flags, depth+1); 14139 14140 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8); 14141 14142 RExC_parse = save_parse; 14143 RExC_end = save_end; 14144 RExC_in_multi_char_class = 0; 14145 SvREFCNT_dec_NN(multi_char_matches); 14146 return ret; 14147 } 14148 14149 /* Here, we've gone through the entire class and dealt with multi-char 14150 * folds. We are now in a position that we can do some checks to see if we 14151 * can optimize this ANYOF node into a simpler one, even in Pass 1. 14152 * Currently we only do two checks: 14153 * 1) is in the unlikely event that the user has specified both, eg. \w and 14154 * \W under /l, then the class matches everything. (This optimization 14155 * is done only to make the optimizer code run later work.) 14156 * 2) if the character class contains only a single element (including a 14157 * single range), we see if there is an equivalent node for it. 14158 * Other checks are possible */ 14159 if (! ret_invlist /* Can't optimize if returning the constructed 14160 inversion list */ 14161 && (UNLIKELY(posixl_matches_all) || element_count == 1)) 14162 { 14163 U8 op = END; 14164 U8 arg = 0; 14165 14166 if (UNLIKELY(posixl_matches_all)) { 14167 op = SANY; 14168 } 14169 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like 14170 \w or [:digit:] or \p{foo} 14171 */ 14172 14173 /* All named classes are mapped into POSIXish nodes, with its FLAG 14174 * argument giving which class it is */ 14175 switch ((I32)namedclass) { 14176 case ANYOF_UNIPROP: 14177 break; 14178 14179 /* These don't depend on the charset modifiers. They always 14180 * match under /u rules */ 14181 case ANYOF_NHORIZWS: 14182 case ANYOF_HORIZWS: 14183 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS; 14184 /* FALLTHROUGH */ 14185 14186 case ANYOF_NVERTWS: 14187 case ANYOF_VERTWS: 14188 op = POSIXU; 14189 goto join_posix; 14190 14191 /* The actual POSIXish node for all the rest depends on the 14192 * charset modifier. The ones in the first set depend only on 14193 * ASCII or, if available on this platform, locale */ 14194 case ANYOF_ASCII: 14195 case ANYOF_NASCII: 14196 #ifdef HAS_ISASCII 14197 op = (LOC) ? POSIXL : POSIXA; 14198 #else 14199 op = POSIXA; 14200 #endif 14201 goto join_posix; 14202 14203 case ANYOF_NCASED: 14204 case ANYOF_LOWER: 14205 case ANYOF_NLOWER: 14206 case ANYOF_UPPER: 14207 case ANYOF_NUPPER: 14208 /* under /a could be alpha */ 14209 if (FOLD) { 14210 if (ASCII_RESTRICTED) { 14211 namedclass = ANYOF_ALPHA + (namedclass % 2); 14212 } 14213 else if (! LOC) { 14214 break; 14215 } 14216 } 14217 /* FALLTHROUGH */ 14218 14219 /* The rest have more possibilities depending on the charset. 14220 * We take advantage of the enum ordering of the charset 14221 * modifiers to get the exact node type, */ 14222 default: 14223 op = POSIXD + get_regex_charset(RExC_flags); 14224 if (op > POSIXA) { /* /aa is same as /a */ 14225 op = POSIXA; 14226 } 14227 14228 join_posix: 14229 /* The odd numbered ones are the complements of the 14230 * next-lower even number one */ 14231 if (namedclass % 2 == 1) { 14232 invert = ! invert; 14233 namedclass--; 14234 } 14235 arg = namedclass_to_classnum(namedclass); 14236 break; 14237 } 14238 } 14239 else if (value == prevvalue) { 14240 14241 /* Here, the class consists of just a single code point */ 14242 14243 if (invert) { 14244 if (! LOC && value == '\n') { 14245 op = REG_ANY; /* Optimize [^\n] */ 14246 *flagp |= HASWIDTH|SIMPLE; 14247 RExC_naughty++; 14248 } 14249 } 14250 else if (value < 256 || UTF) { 14251 14252 /* Optimize a single value into an EXACTish node, but not if it 14253 * would require converting the pattern to UTF-8. */ 14254 op = compute_EXACTish(pRExC_state); 14255 } 14256 } /* Otherwise is a range */ 14257 else if (! LOC) { /* locale could vary these */ 14258 if (prevvalue == '0') { 14259 if (value == '9') { 14260 arg = _CC_DIGIT; 14261 op = POSIXA; 14262 } 14263 } 14264 } 14265 14266 /* Here, we have changed <op> away from its initial value iff we found 14267 * an optimization */ 14268 if (op != END) { 14269 14270 /* Throw away this ANYOF regnode, and emit the calculated one, 14271 * which should correspond to the beginning, not current, state of 14272 * the parse */ 14273 const char * cur_parse = RExC_parse; 14274 RExC_parse = (char *)orig_parse; 14275 if ( SIZE_ONLY) { 14276 if (! LOC) { 14277 14278 /* To get locale nodes to not use the full ANYOF size would 14279 * require moving the code above that writes the portions 14280 * of it that aren't in other nodes to after this point. 14281 * e.g. ANYOF_POSIXL_SET */ 14282 RExC_size = orig_size; 14283 } 14284 } 14285 else { 14286 RExC_emit = (regnode *)orig_emit; 14287 if (PL_regkind[op] == POSIXD) { 14288 if (op == POSIXL) { 14289 RExC_contains_locale = 1; 14290 } 14291 if (invert) { 14292 op += NPOSIXD - POSIXD; 14293 } 14294 } 14295 } 14296 14297 ret = reg_node(pRExC_state, op); 14298 14299 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { 14300 if (! SIZE_ONLY) { 14301 FLAGS(ret) = arg; 14302 } 14303 *flagp |= HASWIDTH|SIMPLE; 14304 } 14305 else if (PL_regkind[op] == EXACT) { 14306 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, 14307 TRUE /* downgradable to EXACT */ 14308 ); 14309 } 14310 14311 RExC_parse = (char *) cur_parse; 14312 14313 SvREFCNT_dec(posixes); 14314 SvREFCNT_dec(nposixes); 14315 SvREFCNT_dec(cp_list); 14316 SvREFCNT_dec(cp_foldable_list); 14317 return ret; 14318 } 14319 } 14320 14321 if (SIZE_ONLY) 14322 return ret; 14323 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/ 14324 14325 /* If folding, we calculate all characters that could fold to or from the 14326 * ones already on the list */ 14327 if (cp_foldable_list) { 14328 if (FOLD) { 14329 UV start, end; /* End points of code point ranges */ 14330 14331 SV* fold_intersection = NULL; 14332 SV** use_list; 14333 14334 /* Our calculated list will be for Unicode rules. For locale 14335 * matching, we have to keep a separate list that is consulted at 14336 * runtime only when the locale indicates Unicode rules. For 14337 * non-locale, we just use to the general list */ 14338 if (LOC) { 14339 use_list = &only_utf8_locale_list; 14340 } 14341 else { 14342 use_list = &cp_list; 14343 } 14344 14345 /* Only the characters in this class that participate in folds need 14346 * be checked. Get the intersection of this class and all the 14347 * possible characters that are foldable. This can quickly narrow 14348 * down a large class */ 14349 _invlist_intersection(PL_utf8_foldable, cp_foldable_list, 14350 &fold_intersection); 14351 14352 /* The folds for all the Latin1 characters are hard-coded into this 14353 * program, but we have to go out to disk to get the others. */ 14354 if (invlist_highest(cp_foldable_list) >= 256) { 14355 14356 /* This is a hash that for a particular fold gives all 14357 * characters that are involved in it */ 14358 if (! PL_utf8_foldclosures) { 14359 14360 /* If the folds haven't been read in, call a fold function 14361 * to force that */ 14362 if (! PL_utf8_tofold) { 14363 U8 dummy[UTF8_MAXBYTES_CASE+1]; 14364 14365 /* This string is just a short named one above \xff */ 14366 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); 14367 assert(PL_utf8_tofold); /* Verify that worked */ 14368 } 14369 PL_utf8_foldclosures 14370 = _swash_inversion_hash(PL_utf8_tofold); 14371 } 14372 } 14373 14374 /* Now look at the foldable characters in this class individually */ 14375 invlist_iterinit(fold_intersection); 14376 while (invlist_iternext(fold_intersection, &start, &end)) { 14377 UV j; 14378 14379 /* Look at every character in the range */ 14380 for (j = start; j <= end; j++) { 14381 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; 14382 STRLEN foldlen; 14383 SV** listp; 14384 14385 if (j < 256) { 14386 14387 /* We have the latin1 folding rules hard-coded here so 14388 * that an innocent-looking character class, like 14389 * /[ks]/i won't have to go out to disk to find the 14390 * possible matches. XXX It would be better to 14391 * generate these via regen, in case a new version of 14392 * the Unicode standard adds new mappings, though that 14393 * is not really likely, and may be caught by the 14394 * default: case of the switch below. */ 14395 14396 if (IS_IN_SOME_FOLD_L1(j)) { 14397 14398 /* ASCII is always matched; non-ASCII is matched 14399 * only under Unicode rules (which could happen 14400 * under /l if the locale is a UTF-8 one */ 14401 if (isASCII(j) || ! DEPENDS_SEMANTICS) { 14402 *use_list = add_cp_to_invlist(*use_list, 14403 PL_fold_latin1[j]); 14404 } 14405 else { 14406 depends_list = 14407 add_cp_to_invlist(depends_list, 14408 PL_fold_latin1[j]); 14409 } 14410 } 14411 14412 if (HAS_NONLATIN1_FOLD_CLOSURE(j) 14413 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) 14414 { 14415 /* Certain Latin1 characters have matches outside 14416 * Latin1. To get here, <j> is one of those 14417 * characters. None of these matches is valid for 14418 * ASCII characters under /aa, which is why the 'if' 14419 * just above excludes those. These matches only 14420 * happen when the target string is utf8. The code 14421 * below adds the single fold closures for <j> to the 14422 * inversion list. */ 14423 14424 switch (j) { 14425 case 'k': 14426 case 'K': 14427 *use_list = 14428 add_cp_to_invlist(*use_list, KELVIN_SIGN); 14429 break; 14430 case 's': 14431 case 'S': 14432 *use_list = add_cp_to_invlist(*use_list, 14433 LATIN_SMALL_LETTER_LONG_S); 14434 break; 14435 case MICRO_SIGN: 14436 *use_list = add_cp_to_invlist(*use_list, 14437 GREEK_CAPITAL_LETTER_MU); 14438 *use_list = add_cp_to_invlist(*use_list, 14439 GREEK_SMALL_LETTER_MU); 14440 break; 14441 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: 14442 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: 14443 *use_list = 14444 add_cp_to_invlist(*use_list, ANGSTROM_SIGN); 14445 break; 14446 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: 14447 *use_list = add_cp_to_invlist(*use_list, 14448 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); 14449 break; 14450 case LATIN_SMALL_LETTER_SHARP_S: 14451 *use_list = add_cp_to_invlist(*use_list, 14452 LATIN_CAPITAL_LETTER_SHARP_S); 14453 break; 14454 case 'F': case 'f': 14455 case 'I': case 'i': 14456 case 'L': case 'l': 14457 case 'T': case 't': 14458 case 'A': case 'a': 14459 case 'H': case 'h': 14460 case 'J': case 'j': 14461 case 'N': case 'n': 14462 case 'W': case 'w': 14463 case 'Y': case 'y': 14464 /* These all are targets of multi-character 14465 * folds from code points that require UTF8 14466 * to express, so they can't match unless 14467 * the target string is in UTF-8, so no 14468 * action here is necessary, as regexec.c 14469 * properly handles the general case for 14470 * UTF-8 matching and multi-char folds */ 14471 break; 14472 default: 14473 /* Use deprecated warning to increase the 14474 * chances of this being output */ 14475 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); 14476 break; 14477 } 14478 } 14479 continue; 14480 } 14481 14482 /* Here is an above Latin1 character. We don't have the 14483 * rules hard-coded for it. First, get its fold. This is 14484 * the simple fold, as the multi-character folds have been 14485 * handled earlier and separated out */ 14486 _to_uni_fold_flags(j, foldbuf, &foldlen, 14487 (ASCII_FOLD_RESTRICTED) 14488 ? FOLD_FLAGS_NOMIX_ASCII 14489 : 0); 14490 14491 /* Single character fold of above Latin1. Add everything in 14492 * its fold closure to the list that this node should match. 14493 * The fold closures data structure is a hash with the keys 14494 * being the UTF-8 of every character that is folded to, like 14495 * 'k', and the values each an array of all code points that 14496 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. 14497 * Multi-character folds are not included */ 14498 if ((listp = hv_fetch(PL_utf8_foldclosures, 14499 (char *) foldbuf, foldlen, FALSE))) 14500 { 14501 AV* list = (AV*) *listp; 14502 IV k; 14503 for (k = 0; k <= av_tindex(list); k++) { 14504 SV** c_p = av_fetch(list, k, FALSE); 14505 UV c; 14506 if (c_p == NULL) { 14507 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); 14508 } 14509 c = SvUV(*c_p); 14510 14511 /* /aa doesn't allow folds between ASCII and non- */ 14512 if ((ASCII_FOLD_RESTRICTED 14513 && (isASCII(c) != isASCII(j)))) 14514 { 14515 continue; 14516 } 14517 14518 /* Folds under /l which cross the 255/256 boundary 14519 * are added to a separate list. (These are valid 14520 * only when the locale is UTF-8.) */ 14521 if (c < 256 && LOC) { 14522 *use_list = add_cp_to_invlist(*use_list, c); 14523 continue; 14524 } 14525 14526 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) 14527 { 14528 cp_list = add_cp_to_invlist(cp_list, c); 14529 } 14530 else { 14531 /* Similarly folds involving non-ascii Latin1 14532 * characters under /d are added to their list */ 14533 depends_list = add_cp_to_invlist(depends_list, 14534 c); 14535 } 14536 } 14537 } 14538 } 14539 } 14540 SvREFCNT_dec_NN(fold_intersection); 14541 } 14542 14543 /* Now that we have finished adding all the folds, there is no reason 14544 * to keep the foldable list separate */ 14545 _invlist_union(cp_list, cp_foldable_list, &cp_list); 14546 SvREFCNT_dec_NN(cp_foldable_list); 14547 } 14548 14549 /* And combine the result (if any) with any inversion list from posix 14550 * classes. The lists are kept separate up to now because we don't want to 14551 * fold the classes (folding of those is automatically handled by the swash 14552 * fetching code) */ 14553 if (posixes || nposixes) { 14554 if (posixes && AT_LEAST_ASCII_RESTRICTED) { 14555 /* Under /a and /aa, nothing above ASCII matches these */ 14556 _invlist_intersection(posixes, 14557 PL_XPosix_ptrs[_CC_ASCII], 14558 &posixes); 14559 } 14560 if (nposixes) { 14561 if (DEPENDS_SEMANTICS) { 14562 /* Under /d, everything in the upper half of the Latin1 range 14563 * matches these complements */ 14564 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; 14565 } 14566 else if (AT_LEAST_ASCII_RESTRICTED) { 14567 /* Under /a and /aa, everything above ASCII matches these 14568 * complements */ 14569 _invlist_union_complement_2nd(nposixes, 14570 PL_XPosix_ptrs[_CC_ASCII], 14571 &nposixes); 14572 } 14573 if (posixes) { 14574 _invlist_union(posixes, nposixes, &posixes); 14575 SvREFCNT_dec_NN(nposixes); 14576 } 14577 else { 14578 posixes = nposixes; 14579 } 14580 } 14581 if (! DEPENDS_SEMANTICS) { 14582 if (cp_list) { 14583 _invlist_union(cp_list, posixes, &cp_list); 14584 SvREFCNT_dec_NN(posixes); 14585 } 14586 else { 14587 cp_list = posixes; 14588 } 14589 } 14590 else { 14591 /* Under /d, we put into a separate list the Latin1 things that 14592 * match only when the target string is utf8 */ 14593 SV* nonascii_but_latin1_properties = NULL; 14594 _invlist_intersection(posixes, PL_UpperLatin1, 14595 &nonascii_but_latin1_properties); 14596 _invlist_subtract(posixes, nonascii_but_latin1_properties, 14597 &posixes); 14598 if (cp_list) { 14599 _invlist_union(cp_list, posixes, &cp_list); 14600 SvREFCNT_dec_NN(posixes); 14601 } 14602 else { 14603 cp_list = posixes; 14604 } 14605 14606 if (depends_list) { 14607 _invlist_union(depends_list, nonascii_but_latin1_properties, 14608 &depends_list); 14609 SvREFCNT_dec_NN(nonascii_but_latin1_properties); 14610 } 14611 else { 14612 depends_list = nonascii_but_latin1_properties; 14613 } 14614 } 14615 } 14616 14617 /* And combine the result (if any) with any inversion list from properties. 14618 * The lists are kept separate up to now so that we can distinguish the two 14619 * in regards to matching above-Unicode. A run-time warning is generated 14620 * if a Unicode property is matched against a non-Unicode code point. But, 14621 * we allow user-defined properties to match anything, without any warning, 14622 * and we also suppress the warning if there is a portion of the character 14623 * class that isn't a Unicode property, and which matches above Unicode, \W 14624 * or [\x{110000}] for example. 14625 * (Note that in this case, unlike the Posix one above, there is no 14626 * <depends_list>, because having a Unicode property forces Unicode 14627 * semantics */ 14628 if (properties) { 14629 if (cp_list) { 14630 14631 /* If it matters to the final outcome, see if a non-property 14632 * component of the class matches above Unicode. If so, the 14633 * warning gets suppressed. This is true even if just a single 14634 * such code point is specified, as though not strictly correct if 14635 * another such code point is matched against, the fact that they 14636 * are using above-Unicode code points indicates they should know 14637 * the issues involved */ 14638 if (warn_super) { 14639 warn_super = ! (invert 14640 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); 14641 } 14642 14643 _invlist_union(properties, cp_list, &cp_list); 14644 SvREFCNT_dec_NN(properties); 14645 } 14646 else { 14647 cp_list = properties; 14648 } 14649 14650 if (warn_super) { 14651 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; 14652 } 14653 } 14654 14655 /* Here, we have calculated what code points should be in the character 14656 * class. 14657 * 14658 * Now we can see about various optimizations. Fold calculation (which we 14659 * did above) needs to take place before inversion. Otherwise /[^k]/i 14660 * would invert to include K, which under /i would match k, which it 14661 * shouldn't. Therefore we can't invert folded locale now, as it won't be 14662 * folded until runtime */ 14663 14664 /* If we didn't do folding, it's because some information isn't available 14665 * until runtime; set the run-time fold flag for these. (We don't have to 14666 * worry about properties folding, as that is taken care of by the swash 14667 * fetching). We know to set the flag if we have a non-NULL list for UTF-8 14668 * locales, or the class matches at least one 0-255 range code point */ 14669 if (LOC && FOLD) { 14670 if (only_utf8_locale_list) { 14671 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; 14672 } 14673 else if (cp_list) { /* Look to see if there a 0-255 code point is in 14674 the list */ 14675 UV start, end; 14676 invlist_iterinit(cp_list); 14677 if (invlist_iternext(cp_list, &start, &end) && start < 256) { 14678 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; 14679 } 14680 invlist_iterfinish(cp_list); 14681 } 14682 } 14683 14684 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known 14685 * at compile time. Besides not inverting folded locale now, we can't 14686 * invert if there are things such as \w, which aren't known until runtime 14687 * */ 14688 if (cp_list 14689 && invert 14690 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) 14691 && ! depends_list 14692 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) 14693 { 14694 _invlist_invert(cp_list); 14695 14696 /* Any swash can't be used as-is, because we've inverted things */ 14697 if (swash) { 14698 SvREFCNT_dec_NN(swash); 14699 swash = NULL; 14700 } 14701 14702 /* Clear the invert flag since have just done it here */ 14703 invert = FALSE; 14704 } 14705 14706 if (ret_invlist) { 14707 *ret_invlist = cp_list; 14708 SvREFCNT_dec(swash); 14709 14710 /* Discard the generated node */ 14711 if (SIZE_ONLY) { 14712 RExC_size = orig_size; 14713 } 14714 else { 14715 RExC_emit = orig_emit; 14716 } 14717 return orig_emit; 14718 } 14719 14720 /* Some character classes are equivalent to other nodes. Such nodes take 14721 * up less room and generally fewer operations to execute than ANYOF nodes. 14722 * Above, we checked for and optimized into some such equivalents for 14723 * certain common classes that are easy to test. Getting to this point in 14724 * the code means that the class didn't get optimized there. Since this 14725 * code is only executed in Pass 2, it is too late to save space--it has 14726 * been allocated in Pass 1, and currently isn't given back. But turning 14727 * things into an EXACTish node can allow the optimizer to join it to any 14728 * adjacent such nodes. And if the class is equivalent to things like /./, 14729 * expensive run-time swashes can be avoided. Now that we have more 14730 * complete information, we can find things necessarily missed by the 14731 * earlier code. I (khw) am not sure how much to look for here. It would 14732 * be easy, but perhaps too slow, to check any candidates against all the 14733 * node types they could possibly match using _invlistEQ(). */ 14734 14735 if (cp_list 14736 && ! invert 14737 && ! depends_list 14738 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) 14739 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION 14740 14741 /* We don't optimize if we are supposed to make sure all non-Unicode 14742 * code points raise a warning, as only ANYOF nodes have this check. 14743 * */ 14744 && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) 14745 { 14746 UV start, end; 14747 U8 op = END; /* The optimzation node-type */ 14748 const char * cur_parse= RExC_parse; 14749 14750 invlist_iterinit(cp_list); 14751 if (! invlist_iternext(cp_list, &start, &end)) { 14752 14753 /* Here, the list is empty. This happens, for example, when a 14754 * Unicode property is the only thing in the character class, and 14755 * it doesn't match anything. (perluniprops.pod notes such 14756 * properties) */ 14757 op = OPFAIL; 14758 *flagp |= HASWIDTH|SIMPLE; 14759 } 14760 else if (start == end) { /* The range is a single code point */ 14761 if (! invlist_iternext(cp_list, &start, &end) 14762 14763 /* Don't do this optimization if it would require changing 14764 * the pattern to UTF-8 */ 14765 && (start < 256 || UTF)) 14766 { 14767 /* Here, the list contains a single code point. Can optimize 14768 * into an EXACTish node */ 14769 14770 value = start; 14771 14772 if (! FOLD) { 14773 op = EXACT; 14774 } 14775 else if (LOC) { 14776 14777 /* A locale node under folding with one code point can be 14778 * an EXACTFL, as its fold won't be calculated until 14779 * runtime */ 14780 op = EXACTFL; 14781 } 14782 else { 14783 14784 /* Here, we are generally folding, but there is only one 14785 * code point to match. If we have to, we use an EXACT 14786 * node, but it would be better for joining with adjacent 14787 * nodes in the optimization pass if we used the same 14788 * EXACTFish node that any such are likely to be. We can 14789 * do this iff the code point doesn't participate in any 14790 * folds. For example, an EXACTF of a colon is the same as 14791 * an EXACT one, since nothing folds to or from a colon. */ 14792 if (value < 256) { 14793 if (IS_IN_SOME_FOLD_L1(value)) { 14794 op = EXACT; 14795 } 14796 } 14797 else { 14798 if (_invlist_contains_cp(PL_utf8_foldable, value)) { 14799 op = EXACT; 14800 } 14801 } 14802 14803 /* If we haven't found the node type, above, it means we 14804 * can use the prevailing one */ 14805 if (op == END) { 14806 op = compute_EXACTish(pRExC_state); 14807 } 14808 } 14809 } 14810 } 14811 else if (start == 0) { 14812 if (end == UV_MAX) { 14813 op = SANY; 14814 *flagp |= HASWIDTH|SIMPLE; 14815 RExC_naughty++; 14816 } 14817 else if (end == '\n' - 1 14818 && invlist_iternext(cp_list, &start, &end) 14819 && start == '\n' + 1 && end == UV_MAX) 14820 { 14821 op = REG_ANY; 14822 *flagp |= HASWIDTH|SIMPLE; 14823 RExC_naughty++; 14824 } 14825 } 14826 invlist_iterfinish(cp_list); 14827 14828 if (op != END) { 14829 RExC_parse = (char *)orig_parse; 14830 RExC_emit = (regnode *)orig_emit; 14831 14832 ret = reg_node(pRExC_state, op); 14833 14834 RExC_parse = (char *)cur_parse; 14835 14836 if (PL_regkind[op] == EXACT) { 14837 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, 14838 TRUE /* downgradable to EXACT */ 14839 ); 14840 } 14841 14842 SvREFCNT_dec_NN(cp_list); 14843 return ret; 14844 } 14845 } 14846 14847 /* Here, <cp_list> contains all the code points we can determine at 14848 * compile time that match under all conditions. Go through it, and 14849 * for things that belong in the bitmap, put them there, and delete from 14850 * <cp_list>. While we are at it, see if everything above 255 is in the 14851 * list, and if so, set a flag to speed up execution */ 14852 14853 populate_ANYOF_from_invlist(ret, &cp_list); 14854 14855 if (invert) { 14856 ANYOF_FLAGS(ret) |= ANYOF_INVERT; 14857 } 14858 14859 /* Here, the bitmap has been populated with all the Latin1 code points that 14860 * always match. Can now add to the overall list those that match only 14861 * when the target string is UTF-8 (<depends_list>). */ 14862 if (depends_list) { 14863 if (cp_list) { 14864 _invlist_union(cp_list, depends_list, &cp_list); 14865 SvREFCNT_dec_NN(depends_list); 14866 } 14867 else { 14868 cp_list = depends_list; 14869 } 14870 ANYOF_FLAGS(ret) |= ANYOF_UTF8; 14871 } 14872 14873 /* If there is a swash and more than one element, we can't use the swash in 14874 * the optimization below. */ 14875 if (swash && element_count > 1) { 14876 SvREFCNT_dec_NN(swash); 14877 swash = NULL; 14878 } 14879 14880 set_ANYOF_arg(pRExC_state, ret, cp_list, 14881 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) 14882 ? listsv : NULL, 14883 only_utf8_locale_list, 14884 swash, has_user_defined_property); 14885 14886 *flagp |= HASWIDTH|SIMPLE; 14887 14888 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { 14889 RExC_contains_locale = 1; 14890 } 14891 14892 return ret; 14893 } 14894 14895 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION 14896 14897 STATIC void 14898 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, 14899 regnode* const node, 14900 SV* const cp_list, 14901 SV* const runtime_defns, 14902 SV* const only_utf8_locale_list, 14903 SV* const swash, 14904 const bool has_user_defined_property) 14905 { 14906 /* Sets the arg field of an ANYOF-type node 'node', using information about 14907 * the node passed-in. If there is nothing outside the node's bitmap, the 14908 * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to 14909 * the count returned by add_data(), having allocated and stored an array, 14910 * av, that that count references, as follows: 14911 * av[0] stores the character class description in its textual form. 14912 * This is used later (regexec.c:Perl_regclass_swash()) to 14913 * initialize the appropriate swash, and is also useful for dumping 14914 * the regnode. This is set to &PL_sv_undef if the textual 14915 * description is not needed at run-time (as happens if the other 14916 * elements completely define the class) 14917 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash 14918 * computed from av[0]. But if no further computation need be done, 14919 * the swash is stored here now (and av[0] is &PL_sv_undef). 14920 * av[2] stores the inversion list of code points that match only if the 14921 * current locale is UTF-8 14922 * av[3] stores the cp_list inversion list for use in addition or instead 14923 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. 14924 * (Otherwise everything needed is already in av[0] and av[1]) 14925 * av[4] is set if any component of the class is from a user-defined 14926 * property; used only if av[3] exists */ 14927 14928 UV n; 14929 14930 PERL_ARGS_ASSERT_SET_ANYOF_ARG; 14931 14932 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { 14933 assert(! (ANYOF_FLAGS(node) 14934 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); 14935 ARG_SET(node, ANYOF_NONBITMAP_EMPTY); 14936 } 14937 else { 14938 AV * const av = newAV(); 14939 SV *rv; 14940 14941 assert(ANYOF_FLAGS(node) 14942 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); 14943 14944 av_store(av, 0, (runtime_defns) 14945 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); 14946 if (swash) { 14947 av_store(av, 1, swash); 14948 SvREFCNT_dec_NN(cp_list); 14949 } 14950 else { 14951 av_store(av, 1, &PL_sv_undef); 14952 if (cp_list) { 14953 av_store(av, 3, cp_list); 14954 av_store(av, 4, newSVuv(has_user_defined_property)); 14955 } 14956 } 14957 14958 if (only_utf8_locale_list) { 14959 av_store(av, 2, only_utf8_locale_list); 14960 } 14961 else { 14962 av_store(av, 2, &PL_sv_undef); 14963 } 14964 14965 rv = newRV_noinc(MUTABLE_SV(av)); 14966 n = add_data(pRExC_state, STR_WITH_LEN("s")); 14967 RExC_rxi->data->data[n] = (void*)rv; 14968 ARG_SET(node, n); 14969 } 14970 } 14971 14972 14973 /* reg_skipcomment() 14974 14975 Absorbs an /x style # comments from the input stream. 14976 Returns true if there is more text remaining in the stream. 14977 Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment 14978 terminates the pattern without including a newline. 14979 14980 Note its the callers responsibility to ensure that we are 14981 actually in /x mode 14982 14983 */ 14984 14985 STATIC bool 14986 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) 14987 { 14988 bool ended = 0; 14989 14990 PERL_ARGS_ASSERT_REG_SKIPCOMMENT; 14991 14992 while (RExC_parse < RExC_end) 14993 if (*RExC_parse++ == '\n') { 14994 ended = 1; 14995 break; 14996 } 14997 if (!ended) { 14998 /* we ran off the end of the pattern without ending 14999 the comment, so we have to add an \n when wrapping */ 15000 RExC_seen |= REG_RUN_ON_COMMENT_SEEN; 15001 return 0; 15002 } else 15003 return 1; 15004 } 15005 15006 /* nextchar() 15007 15008 Advances the parse position, and optionally absorbs 15009 "whitespace" from the inputstream. 15010 15011 Without /x "whitespace" means (?#...) style comments only, 15012 with /x this means (?#...) and # comments and whitespace proper. 15013 15014 Returns the RExC_parse point from BEFORE the scan occurs. 15015 15016 This is the /x friendly way of saying RExC_parse++. 15017 */ 15018 15019 STATIC char* 15020 S_nextchar(pTHX_ RExC_state_t *pRExC_state) 15021 { 15022 char* const retval = RExC_parse++; 15023 15024 PERL_ARGS_ASSERT_NEXTCHAR; 15025 15026 for (;;) { 15027 if (RExC_end - RExC_parse >= 3 15028 && *RExC_parse == '(' 15029 && RExC_parse[1] == '?' 15030 && RExC_parse[2] == '#') 15031 { 15032 while (*RExC_parse != ')') { 15033 if (RExC_parse == RExC_end) 15034 FAIL("Sequence (?#... not terminated"); 15035 RExC_parse++; 15036 } 15037 RExC_parse++; 15038 continue; 15039 } 15040 if (RExC_flags & RXf_PMf_EXTENDED) { 15041 if (isSPACE(*RExC_parse)) { 15042 RExC_parse++; 15043 continue; 15044 } 15045 else if (*RExC_parse == '#') { 15046 if ( reg_skipcomment( pRExC_state ) ) 15047 continue; 15048 } 15049 } 15050 return retval; 15051 } 15052 } 15053 15054 /* 15055 - reg_node - emit a node 15056 */ 15057 STATIC regnode * /* Location. */ 15058 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) 15059 { 15060 dVAR; 15061 regnode *ptr; 15062 regnode * const ret = RExC_emit; 15063 GET_RE_DEBUG_FLAGS_DECL; 15064 15065 PERL_ARGS_ASSERT_REG_NODE; 15066 15067 if (SIZE_ONLY) { 15068 SIZE_ALIGN(RExC_size); 15069 RExC_size += 1; 15070 return(ret); 15071 } 15072 if (RExC_emit >= RExC_emit_bound) 15073 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", 15074 op, RExC_emit, RExC_emit_bound); 15075 15076 NODE_ALIGN_FILL(ret); 15077 ptr = ret; 15078 FILL_ADVANCE_NODE(ptr, op); 15079 #ifdef RE_TRACK_PATTERN_OFFSETS 15080 if (RExC_offsets) { /* MJD */ 15081 MJD_OFFSET_DEBUG( 15082 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", 15083 "reg_node", __LINE__, 15084 PL_reg_name[op], 15085 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] 15086 ? "Overwriting end of array!\n" : "OK", 15087 (UV)(RExC_emit - RExC_emit_start), 15088 (UV)(RExC_parse - RExC_start), 15089 (UV)RExC_offsets[0])); 15090 Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); 15091 } 15092 #endif 15093 RExC_emit = ptr; 15094 return(ret); 15095 } 15096 15097 /* 15098 - reganode - emit a node with an argument 15099 */ 15100 STATIC regnode * /* Location. */ 15101 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) 15102 { 15103 dVAR; 15104 regnode *ptr; 15105 regnode * const ret = RExC_emit; 15106 GET_RE_DEBUG_FLAGS_DECL; 15107 15108 PERL_ARGS_ASSERT_REGANODE; 15109 15110 if (SIZE_ONLY) { 15111 SIZE_ALIGN(RExC_size); 15112 RExC_size += 2; 15113 /* 15114 We can't do this: 15115 15116 assert(2==regarglen[op]+1); 15117 15118 Anything larger than this has to allocate the extra amount. 15119 If we changed this to be: 15120 15121 RExC_size += (1 + regarglen[op]); 15122 15123 then it wouldn't matter. Its not clear what side effect 15124 might come from that so its not done so far. 15125 -- dmq 15126 */ 15127 return(ret); 15128 } 15129 if (RExC_emit >= RExC_emit_bound) 15130 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", 15131 op, RExC_emit, RExC_emit_bound); 15132 15133 NODE_ALIGN_FILL(ret); 15134 ptr = ret; 15135 FILL_ADVANCE_NODE_ARG(ptr, op, arg); 15136 #ifdef RE_TRACK_PATTERN_OFFSETS 15137 if (RExC_offsets) { /* MJD */ 15138 MJD_OFFSET_DEBUG( 15139 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", 15140 "reganode", 15141 __LINE__, 15142 PL_reg_name[op], 15143 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? 15144 "Overwriting end of array!\n" : "OK", 15145 (UV)(RExC_emit - RExC_emit_start), 15146 (UV)(RExC_parse - RExC_start), 15147 (UV)RExC_offsets[0])); 15148 Set_Cur_Node_Offset; 15149 } 15150 #endif 15151 RExC_emit = ptr; 15152 return(ret); 15153 } 15154 15155 /* 15156 - reguni - emit (if appropriate) a Unicode character 15157 */ 15158 PERL_STATIC_INLINE STRLEN 15159 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) 15160 { 15161 dVAR; 15162 15163 PERL_ARGS_ASSERT_REGUNI; 15164 15165 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); 15166 } 15167 15168 /* 15169 - reginsert - insert an operator in front of already-emitted operand 15170 * 15171 * Means relocating the operand. 15172 */ 15173 STATIC void 15174 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) 15175 { 15176 dVAR; 15177 regnode *src; 15178 regnode *dst; 15179 regnode *place; 15180 const int offset = regarglen[(U8)op]; 15181 const int size = NODE_STEP_REGNODE + offset; 15182 GET_RE_DEBUG_FLAGS_DECL; 15183 15184 PERL_ARGS_ASSERT_REGINSERT; 15185 PERL_UNUSED_ARG(depth); 15186 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ 15187 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); 15188 if (SIZE_ONLY) { 15189 RExC_size += size; 15190 return; 15191 } 15192 15193 src = RExC_emit; 15194 RExC_emit += size; 15195 dst = RExC_emit; 15196 if (RExC_open_parens) { 15197 int paren; 15198 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ 15199 for ( paren=0 ; paren < RExC_npar ; paren++ ) { 15200 if ( RExC_open_parens[paren] >= opnd ) { 15201 /*DEBUG_PARSE_FMT("open"," - %d",size);*/ 15202 RExC_open_parens[paren] += size; 15203 } else { 15204 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ 15205 } 15206 if ( RExC_close_parens[paren] >= opnd ) { 15207 /*DEBUG_PARSE_FMT("close"," - %d",size);*/ 15208 RExC_close_parens[paren] += size; 15209 } else { 15210 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ 15211 } 15212 } 15213 } 15214 15215 while (src > opnd) { 15216 StructCopy(--src, --dst, regnode); 15217 #ifdef RE_TRACK_PATTERN_OFFSETS 15218 if (RExC_offsets) { /* MJD 20010112 */ 15219 MJD_OFFSET_DEBUG( 15220 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", 15221 "reg_insert", 15222 __LINE__, 15223 PL_reg_name[op], 15224 (UV)(dst - RExC_emit_start) > RExC_offsets[0] 15225 ? "Overwriting end of array!\n" : "OK", 15226 (UV)(src - RExC_emit_start), 15227 (UV)(dst - RExC_emit_start), 15228 (UV)RExC_offsets[0])); 15229 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); 15230 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); 15231 } 15232 #endif 15233 } 15234 15235 15236 place = opnd; /* Op node, where operand used to be. */ 15237 #ifdef RE_TRACK_PATTERN_OFFSETS 15238 if (RExC_offsets) { /* MJD */ 15239 MJD_OFFSET_DEBUG( 15240 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", 15241 "reginsert", 15242 __LINE__, 15243 PL_reg_name[op], 15244 (UV)(place - RExC_emit_start) > RExC_offsets[0] 15245 ? "Overwriting end of array!\n" : "OK", 15246 (UV)(place - RExC_emit_start), 15247 (UV)(RExC_parse - RExC_start), 15248 (UV)RExC_offsets[0])); 15249 Set_Node_Offset(place, RExC_parse); 15250 Set_Node_Length(place, 1); 15251 } 15252 #endif 15253 src = NEXTOPER(place); 15254 FILL_ADVANCE_NODE(place, op); 15255 Zero(src, offset, regnode); 15256 } 15257 15258 /* 15259 - regtail - set the next-pointer at the end of a node chain of p to val. 15260 - SEE ALSO: regtail_study 15261 */ 15262 /* TODO: All three parms should be const */ 15263 STATIC void 15264 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, 15265 const regnode *val,U32 depth) 15266 { 15267 dVAR; 15268 regnode *scan; 15269 GET_RE_DEBUG_FLAGS_DECL; 15270 15271 PERL_ARGS_ASSERT_REGTAIL; 15272 #ifndef DEBUGGING 15273 PERL_UNUSED_ARG(depth); 15274 #endif 15275 15276 if (SIZE_ONLY) 15277 return; 15278 15279 /* Find last node. */ 15280 scan = p; 15281 for (;;) { 15282 regnode * const temp = regnext(scan); 15283 DEBUG_PARSE_r({ 15284 SV * const mysv=sv_newmortal(); 15285 DEBUG_PARSE_MSG((scan==p ? "tail" : "")); 15286 regprop(RExC_rx, mysv, scan, NULL); 15287 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", 15288 SvPV_nolen_const(mysv), REG_NODE_NUM(scan), 15289 (temp == NULL ? "->" : ""), 15290 (temp == NULL ? PL_reg_name[OP(val)] : "") 15291 ); 15292 }); 15293 if (temp == NULL) 15294 break; 15295 scan = temp; 15296 } 15297 15298 if (reg_off_by_arg[OP(scan)]) { 15299 ARG_SET(scan, val - scan); 15300 } 15301 else { 15302 NEXT_OFF(scan) = val - scan; 15303 } 15304 } 15305 15306 #ifdef DEBUGGING 15307 /* 15308 - regtail_study - set the next-pointer at the end of a node chain of p to val. 15309 - Look for optimizable sequences at the same time. 15310 - currently only looks for EXACT chains. 15311 15312 This is experimental code. The idea is to use this routine to perform 15313 in place optimizations on branches and groups as they are constructed, 15314 with the long term intention of removing optimization from study_chunk so 15315 that it is purely analytical. 15316 15317 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used 15318 to control which is which. 15319 15320 */ 15321 /* TODO: All four parms should be const */ 15322 15323 STATIC U8 15324 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, 15325 const regnode *val,U32 depth) 15326 { 15327 dVAR; 15328 regnode *scan; 15329 U8 exact = PSEUDO; 15330 #ifdef EXPERIMENTAL_INPLACESCAN 15331 I32 min = 0; 15332 #endif 15333 GET_RE_DEBUG_FLAGS_DECL; 15334 15335 PERL_ARGS_ASSERT_REGTAIL_STUDY; 15336 15337 15338 if (SIZE_ONLY) 15339 return exact; 15340 15341 /* Find last node. */ 15342 15343 scan = p; 15344 for (;;) { 15345 regnode * const temp = regnext(scan); 15346 #ifdef EXPERIMENTAL_INPLACESCAN 15347 if (PL_regkind[OP(scan)] == EXACT) { 15348 bool unfolded_multi_char; /* Unexamined in this routine */ 15349 if (join_exact(pRExC_state, scan, &min, 15350 &unfolded_multi_char, 1, val, depth+1)) 15351 return EXACT; 15352 } 15353 #endif 15354 if ( exact ) { 15355 switch (OP(scan)) { 15356 case EXACT: 15357 case EXACTF: 15358 case EXACTFA_NO_TRIE: 15359 case EXACTFA: 15360 case EXACTFU: 15361 case EXACTFU_SS: 15362 case EXACTFL: 15363 if( exact == PSEUDO ) 15364 exact= OP(scan); 15365 else if ( exact != OP(scan) ) 15366 exact= 0; 15367 case NOTHING: 15368 break; 15369 default: 15370 exact= 0; 15371 } 15372 } 15373 DEBUG_PARSE_r({ 15374 SV * const mysv=sv_newmortal(); 15375 DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); 15376 regprop(RExC_rx, mysv, scan, NULL); 15377 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", 15378 SvPV_nolen_const(mysv), 15379 REG_NODE_NUM(scan), 15380 PL_reg_name[exact]); 15381 }); 15382 if (temp == NULL) 15383 break; 15384 scan = temp; 15385 } 15386 DEBUG_PARSE_r({ 15387 SV * const mysv_val=sv_newmortal(); 15388 DEBUG_PARSE_MSG(""); 15389 regprop(RExC_rx, mysv_val, val, NULL); 15390 PerlIO_printf(Perl_debug_log, 15391 "~ attach to %s (%"IVdf") offset to %"IVdf"\n", 15392 SvPV_nolen_const(mysv_val), 15393 (IV)REG_NODE_NUM(val), 15394 (IV)(val - scan) 15395 ); 15396 }); 15397 if (reg_off_by_arg[OP(scan)]) { 15398 ARG_SET(scan, val - scan); 15399 } 15400 else { 15401 NEXT_OFF(scan) = val - scan; 15402 } 15403 15404 return exact; 15405 } 15406 #endif 15407 15408 /* 15409 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form 15410 */ 15411 #ifdef DEBUGGING 15412 15413 static void 15414 S_regdump_intflags(pTHX_ const char *lead, const U32 flags) 15415 { 15416 int bit; 15417 int set=0; 15418 15419 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); 15420 15421 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) { 15422 if (flags & (1<<bit)) { 15423 if (!set++ && lead) 15424 PerlIO_printf(Perl_debug_log, "%s",lead); 15425 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]); 15426 } 15427 } 15428 if (lead) { 15429 if (set) 15430 PerlIO_printf(Perl_debug_log, "\n"); 15431 else 15432 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead); 15433 } 15434 } 15435 15436 static void 15437 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) 15438 { 15439 int bit; 15440 int set=0; 15441 regex_charset cs; 15442 15443 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8); 15444 15445 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) { 15446 if (flags & (1<<bit)) { 15447 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */ 15448 continue; 15449 } 15450 if (!set++ && lead) 15451 PerlIO_printf(Perl_debug_log, "%s",lead); 15452 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]); 15453 } 15454 } 15455 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) { 15456 if (!set++ && lead) { 15457 PerlIO_printf(Perl_debug_log, "%s",lead); 15458 } 15459 switch (cs) { 15460 case REGEX_UNICODE_CHARSET: 15461 PerlIO_printf(Perl_debug_log, "UNICODE"); 15462 break; 15463 case REGEX_LOCALE_CHARSET: 15464 PerlIO_printf(Perl_debug_log, "LOCALE"); 15465 break; 15466 case REGEX_ASCII_RESTRICTED_CHARSET: 15467 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED"); 15468 break; 15469 case REGEX_ASCII_MORE_RESTRICTED_CHARSET: 15470 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED"); 15471 break; 15472 default: 15473 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET"); 15474 break; 15475 } 15476 } 15477 if (lead) { 15478 if (set) 15479 PerlIO_printf(Perl_debug_log, "\n"); 15480 else 15481 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead); 15482 } 15483 } 15484 #endif 15485 15486 void 15487 Perl_regdump(pTHX_ const regexp *r) 15488 { 15489 #ifdef DEBUGGING 15490 dVAR; 15491 SV * const sv = sv_newmortal(); 15492 SV *dsv= sv_newmortal(); 15493 RXi_GET_DECL(r,ri); 15494 GET_RE_DEBUG_FLAGS_DECL; 15495 15496 PERL_ARGS_ASSERT_REGDUMP; 15497 15498 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); 15499 15500 /* Header fields of interest. */ 15501 if (r->anchored_substr) { 15502 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), 15503 RE_SV_DUMPLEN(r->anchored_substr), 30); 15504 PerlIO_printf(Perl_debug_log, 15505 "anchored %s%s at %"IVdf" ", 15506 s, RE_SV_TAIL(r->anchored_substr), 15507 (IV)r->anchored_offset); 15508 } else if (r->anchored_utf8) { 15509 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), 15510 RE_SV_DUMPLEN(r->anchored_utf8), 30); 15511 PerlIO_printf(Perl_debug_log, 15512 "anchored utf8 %s%s at %"IVdf" ", 15513 s, RE_SV_TAIL(r->anchored_utf8), 15514 (IV)r->anchored_offset); 15515 } 15516 if (r->float_substr) { 15517 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), 15518 RE_SV_DUMPLEN(r->float_substr), 30); 15519 PerlIO_printf(Perl_debug_log, 15520 "floating %s%s at %"IVdf"..%"UVuf" ", 15521 s, RE_SV_TAIL(r->float_substr), 15522 (IV)r->float_min_offset, (UV)r->float_max_offset); 15523 } else if (r->float_utf8) { 15524 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), 15525 RE_SV_DUMPLEN(r->float_utf8), 30); 15526 PerlIO_printf(Perl_debug_log, 15527 "floating utf8 %s%s at %"IVdf"..%"UVuf" ", 15528 s, RE_SV_TAIL(r->float_utf8), 15529 (IV)r->float_min_offset, (UV)r->float_max_offset); 15530 } 15531 if (r->check_substr || r->check_utf8) 15532 PerlIO_printf(Perl_debug_log, 15533 (const char *) 15534 (r->check_substr == r->float_substr 15535 && r->check_utf8 == r->float_utf8 15536 ? "(checking floating" : "(checking anchored")); 15537 if (r->intflags & PREGf_NOSCAN) 15538 PerlIO_printf(Perl_debug_log, " noscan"); 15539 if (r->extflags & RXf_CHECK_ALL) 15540 PerlIO_printf(Perl_debug_log, " isall"); 15541 if (r->check_substr || r->check_utf8) 15542 PerlIO_printf(Perl_debug_log, ") "); 15543 15544 if (ri->regstclass) { 15545 regprop(r, sv, ri->regstclass, NULL); 15546 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); 15547 } 15548 if (r->intflags & PREGf_ANCH) { 15549 PerlIO_printf(Perl_debug_log, "anchored"); 15550 if (r->intflags & PREGf_ANCH_BOL) 15551 PerlIO_printf(Perl_debug_log, "(BOL)"); 15552 if (r->intflags & PREGf_ANCH_MBOL) 15553 PerlIO_printf(Perl_debug_log, "(MBOL)"); 15554 if (r->intflags & PREGf_ANCH_SBOL) 15555 PerlIO_printf(Perl_debug_log, "(SBOL)"); 15556 if (r->intflags & PREGf_ANCH_GPOS) 15557 PerlIO_printf(Perl_debug_log, "(GPOS)"); 15558 PerlIO_putc(Perl_debug_log, ' '); 15559 } 15560 if (r->intflags & PREGf_GPOS_SEEN) 15561 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); 15562 if (r->intflags & PREGf_SKIP) 15563 PerlIO_printf(Perl_debug_log, "plus "); 15564 if (r->intflags & PREGf_IMPLICIT) 15565 PerlIO_printf(Perl_debug_log, "implicit "); 15566 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen); 15567 if (r->extflags & RXf_EVAL_SEEN) 15568 PerlIO_printf(Perl_debug_log, "with eval "); 15569 PerlIO_printf(Perl_debug_log, "\n"); 15570 DEBUG_FLAGS_r({ 15571 regdump_extflags("r->extflags: ",r->extflags); 15572 regdump_intflags("r->intflags: ",r->intflags); 15573 }); 15574 #else 15575 PERL_ARGS_ASSERT_REGDUMP; 15576 PERL_UNUSED_CONTEXT; 15577 PERL_UNUSED_ARG(r); 15578 #endif /* DEBUGGING */ 15579 } 15580 15581 /* 15582 - regprop - printable representation of opcode, with run time support 15583 */ 15584 15585 void 15586 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) 15587 { 15588 #ifdef DEBUGGING 15589 dVAR; 15590 int k; 15591 15592 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ 15593 static const char * const anyofs[] = { 15594 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \ 15595 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \ 15596 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \ 15597 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \ 15598 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \ 15599 || _CC_VERTSPACE != 16 15600 #error Need to adjust order of anyofs[] 15601 #endif 15602 "\\w", 15603 "\\W", 15604 "\\d", 15605 "\\D", 15606 "[:alpha:]", 15607 "[:^alpha:]", 15608 "[:lower:]", 15609 "[:^lower:]", 15610 "[:upper:]", 15611 "[:^upper:]", 15612 "[:punct:]", 15613 "[:^punct:]", 15614 "[:print:]", 15615 "[:^print:]", 15616 "[:alnum:]", 15617 "[:^alnum:]", 15618 "[:graph:]", 15619 "[:^graph:]", 15620 "[:cased:]", 15621 "[:^cased:]", 15622 "\\s", 15623 "\\S", 15624 "[:blank:]", 15625 "[:^blank:]", 15626 "[:xdigit:]", 15627 "[:^xdigit:]", 15628 "[:space:]", 15629 "[:^space:]", 15630 "[:cntrl:]", 15631 "[:^cntrl:]", 15632 "[:ascii:]", 15633 "[:^ascii:]", 15634 "\\v", 15635 "\\V" 15636 }; 15637 RXi_GET_DECL(prog,progi); 15638 GET_RE_DEBUG_FLAGS_DECL; 15639 15640 PERL_ARGS_ASSERT_REGPROP; 15641 15642 sv_setpvs(sv, ""); 15643 15644 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ 15645 /* It would be nice to FAIL() here, but this may be called from 15646 regexec.c, and it would be hard to supply pRExC_state. */ 15647 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", 15648 (int)OP(o), (int)REGNODE_MAX); 15649 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ 15650 15651 k = PL_regkind[OP(o)]; 15652 15653 if (k == EXACT) { 15654 sv_catpvs(sv, " "); 15655 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) 15656 * is a crude hack but it may be the best for now since 15657 * we have no flag "this EXACTish node was UTF-8" 15658 * --jhi */ 15659 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], 15660 PERL_PV_ESCAPE_UNI_DETECT | 15661 PERL_PV_ESCAPE_NONASCII | 15662 PERL_PV_PRETTY_ELLIPSES | 15663 PERL_PV_PRETTY_LTGT | 15664 PERL_PV_PRETTY_NOCLEAR 15665 ); 15666 } else if (k == TRIE) { 15667 /* print the details of the trie in dumpuntil instead, as 15668 * progi->data isn't available here */ 15669 const char op = OP(o); 15670 const U32 n = ARG(o); 15671 const reg_ac_data * const ac = IS_TRIE_AC(op) ? 15672 (reg_ac_data *)progi->data->data[n] : 15673 NULL; 15674 const reg_trie_data * const trie 15675 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; 15676 15677 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); 15678 DEBUG_TRIE_COMPILE_r( 15679 Perl_sv_catpvf(aTHX_ sv, 15680 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">", 15681 (UV)trie->startstate, 15682 (IV)trie->statecount-1, /* -1 because of the unused 0 element */ 15683 (UV)trie->wordcount, 15684 (UV)trie->minlen, 15685 (UV)trie->maxlen, 15686 (UV)TRIE_CHARCOUNT(trie), 15687 (UV)trie->uniquecharcount 15688 ); 15689 ); 15690 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { 15691 sv_catpvs(sv, "["); 15692 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) 15693 ? ANYOF_BITMAP(o) 15694 : TRIE_BITMAP(trie)); 15695 sv_catpvs(sv, "]"); 15696 } 15697 15698 } else if (k == CURLY) { 15699 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) 15700 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ 15701 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); 15702 } 15703 else if (k == WHILEM && o->flags) /* Ordinal/of */ 15704 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); 15705 else if (k == REF || k == OPEN || k == CLOSE 15706 || k == GROUPP || OP(o)==ACCEPT) 15707 { 15708 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ 15709 if ( RXp_PAREN_NAMES(prog) ) { 15710 if ( k != REF || (OP(o) < NREF)) { 15711 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 15712 SV **name= av_fetch(list, ARG(o), 0 ); 15713 if (name) 15714 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); 15715 } 15716 else { 15717 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); 15718 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); 15719 I32 *nums=(I32*)SvPVX(sv_dat); 15720 SV **name= av_fetch(list, nums[0], 0 ); 15721 I32 n; 15722 if (name) { 15723 for ( n=0; n<SvIVX(sv_dat); n++ ) { 15724 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf, 15725 (n ? "," : ""), (IV)nums[n]); 15726 } 15727 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); 15728 } 15729 } 15730 } 15731 if ( k == REF && reginfo) { 15732 U32 n = ARG(o); /* which paren pair */ 15733 I32 ln = prog->offs[n].start; 15734 if (prog->lastparen < n || ln == -1) 15735 Perl_sv_catpvf(aTHX_ sv, ": FAIL"); 15736 else if (ln == prog->offs[n].end) 15737 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); 15738 else { 15739 const char *s = reginfo->strbeg + ln; 15740 Perl_sv_catpvf(aTHX_ sv, ": "); 15741 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, 15742 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); 15743 } 15744 } 15745 } else if (k == GOSUB) 15746 /* Paren and offset */ 15747 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); 15748 else if (k == VERB) { 15749 if (!o->flags) 15750 Perl_sv_catpvf(aTHX_ sv, ":%"SVf, 15751 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); 15752 } else if (k == LOGICAL) 15753 /* 2: embedded, otherwise 1 */ 15754 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); 15755 else if (k == ANYOF) { 15756 const U8 flags = ANYOF_FLAGS(o); 15757 int do_sep = 0; 15758 15759 15760 if (flags & ANYOF_LOCALE_FLAGS) 15761 sv_catpvs(sv, "{loc}"); 15762 if (flags & ANYOF_LOC_FOLD) 15763 sv_catpvs(sv, "{i}"); 15764 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 15765 if (flags & ANYOF_INVERT) 15766 sv_catpvs(sv, "^"); 15767 15768 /* output what the standard cp 0-255 bitmap matches */ 15769 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); 15770 15771 /* output any special charclass tests (used entirely under use 15772 * locale) * */ 15773 if (ANYOF_POSIXL_TEST_ANY_SET(o)) { 15774 int i; 15775 for (i = 0; i < ANYOF_POSIXL_MAX; i++) { 15776 if (ANYOF_POSIXL_TEST(o,i)) { 15777 sv_catpv(sv, anyofs[i]); 15778 do_sep = 1; 15779 } 15780 } 15781 } 15782 15783 if ((flags & (ANYOF_ABOVE_LATIN1_ALL 15784 |ANYOF_UTF8 15785 |ANYOF_NONBITMAP_NON_UTF8 15786 |ANYOF_LOC_FOLD))) 15787 { 15788 if (do_sep) { 15789 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); 15790 if (flags & ANYOF_INVERT) 15791 /*make sure the invert info is in each */ 15792 sv_catpvs(sv, "^"); 15793 } 15794 15795 if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { 15796 sv_catpvs(sv, "{non-utf8-latin1-all}"); 15797 } 15798 15799 /* output information about the unicode matching */ 15800 if (flags & ANYOF_ABOVE_LATIN1_ALL) 15801 sv_catpvs(sv, "{unicode_all}"); 15802 else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { 15803 SV *lv; /* Set if there is something outside the bit map. */ 15804 bool byte_output = FALSE; /* If something in the bitmap has 15805 been output */ 15806 SV *only_utf8_locale; 15807 15808 /* Get the stuff that wasn't in the bitmap */ 15809 (void) _get_regclass_nonbitmap_data(prog, o, FALSE, 15810 &lv, &only_utf8_locale); 15811 if (lv && lv != &PL_sv_undef) { 15812 char *s = savesvpv(lv); 15813 char * const origs = s; 15814 15815 while (*s && *s != '\n') 15816 s++; 15817 15818 if (*s == '\n') { 15819 const char * const t = ++s; 15820 15821 if (flags & ANYOF_NONBITMAP_NON_UTF8) { 15822 sv_catpvs(sv, "{outside bitmap}"); 15823 } 15824 else { 15825 sv_catpvs(sv, "{utf8}"); 15826 } 15827 15828 if (byte_output) { 15829 sv_catpvs(sv, " "); 15830 } 15831 15832 while (*s) { 15833 if (*s == '\n') { 15834 15835 /* Truncate very long output */ 15836 if (s - origs > 256) { 15837 Perl_sv_catpvf(aTHX_ sv, 15838 "%.*s...", 15839 (int) (s - origs - 1), 15840 t); 15841 goto out_dump; 15842 } 15843 *s = ' '; 15844 } 15845 else if (*s == '\t') { 15846 *s = '-'; 15847 } 15848 s++; 15849 } 15850 if (s[-1] == ' ') 15851 s[-1] = 0; 15852 15853 sv_catpv(sv, t); 15854 } 15855 15856 out_dump: 15857 15858 Safefree(origs); 15859 SvREFCNT_dec_NN(lv); 15860 } 15861 15862 if ((flags & ANYOF_LOC_FOLD) 15863 && only_utf8_locale 15864 && only_utf8_locale != &PL_sv_undef) 15865 { 15866 UV start, end; 15867 int max_entries = 256; 15868 15869 sv_catpvs(sv, "{utf8 locale}"); 15870 invlist_iterinit(only_utf8_locale); 15871 while (invlist_iternext(only_utf8_locale, 15872 &start, &end)) { 15873 put_range(sv, start, end); 15874 max_entries --; 15875 if (max_entries < 0) { 15876 sv_catpvs(sv, "..."); 15877 break; 15878 } 15879 } 15880 invlist_iterfinish(only_utf8_locale); 15881 } 15882 } 15883 } 15884 15885 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 15886 } 15887 else if (k == POSIXD || k == NPOSIXD) { 15888 U8 index = FLAGS(o) * 2; 15889 if (index < C_ARRAY_LENGTH(anyofs)) { 15890 if (*anyofs[index] != '[') { 15891 sv_catpv(sv, "["); 15892 } 15893 sv_catpv(sv, anyofs[index]); 15894 if (*anyofs[index] != '[') { 15895 sv_catpv(sv, "]"); 15896 } 15897 } 15898 else { 15899 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); 15900 } 15901 } 15902 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) 15903 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); 15904 #else 15905 PERL_UNUSED_CONTEXT; 15906 PERL_UNUSED_ARG(sv); 15907 PERL_UNUSED_ARG(o); 15908 PERL_UNUSED_ARG(prog); 15909 PERL_UNUSED_ARG(reginfo); 15910 #endif /* DEBUGGING */ 15911 } 15912 15913 15914 15915 SV * 15916 Perl_re_intuit_string(pTHX_ REGEXP * const r) 15917 { /* Assume that RE_INTUIT is set */ 15918 dVAR; 15919 struct regexp *const prog = ReANY(r); 15920 GET_RE_DEBUG_FLAGS_DECL; 15921 15922 PERL_ARGS_ASSERT_RE_INTUIT_STRING; 15923 PERL_UNUSED_CONTEXT; 15924 15925 DEBUG_COMPILE_r( 15926 { 15927 const char * const s = SvPV_nolen_const(prog->check_substr 15928 ? prog->check_substr : prog->check_utf8); 15929 15930 if (!PL_colorset) reginitcolors(); 15931 PerlIO_printf(Perl_debug_log, 15932 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", 15933 PL_colors[4], 15934 prog->check_substr ? "" : "utf8 ", 15935 PL_colors[5],PL_colors[0], 15936 s, 15937 PL_colors[1], 15938 (strlen(s) > 60 ? "..." : "")); 15939 } ); 15940 15941 return prog->check_substr ? prog->check_substr : prog->check_utf8; 15942 } 15943 15944 /* 15945 pregfree() 15946 15947 handles refcounting and freeing the perl core regexp structure. When 15948 it is necessary to actually free the structure the first thing it 15949 does is call the 'free' method of the regexp_engine associated to 15950 the regexp, allowing the handling of the void *pprivate; member 15951 first. (This routine is not overridable by extensions, which is why 15952 the extensions free is called first.) 15953 15954 See regdupe and regdupe_internal if you change anything here. 15955 */ 15956 #ifndef PERL_IN_XSUB_RE 15957 void 15958 Perl_pregfree(pTHX_ REGEXP *r) 15959 { 15960 SvREFCNT_dec(r); 15961 } 15962 15963 void 15964 Perl_pregfree2(pTHX_ REGEXP *rx) 15965 { 15966 dVAR; 15967 struct regexp *const r = ReANY(rx); 15968 GET_RE_DEBUG_FLAGS_DECL; 15969 15970 PERL_ARGS_ASSERT_PREGFREE2; 15971 15972 if (r->mother_re) { 15973 ReREFCNT_dec(r->mother_re); 15974 } else { 15975 CALLREGFREE_PVT(rx); /* free the private data */ 15976 SvREFCNT_dec(RXp_PAREN_NAMES(r)); 15977 Safefree(r->xpv_len_u.xpvlenu_pv); 15978 } 15979 if (r->substrs) { 15980 SvREFCNT_dec(r->anchored_substr); 15981 SvREFCNT_dec(r->anchored_utf8); 15982 SvREFCNT_dec(r->float_substr); 15983 SvREFCNT_dec(r->float_utf8); 15984 Safefree(r->substrs); 15985 } 15986 RX_MATCH_COPY_FREE(rx); 15987 #ifdef PERL_ANY_COW 15988 SvREFCNT_dec(r->saved_copy); 15989 #endif 15990 Safefree(r->offs); 15991 SvREFCNT_dec(r->qr_anoncv); 15992 rx->sv_u.svu_rx = 0; 15993 } 15994 15995 /* reg_temp_copy() 15996 15997 This is a hacky workaround to the structural issue of match results 15998 being stored in the regexp structure which is in turn stored in 15999 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern 16000 could be PL_curpm in multiple contexts, and could require multiple 16001 result sets being associated with the pattern simultaneously, such 16002 as when doing a recursive match with (??{$qr}) 16003 16004 The solution is to make a lightweight copy of the regexp structure 16005 when a qr// is returned from the code executed by (??{$qr}) this 16006 lightweight copy doesn't actually own any of its data except for 16007 the starp/end and the actual regexp structure itself. 16008 16009 */ 16010 16011 16012 REGEXP * 16013 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) 16014 { 16015 struct regexp *ret; 16016 struct regexp *const r = ReANY(rx); 16017 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV; 16018 16019 PERL_ARGS_ASSERT_REG_TEMP_COPY; 16020 16021 if (!ret_x) 16022 ret_x = (REGEXP*) newSV_type(SVt_REGEXP); 16023 else { 16024 SvOK_off((SV *)ret_x); 16025 if (islv) { 16026 /* For PVLVs, SvANY points to the xpvlv body while sv_u points 16027 to the regexp. (For SVt_REGEXPs, sv_upgrade has already 16028 made both spots point to the same regexp body.) */ 16029 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP); 16030 assert(!SvPVX(ret_x)); 16031 ret_x->sv_u.svu_rx = temp->sv_any; 16032 temp->sv_any = NULL; 16033 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL; 16034 SvREFCNT_dec_NN(temp); 16035 /* SvCUR still resides in the xpvlv struct, so the regexp copy- 16036 ing below will not set it. */ 16037 SvCUR_set(ret_x, SvCUR(rx)); 16038 } 16039 } 16040 /* This ensures that SvTHINKFIRST(sv) is true, and hence that 16041 sv_force_normal(sv) is called. */ 16042 SvFAKE_on(ret_x); 16043 ret = ReANY(ret_x); 16044 16045 SvFLAGS(ret_x) |= SvUTF8(rx); 16046 /* We share the same string buffer as the original regexp, on which we 16047 hold a reference count, incremented when mother_re is set below. 16048 The string pointer is copied here, being part of the regexp struct. 16049 */ 16050 memcpy(&(ret->xpv_cur), &(r->xpv_cur), 16051 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); 16052 if (r->offs) { 16053 const I32 npar = r->nparens+1; 16054 Newx(ret->offs, npar, regexp_paren_pair); 16055 Copy(r->offs, ret->offs, npar, regexp_paren_pair); 16056 } 16057 if (r->substrs) { 16058 Newx(ret->substrs, 1, struct reg_substr_data); 16059 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 16060 16061 SvREFCNT_inc_void(ret->anchored_substr); 16062 SvREFCNT_inc_void(ret->anchored_utf8); 16063 SvREFCNT_inc_void(ret->float_substr); 16064 SvREFCNT_inc_void(ret->float_utf8); 16065 16066 /* check_substr and check_utf8, if non-NULL, point to either their 16067 anchored or float namesakes, and don't hold a second reference. */ 16068 } 16069 RX_MATCH_COPIED_off(ret_x); 16070 #ifdef PERL_ANY_COW 16071 ret->saved_copy = NULL; 16072 #endif 16073 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); 16074 SvREFCNT_inc_void(ret->qr_anoncv); 16075 16076 return ret_x; 16077 } 16078 #endif 16079 16080 /* regfree_internal() 16081 16082 Free the private data in a regexp. This is overloadable by 16083 extensions. Perl takes care of the regexp structure in pregfree(), 16084 this covers the *pprivate pointer which technically perl doesn't 16085 know about, however of course we have to handle the 16086 regexp_internal structure when no extension is in use. 16087 16088 Note this is called before freeing anything in the regexp 16089 structure. 16090 */ 16091 16092 void 16093 Perl_regfree_internal(pTHX_ REGEXP * const rx) 16094 { 16095 dVAR; 16096 struct regexp *const r = ReANY(rx); 16097 RXi_GET_DECL(r,ri); 16098 GET_RE_DEBUG_FLAGS_DECL; 16099 16100 PERL_ARGS_ASSERT_REGFREE_INTERNAL; 16101 16102 DEBUG_COMPILE_r({ 16103 if (!PL_colorset) 16104 reginitcolors(); 16105 { 16106 SV *dsv= sv_newmortal(); 16107 RE_PV_QUOTED_DECL(s, RX_UTF8(rx), 16108 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); 16109 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", 16110 PL_colors[4],PL_colors[5],s); 16111 } 16112 }); 16113 #ifdef RE_TRACK_PATTERN_OFFSETS 16114 if (ri->u.offsets) 16115 Safefree(ri->u.offsets); /* 20010421 MJD */ 16116 #endif 16117 if (ri->code_blocks) { 16118 int n; 16119 for (n = 0; n < ri->num_code_blocks; n++) 16120 SvREFCNT_dec(ri->code_blocks[n].src_regex); 16121 Safefree(ri->code_blocks); 16122 } 16123 16124 if (ri->data) { 16125 int n = ri->data->count; 16126 16127 while (--n >= 0) { 16128 /* If you add a ->what type here, update the comment in regcomp.h */ 16129 switch (ri->data->what[n]) { 16130 case 'a': 16131 case 'r': 16132 case 's': 16133 case 'S': 16134 case 'u': 16135 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); 16136 break; 16137 case 'f': 16138 Safefree(ri->data->data[n]); 16139 break; 16140 case 'l': 16141 case 'L': 16142 break; 16143 case 'T': 16144 { /* Aho Corasick add-on structure for a trie node. 16145 Used in stclass optimization only */ 16146 U32 refcount; 16147 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; 16148 OP_REFCNT_LOCK; 16149 refcount = --aho->refcount; 16150 OP_REFCNT_UNLOCK; 16151 if ( !refcount ) { 16152 PerlMemShared_free(aho->states); 16153 PerlMemShared_free(aho->fail); 16154 /* do this last!!!! */ 16155 PerlMemShared_free(ri->data->data[n]); 16156 PerlMemShared_free(ri->regstclass); 16157 } 16158 } 16159 break; 16160 case 't': 16161 { 16162 /* trie structure. */ 16163 U32 refcount; 16164 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; 16165 OP_REFCNT_LOCK; 16166 refcount = --trie->refcount; 16167 OP_REFCNT_UNLOCK; 16168 if ( !refcount ) { 16169 PerlMemShared_free(trie->charmap); 16170 PerlMemShared_free(trie->states); 16171 PerlMemShared_free(trie->trans); 16172 if (trie->bitmap) 16173 PerlMemShared_free(trie->bitmap); 16174 if (trie->jump) 16175 PerlMemShared_free(trie->jump); 16176 PerlMemShared_free(trie->wordinfo); 16177 /* do this last!!!! */ 16178 PerlMemShared_free(ri->data->data[n]); 16179 } 16180 } 16181 break; 16182 default: 16183 Perl_croak(aTHX_ "panic: regfree data code '%c'", 16184 ri->data->what[n]); 16185 } 16186 } 16187 Safefree(ri->data->what); 16188 Safefree(ri->data); 16189 } 16190 16191 Safefree(ri); 16192 } 16193 16194 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t)) 16195 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t)) 16196 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) 16197 16198 /* 16199 re_dup - duplicate a regexp. 16200 16201 This routine is expected to clone a given regexp structure. It is only 16202 compiled under USE_ITHREADS. 16203 16204 After all of the core data stored in struct regexp is duplicated 16205 the regexp_engine.dupe method is used to copy any private data 16206 stored in the *pprivate pointer. This allows extensions to handle 16207 any duplication it needs to do. 16208 16209 See pregfree() and regfree_internal() if you change anything here. 16210 */ 16211 #if defined(USE_ITHREADS) 16212 #ifndef PERL_IN_XSUB_RE 16213 void 16214 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) 16215 { 16216 dVAR; 16217 I32 npar; 16218 const struct regexp *r = ReANY(sstr); 16219 struct regexp *ret = ReANY(dstr); 16220 16221 PERL_ARGS_ASSERT_RE_DUP_GUTS; 16222 16223 npar = r->nparens+1; 16224 Newx(ret->offs, npar, regexp_paren_pair); 16225 Copy(r->offs, ret->offs, npar, regexp_paren_pair); 16226 16227 if (ret->substrs) { 16228 /* Do it this way to avoid reading from *r after the StructCopy(). 16229 That way, if any of the sv_dup_inc()s dislodge *r from the L1 16230 cache, it doesn't matter. */ 16231 const bool anchored = r->check_substr 16232 ? r->check_substr == r->anchored_substr 16233 : r->check_utf8 == r->anchored_utf8; 16234 Newx(ret->substrs, 1, struct reg_substr_data); 16235 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 16236 16237 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); 16238 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); 16239 ret->float_substr = sv_dup_inc(ret->float_substr, param); 16240 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); 16241 16242 /* check_substr and check_utf8, if non-NULL, point to either their 16243 anchored or float namesakes, and don't hold a second reference. */ 16244 16245 if (ret->check_substr) { 16246 if (anchored) { 16247 assert(r->check_utf8 == r->anchored_utf8); 16248 ret->check_substr = ret->anchored_substr; 16249 ret->check_utf8 = ret->anchored_utf8; 16250 } else { 16251 assert(r->check_substr == r->float_substr); 16252 assert(r->check_utf8 == r->float_utf8); 16253 ret->check_substr = ret->float_substr; 16254 ret->check_utf8 = ret->float_utf8; 16255 } 16256 } else if (ret->check_utf8) { 16257 if (anchored) { 16258 ret->check_utf8 = ret->anchored_utf8; 16259 } else { 16260 ret->check_utf8 = ret->float_utf8; 16261 } 16262 } 16263 } 16264 16265 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); 16266 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); 16267 16268 if (ret->pprivate) 16269 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); 16270 16271 if (RX_MATCH_COPIED(dstr)) 16272 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); 16273 else 16274 ret->subbeg = NULL; 16275 #ifdef PERL_ANY_COW 16276 ret->saved_copy = NULL; 16277 #endif 16278 16279 /* Whether mother_re be set or no, we need to copy the string. We 16280 cannot refrain from copying it when the storage points directly to 16281 our mother regexp, because that's 16282 1: a buffer in a different thread 16283 2: something we no longer hold a reference on 16284 so we need to copy it locally. */ 16285 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); 16286 ret->mother_re = NULL; 16287 } 16288 #endif /* PERL_IN_XSUB_RE */ 16289 16290 /* 16291 regdupe_internal() 16292 16293 This is the internal complement to regdupe() which is used to copy 16294 the structure pointed to by the *pprivate pointer in the regexp. 16295 This is the core version of the extension overridable cloning hook. 16296 The regexp structure being duplicated will be copied by perl prior 16297 to this and will be provided as the regexp *r argument, however 16298 with the /old/ structures pprivate pointer value. Thus this routine 16299 may override any copying normally done by perl. 16300 16301 It returns a pointer to the new regexp_internal structure. 16302 */ 16303 16304 void * 16305 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) 16306 { 16307 dVAR; 16308 struct regexp *const r = ReANY(rx); 16309 regexp_internal *reti; 16310 int len; 16311 RXi_GET_DECL(r,ri); 16312 16313 PERL_ARGS_ASSERT_REGDUPE_INTERNAL; 16314 16315 len = ProgLen(ri); 16316 16317 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), 16318 char, regexp_internal); 16319 Copy(ri->program, reti->program, len+1, regnode); 16320 16321 reti->num_code_blocks = ri->num_code_blocks; 16322 if (ri->code_blocks) { 16323 int n; 16324 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block, 16325 struct reg_code_block); 16326 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks, 16327 struct reg_code_block); 16328 for (n = 0; n < ri->num_code_blocks; n++) 16329 reti->code_blocks[n].src_regex = (REGEXP*) 16330 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param); 16331 } 16332 else 16333 reti->code_blocks = NULL; 16334 16335 reti->regstclass = NULL; 16336 16337 if (ri->data) { 16338 struct reg_data *d; 16339 const int count = ri->data->count; 16340 int i; 16341 16342 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), 16343 char, struct reg_data); 16344 Newx(d->what, count, U8); 16345 16346 d->count = count; 16347 for (i = 0; i < count; i++) { 16348 d->what[i] = ri->data->what[i]; 16349 switch (d->what[i]) { 16350 /* see also regcomp.h and regfree_internal() */ 16351 case 'a': /* actually an AV, but the dup function is identical. */ 16352 case 'r': 16353 case 's': 16354 case 'S': 16355 case 'u': /* actually an HV, but the dup function is identical. */ 16356 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); 16357 break; 16358 case 'f': 16359 /* This is cheating. */ 16360 Newx(d->data[i], 1, regnode_ssc); 16361 StructCopy(ri->data->data[i], d->data[i], regnode_ssc); 16362 reti->regstclass = (regnode*)d->data[i]; 16363 break; 16364 case 'T': 16365 /* Trie stclasses are readonly and can thus be shared 16366 * without duplication. We free the stclass in pregfree 16367 * when the corresponding reg_ac_data struct is freed. 16368 */ 16369 reti->regstclass= ri->regstclass; 16370 /* Fall through */ 16371 case 't': 16372 OP_REFCNT_LOCK; 16373 ((reg_trie_data*)ri->data->data[i])->refcount++; 16374 OP_REFCNT_UNLOCK; 16375 /* Fall through */ 16376 case 'l': 16377 case 'L': 16378 d->data[i] = ri->data->data[i]; 16379 break; 16380 default: 16381 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", 16382 ri->data->what[i]); 16383 } 16384 } 16385 16386 reti->data = d; 16387 } 16388 else 16389 reti->data = NULL; 16390 16391 reti->name_list_idx = ri->name_list_idx; 16392 16393 #ifdef RE_TRACK_PATTERN_OFFSETS 16394 if (ri->u.offsets) { 16395 Newx(reti->u.offsets, 2*len+1, U32); 16396 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); 16397 } 16398 #else 16399 SetProgLen(reti,len); 16400 #endif 16401 16402 return (void*)reti; 16403 } 16404 16405 #endif /* USE_ITHREADS */ 16406 16407 #ifndef PERL_IN_XSUB_RE 16408 16409 /* 16410 - regnext - dig the "next" pointer out of a node 16411 */ 16412 regnode * 16413 Perl_regnext(pTHX_ regnode *p) 16414 { 16415 dVAR; 16416 I32 offset; 16417 16418 if (!p) 16419 return(NULL); 16420 16421 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ 16422 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", 16423 (int)OP(p), (int)REGNODE_MAX); 16424 } 16425 16426 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); 16427 if (offset == 0) 16428 return(NULL); 16429 16430 return(p+offset); 16431 } 16432 #endif 16433 16434 STATIC void 16435 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) 16436 { 16437 va_list args; 16438 STRLEN l1 = strlen(pat1); 16439 STRLEN l2 = strlen(pat2); 16440 char buf[512]; 16441 SV *msv; 16442 const char *message; 16443 16444 PERL_ARGS_ASSERT_RE_CROAK2; 16445 16446 if (l1 > 510) 16447 l1 = 510; 16448 if (l1 + l2 > 510) 16449 l2 = 510 - l1; 16450 Copy(pat1, buf, l1 , char); 16451 Copy(pat2, buf + l1, l2 , char); 16452 buf[l1 + l2] = '\n'; 16453 buf[l1 + l2 + 1] = '\0'; 16454 va_start(args, pat2); 16455 msv = vmess(buf, &args); 16456 va_end(args); 16457 message = SvPV_const(msv,l1); 16458 if (l1 > 512) 16459 l1 = 512; 16460 Copy(message, buf, l1 , char); 16461 /* l1-1 to avoid \n */ 16462 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); 16463 } 16464 16465 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ 16466 16467 #ifndef PERL_IN_XSUB_RE 16468 void 16469 Perl_save_re_context(pTHX) 16470 { 16471 dVAR; 16472 I32 nparens = -1; 16473 I32 i; 16474 16475 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ 16476 16477 if (PL_curpm) { 16478 const REGEXP * const rx = PM_GETRE(PL_curpm); 16479 if (rx) 16480 nparens = RX_NPARENS(rx); 16481 } 16482 16483 /* RT #124109. This is a complete hack; in the SWASHNEW case we know 16484 * that PL_curpm will be null, but that utf8.pm and the modules it 16485 * loads will only use $1..$3. 16486 * The t/porting/re_context.t test file checks this assumption. 16487 */ 16488 if (nparens == -1) 16489 nparens = 3; 16490 16491 for (i = 1; i <= nparens; i++) { 16492 char digits[TYPE_CHARS(long)]; 16493 const STRLEN len = my_snprintf(digits, sizeof(digits), 16494 "%lu", (long)i); 16495 GV *const *const gvp 16496 = (GV**)hv_fetch(PL_defstash, digits, len, 0); 16497 16498 if (gvp) { 16499 GV * const gv = *gvp; 16500 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) 16501 save_scalar(gv); 16502 } 16503 } 16504 } 16505 #endif 16506 16507 #ifdef DEBUGGING 16508 16509 STATIC void 16510 S_put_byte(pTHX_ SV *sv, int c) 16511 { 16512 PERL_ARGS_ASSERT_PUT_BYTE; 16513 16514 if (!isPRINT(c)) { 16515 switch (c) { 16516 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; 16517 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; 16518 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; 16519 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; 16520 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; 16521 16522 default: 16523 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); 16524 break; 16525 } 16526 } 16527 else { 16528 const char string = c; 16529 if (c == '-' || c == ']' || c == '\\' || c == '^') 16530 sv_catpvs(sv, "\\"); 16531 sv_catpvn(sv, &string, 1); 16532 } 16533 } 16534 16535 STATIC void 16536 S_put_range(pTHX_ SV *sv, UV start, UV end) 16537 { 16538 16539 /* Appends to 'sv' a displayable version of the range of code points from 16540 * 'start' to 'end' */ 16541 16542 assert(start <= end); 16543 16544 PERL_ARGS_ASSERT_PUT_RANGE; 16545 16546 if (end - start < 3) { /* Individual chars in short ranges */ 16547 for (; start <= end; start++) 16548 put_byte(sv, start); 16549 } 16550 else if ( end > 255 16551 || ! isALPHANUMERIC(start) 16552 || ! isALPHANUMERIC(end) 16553 || isDIGIT(start) != isDIGIT(end) 16554 || isUPPER(start) != isUPPER(end) 16555 || isLOWER(start) != isLOWER(end) 16556 16557 /* This final test should get optimized out except on EBCDIC 16558 * platforms, where it causes ranges that cross discontinuities 16559 * like i/j to be shown as hex instead of the misleading, 16560 * e.g. H-K (since that range includes more than H, I, J, K). 16561 * */ 16562 || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start)) 16563 { 16564 Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", 16565 start, 16566 (end < 256) ? end : 255); 16567 } 16568 else { /* Here, the ends of the range are both digits, or both uppercase, 16569 or both lowercase; and there's no discontinuity in the range 16570 (which could happen on EBCDIC platforms) */ 16571 put_byte(sv, start); 16572 sv_catpvs(sv, "-"); 16573 put_byte(sv, end); 16574 } 16575 } 16576 16577 STATIC bool 16578 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) 16579 { 16580 /* Appends to 'sv' a displayable version of the innards of the bracketed 16581 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually 16582 * output anything */ 16583 16584 int i; 16585 bool has_output_anything = FALSE; 16586 16587 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; 16588 16589 for (i = 0; i < 256; i++) { 16590 if (BITMAP_TEST((U8 *) bitmap,i)) { 16591 16592 /* The character at index i should be output. Find the next 16593 * character that should NOT be output */ 16594 int j; 16595 for (j = i + 1; j < 256; j++) { 16596 if (! BITMAP_TEST((U8 *) bitmap, j)) { 16597 break; 16598 } 16599 } 16600 16601 /* Everything between them is a single range that should be output 16602 * */ 16603 put_range(sv, i, j - 1); 16604 has_output_anything = TRUE; 16605 i = j; 16606 } 16607 } 16608 16609 return has_output_anything; 16610 } 16611 16612 #define CLEAR_OPTSTART \ 16613 if (optstart) STMT_START { \ 16614 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ 16615 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ 16616 optstart=NULL; \ 16617 } STMT_END 16618 16619 #define DUMPUNTIL(b,e) \ 16620 CLEAR_OPTSTART; \ 16621 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); 16622 16623 STATIC const regnode * 16624 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, 16625 const regnode *last, const regnode *plast, 16626 SV* sv, I32 indent, U32 depth) 16627 { 16628 dVAR; 16629 U8 op = PSEUDO; /* Arbitrary non-END op. */ 16630 const regnode *next; 16631 const regnode *optstart= NULL; 16632 16633 RXi_GET_DECL(r,ri); 16634 GET_RE_DEBUG_FLAGS_DECL; 16635 16636 PERL_ARGS_ASSERT_DUMPUNTIL; 16637 16638 #ifdef DEBUG_DUMPUNTIL 16639 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, 16640 last ? last-start : 0,plast ? plast-start : 0); 16641 #endif 16642 16643 if (plast && plast < last) 16644 last= plast; 16645 16646 while (PL_regkind[op] != END && (!last || node < last)) { 16647 /* While that wasn't END last time... */ 16648 NODE_ALIGN(node); 16649 op = OP(node); 16650 if (op == CLOSE || op == WHILEM) 16651 indent--; 16652 next = regnext((regnode *)node); 16653 16654 /* Where, what. */ 16655 if (OP(node) == OPTIMIZED) { 16656 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) 16657 optstart = node; 16658 else 16659 goto after_print; 16660 } else 16661 CLEAR_OPTSTART; 16662 16663 regprop(r, sv, node, NULL); 16664 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), 16665 (int)(2*indent + 1), "", SvPVX_const(sv)); 16666 16667 if (OP(node) != OPTIMIZED) { 16668 if (next == NULL) /* Next ptr. */ 16669 PerlIO_printf(Perl_debug_log, " (0)"); 16670 else if (PL_regkind[(U8)op] == BRANCH 16671 && PL_regkind[OP(next)] != BRANCH ) 16672 PerlIO_printf(Perl_debug_log, " (FAIL)"); 16673 else 16674 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); 16675 (void)PerlIO_putc(Perl_debug_log, '\n'); 16676 } 16677 16678 after_print: 16679 if (PL_regkind[(U8)op] == BRANCHJ) { 16680 assert(next); 16681 { 16682 const regnode *nnode = (OP(next) == LONGJMP 16683 ? regnext((regnode *)next) 16684 : next); 16685 if (last && nnode > last) 16686 nnode = last; 16687 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); 16688 } 16689 } 16690 else if (PL_regkind[(U8)op] == BRANCH) { 16691 assert(next); 16692 DUMPUNTIL(NEXTOPER(node), next); 16693 } 16694 else if ( PL_regkind[(U8)op] == TRIE ) { 16695 const regnode *this_trie = node; 16696 const char op = OP(node); 16697 const U32 n = ARG(node); 16698 const reg_ac_data * const ac = op>=AHOCORASICK ? 16699 (reg_ac_data *)ri->data->data[n] : 16700 NULL; 16701 const reg_trie_data * const trie = 16702 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; 16703 #ifdef DEBUGGING 16704 AV *const trie_words 16705 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); 16706 #endif 16707 const regnode *nextbranch= NULL; 16708 I32 word_idx; 16709 sv_setpvs(sv, ""); 16710 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { 16711 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); 16712 16713 PerlIO_printf(Perl_debug_log, "%*s%s ", 16714 (int)(2*(indent+3)), "", 16715 elem_ptr 16716 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), 16717 SvCUR(*elem_ptr), 60, 16718 PL_colors[0], PL_colors[1], 16719 (SvUTF8(*elem_ptr) 16720 ? PERL_PV_ESCAPE_UNI 16721 : 0) 16722 | PERL_PV_PRETTY_ELLIPSES 16723 | PERL_PV_PRETTY_LTGT 16724 ) 16725 : "???" 16726 ); 16727 if (trie->jump) { 16728 U16 dist= trie->jump[word_idx+1]; 16729 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", 16730 (UV)((dist ? this_trie + dist : next) - start)); 16731 if (dist) { 16732 if (!nextbranch) 16733 nextbranch= this_trie + trie->jump[0]; 16734 DUMPUNTIL(this_trie + dist, nextbranch); 16735 } 16736 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) 16737 nextbranch= regnext((regnode *)nextbranch); 16738 } else { 16739 PerlIO_printf(Perl_debug_log, "\n"); 16740 } 16741 } 16742 if (last && next > last) 16743 node= last; 16744 else 16745 node= next; 16746 } 16747 else if ( op == CURLY ) { /* "next" might be very big: optimizer */ 16748 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, 16749 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); 16750 } 16751 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { 16752 assert(next); 16753 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); 16754 } 16755 else if ( op == PLUS || op == STAR) { 16756 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); 16757 } 16758 else if (PL_regkind[(U8)op] == ANYOF) { 16759 /* arglen 1 + class block */ 16760 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) 16761 ? ANYOF_POSIXL_SKIP 16762 : ANYOF_SKIP); 16763 node = NEXTOPER(node); 16764 } 16765 else if (PL_regkind[(U8)op] == EXACT) { 16766 /* Literal string, where present. */ 16767 node += NODE_SZ_STR(node) - 1; 16768 node = NEXTOPER(node); 16769 } 16770 else { 16771 node = NEXTOPER(node); 16772 node += regarglen[(U8)op]; 16773 } 16774 if (op == CURLYX || op == OPEN) 16775 indent++; 16776 } 16777 CLEAR_OPTSTART; 16778 #ifdef DEBUG_DUMPUNTIL 16779 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); 16780 #endif 16781 return node; 16782 } 16783 16784 #endif /* DEBUGGING */ 16785 16786 /* 16787 * Local variables: 16788 * c-indentation-style: bsd 16789 * c-basic-offset: 4 16790 * indent-tabs-mode: nil 16791 * End: 16792 * 16793 * ex: set ts=8 sts=4 sw=4 et: 16794 */ 16795