1 /* regcomp.c 2 */ 3 4 /* 5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee 6 * 7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] 8 */ 9 10 /* This file contains functions for compiling a regular expression. See 11 * also regexec.c which funnily enough, contains functions for executing 12 * a regular expression. 13 * 14 * This file is also copied at build time to ext/re/re_comp.c, where 15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 16 * This causes the main functions to be compiled under new names and with 17 * debugging support added, which makes "use re 'debug'" work. 18 */ 19 20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 21 * confused with the original package (see point 3 below). Thanks, Henry! 22 */ 23 24 /* Additional note: this code is very heavily munged from Henry's version 25 * in places. In some spots I've traded clarity for efficiency, so don't 26 * blame Henry for some of the lack of readability. 27 */ 28 29 /* The names of the functions have been changed from regcomp and 30 * regexec to pregcomp and pregexec in order to avoid conflicts 31 * with the POSIX routines of the same names. 32 */ 33 34 #ifdef PERL_EXT_RE_BUILD 35 #include "re_top.h" 36 #endif 37 38 /* 39 * pregcomp and pregexec -- regsub and regerror are not used in perl 40 * 41 * Copyright (c) 1986 by University of Toronto. 42 * Written by Henry Spencer. Not derived from licensed software. 43 * 44 * Permission is granted to anyone to use this software for any 45 * purpose on any computer system, and to redistribute it freely, 46 * subject to the following restrictions: 47 * 48 * 1. The author is not responsible for the consequences of use of 49 * this software, no matter how awful, even if they arise 50 * from defects in it. 51 * 52 * 2. The origin of this software must not be misrepresented, either 53 * by explicit claim or by omission. 54 * 55 * 3. Altered versions must be plainly marked as such, and must not 56 * be misrepresented as being the original software. 57 * 58 * 59 **** Alterations to Henry's code are... 60 **** 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 63 **** by Larry Wall and others 64 **** 65 **** You may distribute under the terms of either the GNU General Public 66 **** License or the Artistic License, as specified in the README file. 67 68 * 69 * Beware that some of this code is subtly aware of the way operator 70 * precedence is structured in regular expressions. Serious changes in 71 * regular-expression syntax might require a total rethink. 72 */ 73 #include "EXTERN.h" 74 #define PERL_IN_REGCOMP_C 75 #include "perl.h" 76 77 #define REG_COMP_C 78 #ifdef PERL_IN_XSUB_RE 79 # include "re_comp.h" 80 EXTERN_C const struct regexp_engine my_reg_engine; 81 #else 82 # include "regcomp.h" 83 #endif 84 85 #include "dquote_inline.h" 86 #include "invlist_inline.h" 87 #include "unicode_constants.h" 88 89 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \ 90 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) 91 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \ 92 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) 93 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 94 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 95 96 #ifndef STATIC 97 #define STATIC static 98 #endif 99 100 /* this is a chain of data about sub patterns we are processing that 101 need to be handled separately/specially in study_chunk. Its so 102 we can simulate recursion without losing state. */ 103 struct scan_frame; 104 typedef struct scan_frame { 105 regnode *last_regnode; /* last node to process in this frame */ 106 regnode *next_regnode; /* next node to process when last is reached */ 107 U32 prev_recursed_depth; 108 I32 stopparen; /* what stopparen do we use */ 109 bool in_gosub; /* this or an outer frame is for GOSUB */ 110 111 struct scan_frame *this_prev_frame; /* this previous frame */ 112 struct scan_frame *prev_frame; /* previous frame */ 113 struct scan_frame *next_frame; /* next frame */ 114 } scan_frame; 115 116 /* Certain characters are output as a sequence with the first being a 117 * backslash. */ 118 #define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c) 119 120 121 struct RExC_state_t { 122 U32 flags; /* RXf_* are we folding, multilining? */ 123 U32 pm_flags; /* PMf_* stuff from the calling PMOP */ 124 char *precomp; /* uncompiled string. */ 125 char *precomp_end; /* pointer to end of uncompiled string. */ 126 REGEXP *rx_sv; /* The SV that is the regexp. */ 127 regexp *rx; /* perl core regexp structure */ 128 regexp_internal *rxi; /* internal data for regexp object 129 pprivate field */ 130 char *start; /* Start of input for compile */ 131 char *end; /* End of input for compile */ 132 char *parse; /* Input-scan pointer. */ 133 char *copy_start; /* start of copy of input within 134 constructed parse string */ 135 char *save_copy_start; /* Provides one level of saving 136 and restoring 'copy_start' */ 137 char *copy_start_in_input; /* Position in input string 138 corresponding to copy_start */ 139 SSize_t whilem_seen; /* number of WHILEM in this expr */ 140 regnode *emit_start; /* Start of emitted-code area */ 141 regnode_offset emit; /* Code-emit pointer */ 142 I32 naughty; /* How bad is this pattern? */ 143 I32 sawback; /* Did we see \1, ...? */ 144 U32 seen; 145 SSize_t size; /* Number of regnode equivalents in 146 pattern */ 147 148 /* position beyond 'precomp' of the warning message furthest away from 149 * 'precomp'. During the parse, no warnings are raised for any problems 150 * earlier in the parse than this position. This works if warnings are 151 * raised the first time a given spot is parsed, and if only one 152 * independent warning is raised for any given spot */ 153 Size_t latest_warn_offset; 154 155 I32 npar; /* Capture buffer count so far in the 156 parse, (OPEN) plus one. ("par" 0 is 157 the whole pattern)*/ 158 I32 total_par; /* During initial parse, is either 0, 159 or -1; the latter indicating a 160 reparse is needed. After that pass, 161 it is what 'npar' became after the 162 pass. Hence, it being > 0 indicates 163 we are in a reparse situation */ 164 I32 nestroot; /* root parens we are in - used by 165 accept */ 166 I32 seen_zerolen; 167 regnode_offset *open_parens; /* offsets to open parens */ 168 regnode_offset *close_parens; /* offsets to close parens */ 169 I32 parens_buf_size; /* #slots malloced open/close_parens */ 170 regnode *end_op; /* END node in program */ 171 I32 utf8; /* whether the pattern is utf8 or not */ 172 I32 orig_utf8; /* whether the pattern was originally in utf8 */ 173 /* XXX use this for future optimisation of case 174 * where pattern must be upgraded to utf8. */ 175 I32 uni_semantics; /* If a d charset modifier should use unicode 176 rules, even if the pattern is not in 177 utf8 */ 178 HV *paren_names; /* Paren names */ 179 180 regnode **recurse; /* Recurse regops */ 181 I32 recurse_count; /* Number of recurse regops we have generated */ 182 U8 *study_chunk_recursed; /* bitmap of which subs we have moved 183 through */ 184 U32 study_chunk_recursed_bytes; /* bytes in bitmap */ 185 I32 in_lookbehind; 186 I32 contains_locale; 187 I32 override_recoding; 188 #ifdef EBCDIC 189 I32 recode_x_to_native; 190 #endif 191 I32 in_multi_char_class; 192 struct reg_code_blocks *code_blocks;/* positions of literal (?{}) 193 within pattern */ 194 int code_index; /* next code_blocks[] slot */ 195 SSize_t maxlen; /* mininum possible number of chars in string to match */ 196 scan_frame *frame_head; 197 scan_frame *frame_last; 198 U32 frame_count; 199 AV *warn_text; 200 HV *unlexed_names; 201 #ifdef ADD_TO_REGEXEC 202 char *starttry; /* -Dr: where regtry was called. */ 203 #define RExC_starttry (pRExC_state->starttry) 204 #endif 205 SV *runtime_code_qr; /* qr with the runtime code blocks */ 206 #ifdef DEBUGGING 207 const char *lastparse; 208 I32 lastnum; 209 AV *paren_name_list; /* idx -> name */ 210 U32 study_chunk_recursed_count; 211 SV *mysv1; 212 SV *mysv2; 213 214 #define RExC_lastparse (pRExC_state->lastparse) 215 #define RExC_lastnum (pRExC_state->lastnum) 216 #define RExC_paren_name_list (pRExC_state->paren_name_list) 217 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count) 218 #define RExC_mysv (pRExC_state->mysv1) 219 #define RExC_mysv1 (pRExC_state->mysv1) 220 #define RExC_mysv2 (pRExC_state->mysv2) 221 222 #endif 223 bool seen_d_op; 224 bool strict; 225 bool study_started; 226 bool in_script_run; 227 bool use_BRANCHJ; 228 }; 229 230 #define RExC_flags (pRExC_state->flags) 231 #define RExC_pm_flags (pRExC_state->pm_flags) 232 #define RExC_precomp (pRExC_state->precomp) 233 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input) 234 #define RExC_copy_start_in_constructed (pRExC_state->copy_start) 235 #define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start) 236 #define RExC_precomp_end (pRExC_state->precomp_end) 237 #define RExC_rx_sv (pRExC_state->rx_sv) 238 #define RExC_rx (pRExC_state->rx) 239 #define RExC_rxi (pRExC_state->rxi) 240 #define RExC_start (pRExC_state->start) 241 #define RExC_end (pRExC_state->end) 242 #define RExC_parse (pRExC_state->parse) 243 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset ) 244 #define RExC_whilem_seen (pRExC_state->whilem_seen) 245 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs 246 under /d from /u ? */ 247 248 249 #ifdef RE_TRACK_PATTERN_OFFSETS 250 # define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the 251 others */ 252 #endif 253 #define RExC_emit (pRExC_state->emit) 254 #define RExC_emit_start (pRExC_state->emit_start) 255 #define RExC_sawback (pRExC_state->sawback) 256 #define RExC_seen (pRExC_state->seen) 257 #define RExC_size (pRExC_state->size) 258 #define RExC_maxlen (pRExC_state->maxlen) 259 #define RExC_npar (pRExC_state->npar) 260 #define RExC_total_parens (pRExC_state->total_par) 261 #define RExC_parens_buf_size (pRExC_state->parens_buf_size) 262 #define RExC_nestroot (pRExC_state->nestroot) 263 #define RExC_seen_zerolen (pRExC_state->seen_zerolen) 264 #define RExC_utf8 (pRExC_state->utf8) 265 #define RExC_uni_semantics (pRExC_state->uni_semantics) 266 #define RExC_orig_utf8 (pRExC_state->orig_utf8) 267 #define RExC_open_parens (pRExC_state->open_parens) 268 #define RExC_close_parens (pRExC_state->close_parens) 269 #define RExC_end_op (pRExC_state->end_op) 270 #define RExC_paren_names (pRExC_state->paren_names) 271 #define RExC_recurse (pRExC_state->recurse) 272 #define RExC_recurse_count (pRExC_state->recurse_count) 273 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) 274 #define RExC_study_chunk_recursed_bytes \ 275 (pRExC_state->study_chunk_recursed_bytes) 276 #define RExC_in_lookbehind (pRExC_state->in_lookbehind) 277 #define RExC_contains_locale (pRExC_state->contains_locale) 278 #ifdef EBCDIC 279 # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) 280 #endif 281 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) 282 #define RExC_frame_head (pRExC_state->frame_head) 283 #define RExC_frame_last (pRExC_state->frame_last) 284 #define RExC_frame_count (pRExC_state->frame_count) 285 #define RExC_strict (pRExC_state->strict) 286 #define RExC_study_started (pRExC_state->study_started) 287 #define RExC_warn_text (pRExC_state->warn_text) 288 #define RExC_in_script_run (pRExC_state->in_script_run) 289 #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ) 290 #define RExC_unlexed_names (pRExC_state->unlexed_names) 291 292 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set 293 * a flag to disable back-off on the fixed/floating substrings - if it's 294 * a high complexity pattern we assume the benefit of avoiding a full match 295 * is worth the cost of checking for the substrings even if they rarely help. 296 */ 297 #define RExC_naughty (pRExC_state->naughty) 298 #define TOO_NAUGHTY (10) 299 #define MARK_NAUGHTY(add) \ 300 if (RExC_naughty < TOO_NAUGHTY) \ 301 RExC_naughty += (add) 302 #define MARK_NAUGHTY_EXP(exp, add) \ 303 if (RExC_naughty < TOO_NAUGHTY) \ 304 RExC_naughty += RExC_naughty / (exp) + (add) 305 306 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') 307 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ 308 ((*s) == '{' && regcurly(s))) 309 310 /* 311 * Flags to be passed up and down. 312 */ 313 #define WORST 0 /* Worst case. */ 314 #define HASWIDTH 0x01 /* Known to not match null strings, could match 315 non-null ones. */ 316 317 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single 318 * character. (There needs to be a case: in the switch statement in regexec.c 319 * for any node marked SIMPLE.) Note that this is not the same thing as 320 * REGNODE_SIMPLE */ 321 #define SIMPLE 0x02 322 #define SPSTART 0x04 /* Starts with * or + */ 323 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ 324 #define TRYAGAIN 0x10 /* Weeded out a declaration. */ 325 #define RESTART_PARSE 0x20 /* Need to redo the parse */ 326 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to 327 calcuate sizes as UTF-8 */ 328 329 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) 330 331 /* whether trie related optimizations are enabled */ 332 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 333 #define TRIE_STUDY_OPT 334 #define FULL_TRIE_STUDY 335 #define TRIE_STCLASS 336 #endif 337 338 339 340 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] 341 #define PBITVAL(paren) (1 << ((paren) & 7)) 342 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) 343 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) 344 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) 345 346 #define REQUIRE_UTF8(flagp) STMT_START { \ 347 if (!UTF) { \ 348 *flagp = RESTART_PARSE|NEED_UTF8; \ 349 return 0; \ 350 } \ 351 } STMT_END 352 353 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is 354 * a flag that indicates we need to override /d with /u as a result of 355 * something in the pattern. It should only be used in regards to calling 356 * set_regex_charset() or get_regex_charse() */ 357 #define REQUIRE_UNI_RULES(flagp, restart_retval) \ 358 STMT_START { \ 359 if (DEPENDS_SEMANTICS) { \ 360 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ 361 RExC_uni_semantics = 1; \ 362 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \ 363 /* No need to restart the parse if we haven't seen \ 364 * anything that differs between /u and /d, and no need \ 365 * to restart immediately if we're going to reparse \ 366 * anyway to count parens */ \ 367 *flagp |= RESTART_PARSE; \ 368 return restart_retval; \ 369 } \ 370 } \ 371 } STMT_END 372 373 #define REQUIRE_BRANCHJ(flagp, restart_retval) \ 374 STMT_START { \ 375 RExC_use_BRANCHJ = 1; \ 376 *flagp |= RESTART_PARSE; \ 377 return restart_retval; \ 378 } STMT_END 379 380 /* Until we have completed the parse, we leave RExC_total_parens at 0 or 381 * less. After that, it must always be positive, because the whole re is 382 * considered to be surrounded by virtual parens. Setting it to negative 383 * indicates there is some construct that needs to know the actual number of 384 * parens to be properly handled. And that means an extra pass will be 385 * required after we've counted them all */ 386 #define ALL_PARENS_COUNTED (RExC_total_parens > 0) 387 #define REQUIRE_PARENS_PASS \ 388 STMT_START { /* No-op if have completed a pass */ \ 389 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \ 390 } STMT_END 391 #define IN_PARENS_PASS (RExC_total_parens < 0) 392 393 394 /* This is used to return failure (zero) early from the calling function if 395 * various flags in 'flags' are set. Two flags always cause a return: 396 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any 397 * additional flags that should cause a return; 0 if none. If the return will 398 * be done, '*flagp' is first set to be all of the flags that caused the 399 * return. */ 400 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \ 401 STMT_START { \ 402 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \ 403 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \ 404 return 0; \ 405 } \ 406 } STMT_END 407 408 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE)) 409 410 #define RETURN_FAIL_ON_RESTART(flags,flagp) \ 411 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0) 412 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \ 413 if (MUST_RESTART(*(flagp))) return 0 414 415 /* This converts the named class defined in regcomp.h to its equivalent class 416 * number defined in handy.h. */ 417 #define namedclass_to_classnum(class) ((int) ((class) / 2)) 418 #define classnum_to_namedclass(classnum) ((classnum) * 2) 419 420 #define _invlist_union_complement_2nd(a, b, output) \ 421 _invlist_union_maybe_complement_2nd(a, b, TRUE, output) 422 #define _invlist_intersection_complement_2nd(a, b, output) \ 423 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) 424 425 /* About scan_data_t. 426 427 During optimisation we recurse through the regexp program performing 428 various inplace (keyhole style) optimisations. In addition study_chunk 429 and scan_commit populate this data structure with information about 430 what strings MUST appear in the pattern. We look for the longest 431 string that must appear at a fixed location, and we look for the 432 longest string that may appear at a floating location. So for instance 433 in the pattern: 434 435 /FOO[xX]A.*B[xX]BAR/ 436 437 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating 438 strings (because they follow a .* construct). study_chunk will identify 439 both FOO and BAR as being the longest fixed and floating strings respectively. 440 441 The strings can be composites, for instance 442 443 /(f)(o)(o)/ 444 445 will result in a composite fixed substring 'foo'. 446 447 For each string some basic information is maintained: 448 449 - min_offset 450 This is the position the string must appear at, or not before. 451 It also implicitly (when combined with minlenp) tells us how many 452 characters must match before the string we are searching for. 453 Likewise when combined with minlenp and the length of the string it 454 tells us how many characters must appear after the string we have 455 found. 456 457 - max_offset 458 Only used for floating strings. This is the rightmost point that 459 the string can appear at. If set to SSize_t_MAX it indicates that the 460 string can occur infinitely far to the right. 461 For fixed strings, it is equal to min_offset. 462 463 - minlenp 464 A pointer to the minimum number of characters of the pattern that the 465 string was found inside. This is important as in the case of positive 466 lookahead or positive lookbehind we can have multiple patterns 467 involved. Consider 468 469 /(?=FOO).*F/ 470 471 The minimum length of the pattern overall is 3, the minimum length 472 of the lookahead part is 3, but the minimum length of the part that 473 will actually match is 1. So 'FOO's minimum length is 3, but the 474 minimum length for the F is 1. This is important as the minimum length 475 is used to determine offsets in front of and behind the string being 476 looked for. Since strings can be composites this is the length of the 477 pattern at the time it was committed with a scan_commit. Note that 478 the length is calculated by study_chunk, so that the minimum lengths 479 are not known until the full pattern has been compiled, thus the 480 pointer to the value. 481 482 - lookbehind 483 484 In the case of lookbehind the string being searched for can be 485 offset past the start point of the final matching string. 486 If this value was just blithely removed from the min_offset it would 487 invalidate some of the calculations for how many chars must match 488 before or after (as they are derived from min_offset and minlen and 489 the length of the string being searched for). 490 When the final pattern is compiled and the data is moved from the 491 scan_data_t structure into the regexp structure the information 492 about lookbehind is factored in, with the information that would 493 have been lost precalculated in the end_shift field for the 494 associated string. 495 496 The fields pos_min and pos_delta are used to store the minimum offset 497 and the delta to the maximum offset at the current point in the pattern. 498 499 */ 500 501 struct scan_data_substrs { 502 SV *str; /* longest substring found in pattern */ 503 SSize_t min_offset; /* earliest point in string it can appear */ 504 SSize_t max_offset; /* latest point in string it can appear */ 505 SSize_t *minlenp; /* pointer to the minlen relevant to the string */ 506 SSize_t lookbehind; /* is the pos of the string modified by LB */ 507 I32 flags; /* per substring SF_* and SCF_* flags */ 508 }; 509 510 typedef struct scan_data_t { 511 /*I32 len_min; unused */ 512 /*I32 len_delta; unused */ 513 SSize_t pos_min; 514 SSize_t pos_delta; 515 SV *last_found; 516 SSize_t last_end; /* min value, <0 unless valid. */ 517 SSize_t last_start_min; 518 SSize_t last_start_max; 519 U8 cur_is_floating; /* whether the last_* values should be set as 520 * the next fixed (0) or floating (1) 521 * substring */ 522 523 /* [0] is longest fixed substring so far, [1] is longest float so far */ 524 struct scan_data_substrs substrs[2]; 525 526 I32 flags; /* common SF_* and SCF_* flags */ 527 I32 whilem_c; 528 SSize_t *last_closep; 529 regnode_ssc *start_class; 530 } scan_data_t; 531 532 /* 533 * Forward declarations for pregcomp()'s friends. 534 */ 535 536 static const scan_data_t zero_scan_data = { 537 0, 0, NULL, 0, 0, 0, 0, 538 { 539 { NULL, 0, 0, 0, 0, 0 }, 540 { NULL, 0, 0, 0, 0, 0 }, 541 }, 542 0, 0, NULL, NULL 543 }; 544 545 /* study flags */ 546 547 #define SF_BEFORE_SEOL 0x0001 548 #define SF_BEFORE_MEOL 0x0002 549 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) 550 551 #define SF_IS_INF 0x0040 552 #define SF_HAS_PAR 0x0080 553 #define SF_IN_PAR 0x0100 554 #define SF_HAS_EVAL 0x0200 555 556 557 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the 558 * longest substring in the pattern. When it is not set the optimiser keeps 559 * track of position, but does not keep track of the actual strings seen, 560 * 561 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but 562 * /foo/i will not. 563 * 564 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble" 565 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be 566 * turned off because of the alternation (BRANCH). */ 567 #define SCF_DO_SUBSTR 0x0400 568 569 #define SCF_DO_STCLASS_AND 0x0800 570 #define SCF_DO_STCLASS_OR 0x1000 571 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) 572 #define SCF_WHILEM_VISITED_POS 0x2000 573 574 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ 575 #define SCF_SEEN_ACCEPT 0x8000 576 #define SCF_TRIE_DOING_RESTUDY 0x10000 577 #define SCF_IN_DEFINE 0x20000 578 579 580 581 582 #define UTF cBOOL(RExC_utf8) 583 584 /* The enums for all these are ordered so things work out correctly */ 585 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) 586 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ 587 == REGEX_DEPENDS_CHARSET) 588 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) 589 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ 590 >= REGEX_UNICODE_CHARSET) 591 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ 592 == REGEX_ASCII_RESTRICTED_CHARSET) 593 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ 594 >= REGEX_ASCII_RESTRICTED_CHARSET) 595 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ 596 == REGEX_ASCII_MORE_RESTRICTED_CHARSET) 597 598 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) 599 600 /* For programs that want to be strictly Unicode compatible by dying if any 601 * attempt is made to match a non-Unicode code point against a Unicode 602 * property. */ 603 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) 604 605 #define OOB_NAMEDCLASS -1 606 607 /* There is no code point that is out-of-bounds, so this is problematic. But 608 * its only current use is to initialize a variable that is always set before 609 * looked at. */ 610 #define OOB_UNICODE 0xDEADBEEF 611 612 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) 613 614 615 /* length of regex to show in messages that don't mark a position within */ 616 #define RegexLengthToShowInErrorMessages 127 617 618 /* 619 * If MARKER[12] are adjusted, be sure to adjust the constants at the top 620 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in 621 * op/pragma/warn/regcomp. 622 */ 623 #define MARKER1 "<-- HERE" /* marker as it appears in the description */ 624 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ 625 626 #define REPORT_LOCATION " in regex; marked by " MARKER1 \ 627 " in m/%" UTF8f MARKER2 "%" UTF8f "/" 628 629 /* The code in this file in places uses one level of recursion with parsing 630 * rebased to an alternate string constructed by us in memory. This can take 631 * the form of something that is completely different from the input, or 632 * something that uses the input as part of the alternate. In the first case, 633 * there should be no possibility of an error, as we are in complete control of 634 * the alternate string. But in the second case we don't completely control 635 * the input portion, so there may be errors in that. Here's an example: 636 * /[abc\x{DF}def]/ui 637 * is handled specially because \x{df} folds to a sequence of more than one 638 * character: 'ss'. What is done is to create and parse an alternate string, 639 * which looks like this: 640 * /(?:\x{DF}|[abc\x{DF}def])/ui 641 * where it uses the input unchanged in the middle of something it constructs, 642 * which is a branch for the DF outside the character class, and clustering 643 * parens around the whole thing. (It knows enough to skip the DF inside the 644 * class while in this substitute parse.) 'abc' and 'def' may have errors that 645 * need to be reported. The general situation looks like this: 646 * 647 * |<------- identical ------>| 648 * sI tI xI eI 649 * Input: --------------------------------------------------------------- 650 * Constructed: --------------------------------------------------- 651 * sC tC xC eC EC 652 * |<------- identical ------>| 653 * 654 * sI..eI is the portion of the input pattern we are concerned with here. 655 * sC..EC is the constructed substitute parse string. 656 * sC..tC is constructed by us 657 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI. 658 * In the diagram, these are vertically aligned. 659 * eC..EC is also constructed by us. 660 * xC is the position in the substitute parse string where we found a 661 * problem. 662 * xI is the position in the original pattern corresponding to xC. 663 * 664 * We want to display a message showing the real input string. Thus we need to 665 * translate from xC to xI. We know that xC >= tC, since the portion of the 666 * string sC..tC has been constructed by us, and so shouldn't have errors. We 667 * get: 668 * xI = tI + (xC - tC) 669 * 670 * When the substitute parse is constructed, the code needs to set: 671 * RExC_start (sC) 672 * RExC_end (eC) 673 * RExC_copy_start_in_input (tI) 674 * RExC_copy_start_in_constructed (tC) 675 * and restore them when done. 676 * 677 * During normal processing of the input pattern, both 678 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to 679 * sI, so that xC equals xI. 680 */ 681 682 #define sI RExC_precomp 683 #define eI RExC_precomp_end 684 #define sC RExC_start 685 #define eC RExC_end 686 #define tI RExC_copy_start_in_input 687 #define tC RExC_copy_start_in_constructed 688 #define xI(xC) (tI + (xC - tC)) 689 #define xI_offset(xC) (xI(xC) - sI) 690 691 #define REPORT_LOCATION_ARGS(xC) \ 692 UTF8fARG(UTF, \ 693 (xI(xC) > eI) /* Don't run off end */ \ 694 ? eI - sI /* Length before the <--HERE */ \ 695 : ((xI_offset(xC) >= 0) \ 696 ? xI_offset(xC) \ 697 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \ 698 IVdf " trying to output message for " \ 699 " pattern %.*s", \ 700 __FILE__, __LINE__, (IV) xI_offset(xC), \ 701 ((int) (eC - sC)), sC), 0)), \ 702 sI), /* The input pattern printed up to the <--HERE */ \ 703 UTF8fARG(UTF, \ 704 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \ 705 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */ 706 707 /* Used to point after bad bytes for an error message, but avoid skipping 708 * past a nul byte. */ 709 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1) 710 711 /* Set up to clean up after our imminent demise */ 712 #define PREPARE_TO_DIE \ 713 STMT_START { \ 714 if (RExC_rx_sv) \ 715 SAVEFREESV(RExC_rx_sv); \ 716 if (RExC_open_parens) \ 717 SAVEFREEPV(RExC_open_parens); \ 718 if (RExC_close_parens) \ 719 SAVEFREEPV(RExC_close_parens); \ 720 } STMT_END 721 722 /* 723 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 724 * arg. Show regex, up to a maximum length. If it's too long, chop and add 725 * "...". 726 */ 727 #define _FAIL(code) STMT_START { \ 728 const char *ellipses = ""; \ 729 IV len = RExC_precomp_end - RExC_precomp; \ 730 \ 731 PREPARE_TO_DIE; \ 732 if (len > RegexLengthToShowInErrorMessages) { \ 733 /* chop 10 shorter than the max, to ensure meaning of "..." */ \ 734 len = RegexLengthToShowInErrorMessages - 10; \ 735 ellipses = "..."; \ 736 } \ 737 code; \ 738 } STMT_END 739 740 #define FAIL(msg) _FAIL( \ 741 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \ 742 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) 743 744 #define FAIL2(msg,arg) _FAIL( \ 745 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ 746 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) 747 748 /* 749 * Simple_vFAIL -- like FAIL, but marks the current location in the scan 750 */ 751 #define Simple_vFAIL(m) STMT_START { \ 752 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ 753 m, REPORT_LOCATION_ARGS(RExC_parse)); \ 754 } STMT_END 755 756 /* 757 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() 758 */ 759 #define vFAIL(m) STMT_START { \ 760 PREPARE_TO_DIE; \ 761 Simple_vFAIL(m); \ 762 } STMT_END 763 764 /* 765 * Like Simple_vFAIL(), but accepts two arguments. 766 */ 767 #define Simple_vFAIL2(m,a1) STMT_START { \ 768 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ 769 REPORT_LOCATION_ARGS(RExC_parse)); \ 770 } STMT_END 771 772 /* 773 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). 774 */ 775 #define vFAIL2(m,a1) STMT_START { \ 776 PREPARE_TO_DIE; \ 777 Simple_vFAIL2(m, a1); \ 778 } STMT_END 779 780 781 /* 782 * Like Simple_vFAIL(), but accepts three arguments. 783 */ 784 #define Simple_vFAIL3(m, a1, a2) STMT_START { \ 785 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ 786 REPORT_LOCATION_ARGS(RExC_parse)); \ 787 } STMT_END 788 789 /* 790 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). 791 */ 792 #define vFAIL3(m,a1,a2) STMT_START { \ 793 PREPARE_TO_DIE; \ 794 Simple_vFAIL3(m, a1, a2); \ 795 } STMT_END 796 797 /* 798 * Like Simple_vFAIL(), but accepts four arguments. 799 */ 800 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ 801 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ 802 REPORT_LOCATION_ARGS(RExC_parse)); \ 803 } STMT_END 804 805 #define vFAIL4(m,a1,a2,a3) STMT_START { \ 806 PREPARE_TO_DIE; \ 807 Simple_vFAIL4(m, a1, a2, a3); \ 808 } STMT_END 809 810 /* A specialized version of vFAIL2 that works with UTF8f */ 811 #define vFAIL2utf8f(m, a1) STMT_START { \ 812 PREPARE_TO_DIE; \ 813 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ 814 REPORT_LOCATION_ARGS(RExC_parse)); \ 815 } STMT_END 816 817 #define vFAIL3utf8f(m, a1, a2) STMT_START { \ 818 PREPARE_TO_DIE; \ 819 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ 820 REPORT_LOCATION_ARGS(RExC_parse)); \ 821 } STMT_END 822 823 /* Setting this to NULL is a signal to not output warnings */ 824 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \ 825 STMT_START { \ 826 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\ 827 RExC_copy_start_in_constructed = NULL; \ 828 } STMT_END 829 #define RESTORE_WARNINGS \ 830 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed 831 832 /* Since a warning can be generated multiple times as the input is reparsed, we 833 * output it the first time we come to that point in the parse, but suppress it 834 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not 835 * generate any warnings */ 836 #define TO_OUTPUT_WARNINGS(loc) \ 837 ( RExC_copy_start_in_constructed \ 838 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset) 839 840 /* After we've emitted a warning, we save the position in the input so we don't 841 * output it again */ 842 #define UPDATE_WARNINGS_LOC(loc) \ 843 STMT_START { \ 844 if (TO_OUTPUT_WARNINGS(loc)) { \ 845 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \ 846 - RExC_precomp; \ 847 } \ 848 } STMT_END 849 850 /* 'warns' is the output of the packWARNx macro used in 'code' */ 851 #define _WARN_HELPER(loc, warns, code) \ 852 STMT_START { \ 853 if (! RExC_copy_start_in_constructed) { \ 854 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \ 855 " expected at '%s'", \ 856 __FILE__, __LINE__, loc); \ 857 } \ 858 if (TO_OUTPUT_WARNINGS(loc)) { \ 859 if (ckDEAD(warns)) \ 860 PREPARE_TO_DIE; \ 861 code; \ 862 UPDATE_WARNINGS_LOC(loc); \ 863 } \ 864 } STMT_END 865 866 /* m is not necessarily a "literal string", in this macro */ 867 #define reg_warn_non_literal_string(loc, m) \ 868 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 869 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 870 "%s" REPORT_LOCATION, \ 871 m, REPORT_LOCATION_ARGS(loc))) 872 873 #define ckWARNreg(loc,m) \ 874 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 875 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 876 m REPORT_LOCATION, \ 877 REPORT_LOCATION_ARGS(loc))) 878 879 #define vWARN(loc, m) \ 880 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 881 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 882 m REPORT_LOCATION, \ 883 REPORT_LOCATION_ARGS(loc))) \ 884 885 #define vWARN_dep(loc, m) \ 886 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \ 887 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \ 888 m REPORT_LOCATION, \ 889 REPORT_LOCATION_ARGS(loc))) 890 891 #define ckWARNdep(loc,m) \ 892 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \ 893 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ 894 m REPORT_LOCATION, \ 895 REPORT_LOCATION_ARGS(loc))) 896 897 #define ckWARNregdep(loc,m) \ 898 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ 899 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \ 900 WARN_REGEXP), \ 901 m REPORT_LOCATION, \ 902 REPORT_LOCATION_ARGS(loc))) 903 904 #define ckWARN2reg_d(loc,m, a1) \ 905 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 906 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ 907 m REPORT_LOCATION, \ 908 a1, REPORT_LOCATION_ARGS(loc))) 909 910 #define ckWARN2reg(loc, m, a1) \ 911 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 912 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 913 m REPORT_LOCATION, \ 914 a1, REPORT_LOCATION_ARGS(loc))) 915 916 #define vWARN3(loc, m, a1, a2) \ 917 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 918 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 919 m REPORT_LOCATION, \ 920 a1, a2, REPORT_LOCATION_ARGS(loc))) 921 922 #define ckWARN3reg(loc, m, a1, a2) \ 923 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 924 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 925 m REPORT_LOCATION, \ 926 a1, a2, \ 927 REPORT_LOCATION_ARGS(loc))) 928 929 #define vWARN4(loc, m, a1, a2, a3) \ 930 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 931 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 932 m REPORT_LOCATION, \ 933 a1, a2, a3, \ 934 REPORT_LOCATION_ARGS(loc))) 935 936 #define ckWARN4reg(loc, m, a1, a2, a3) \ 937 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 938 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 939 m REPORT_LOCATION, \ 940 a1, a2, a3, \ 941 REPORT_LOCATION_ARGS(loc))) 942 943 #define vWARN5(loc, m, a1, a2, a3, a4) \ 944 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 945 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 946 m REPORT_LOCATION, \ 947 a1, a2, a3, a4, \ 948 REPORT_LOCATION_ARGS(loc))) 949 950 #define ckWARNexperimental(loc, class, m) \ 951 _WARN_HELPER(loc, packWARN(class), \ 952 Perl_ck_warner_d(aTHX_ packWARN(class), \ 953 m REPORT_LOCATION, \ 954 REPORT_LOCATION_ARGS(loc))) 955 956 /* Convert between a pointer to a node and its offset from the beginning of the 957 * program */ 958 #define REGNODE_p(offset) (RExC_emit_start + (offset)) 959 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start) 960 961 /* Macros for recording node offsets. 20001227 mjd@plover.com 962 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in 963 * element 2*n-1 of the array. Element #2n holds the byte length node #n. 964 * Element 0 holds the number n. 965 * Position is 1 indexed. 966 */ 967 #ifndef RE_TRACK_PATTERN_OFFSETS 968 #define Set_Node_Offset_To_R(offset,byte) 969 #define Set_Node_Offset(node,byte) 970 #define Set_Cur_Node_Offset 971 #define Set_Node_Length_To_R(node,len) 972 #define Set_Node_Length(node,len) 973 #define Set_Node_Cur_Length(node,start) 974 #define Node_Offset(n) 975 #define Node_Length(n) 976 #define Set_Node_Offset_Length(node,offset,len) 977 #define ProgLen(ri) ri->u.proglen 978 #define SetProgLen(ri,x) ri->u.proglen = x 979 #define Track_Code(code) 980 #else 981 #define ProgLen(ri) ri->u.offsets[0] 982 #define SetProgLen(ri,x) ri->u.offsets[0] = x 983 #define Set_Node_Offset_To_R(offset,byte) STMT_START { \ 984 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ 985 __LINE__, (int)(offset), (int)(byte))); \ 986 if((offset) < 0) { \ 987 Perl_croak(aTHX_ "value of node is %d in Offset macro", \ 988 (int)(offset)); \ 989 } else { \ 990 RExC_offsets[2*(offset)-1] = (byte); \ 991 } \ 992 } STMT_END 993 994 #define Set_Node_Offset(node,byte) \ 995 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start) 996 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) 997 998 #define Set_Node_Length_To_R(node,len) STMT_START { \ 999 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ 1000 __LINE__, (int)(node), (int)(len))); \ 1001 if((node) < 0) { \ 1002 Perl_croak(aTHX_ "value of node is %d in Length macro", \ 1003 (int)(node)); \ 1004 } else { \ 1005 RExC_offsets[2*(node)] = (len); \ 1006 } \ 1007 } STMT_END 1008 1009 #define Set_Node_Length(node,len) \ 1010 Set_Node_Length_To_R(REGNODE_OFFSET(node), len) 1011 #define Set_Node_Cur_Length(node, start) \ 1012 Set_Node_Length(node, RExC_parse - start) 1013 1014 /* Get offsets and lengths */ 1015 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1]) 1016 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))]) 1017 1018 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \ 1019 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \ 1020 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \ 1021 } STMT_END 1022 1023 #define Track_Code(code) STMT_START { code } STMT_END 1024 #endif 1025 1026 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 1027 #define EXPERIMENTAL_INPLACESCAN 1028 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ 1029 1030 #ifdef DEBUGGING 1031 int 1032 Perl_re_printf(pTHX_ const char *fmt, ...) 1033 { 1034 va_list ap; 1035 int result; 1036 PerlIO *f= Perl_debug_log; 1037 PERL_ARGS_ASSERT_RE_PRINTF; 1038 va_start(ap, fmt); 1039 result = PerlIO_vprintf(f, fmt, ap); 1040 va_end(ap); 1041 return result; 1042 } 1043 1044 int 1045 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) 1046 { 1047 va_list ap; 1048 int result; 1049 PerlIO *f= Perl_debug_log; 1050 PERL_ARGS_ASSERT_RE_INDENTF; 1051 va_start(ap, depth); 1052 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, ""); 1053 result = PerlIO_vprintf(f, fmt, ap); 1054 va_end(ap); 1055 return result; 1056 } 1057 #endif /* DEBUGGING */ 1058 1059 #define DEBUG_RExC_seen() \ 1060 DEBUG_OPTIMISE_MORE_r({ \ 1061 Perl_re_printf( aTHX_ "RExC_seen: "); \ 1062 \ 1063 if (RExC_seen & REG_ZERO_LEN_SEEN) \ 1064 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \ 1065 \ 1066 if (RExC_seen & REG_LOOKBEHIND_SEEN) \ 1067 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \ 1068 \ 1069 if (RExC_seen & REG_GPOS_SEEN) \ 1070 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \ 1071 \ 1072 if (RExC_seen & REG_RECURSE_SEEN) \ 1073 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \ 1074 \ 1075 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ 1076 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \ 1077 \ 1078 if (RExC_seen & REG_VERBARG_SEEN) \ 1079 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \ 1080 \ 1081 if (RExC_seen & REG_CUTGROUP_SEEN) \ 1082 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \ 1083 \ 1084 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ 1085 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \ 1086 \ 1087 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ 1088 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \ 1089 \ 1090 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ 1091 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \ 1092 \ 1093 Perl_re_printf( aTHX_ "\n"); \ 1094 }); 1095 1096 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \ 1097 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag) 1098 1099 1100 #ifdef DEBUGGING 1101 static void 1102 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str, 1103 const char *close_str) 1104 { 1105 if (!flags) 1106 return; 1107 1108 Perl_re_printf( aTHX_ "%s", open_str); 1109 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL); 1110 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL); 1111 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF); 1112 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR); 1113 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR); 1114 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL); 1115 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR); 1116 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND); 1117 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR); 1118 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS); 1119 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS); 1120 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY); 1121 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT); 1122 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY); 1123 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE); 1124 Perl_re_printf( aTHX_ "%s", close_str); 1125 } 1126 1127 1128 static void 1129 S_debug_studydata(pTHX_ const char *where, scan_data_t *data, 1130 U32 depth, int is_inf) 1131 { 1132 GET_RE_DEBUG_FLAGS_DECL; 1133 1134 DEBUG_OPTIMISE_MORE_r({ 1135 if (!data) 1136 return; 1137 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf, 1138 depth, 1139 where, 1140 (IV)data->pos_min, 1141 (IV)data->pos_delta, 1142 (UV)data->flags 1143 ); 1144 1145 S_debug_show_study_flags(aTHX_ data->flags," [","]"); 1146 1147 Perl_re_printf( aTHX_ 1148 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", 1149 (IV)data->whilem_c, 1150 (IV)(data->last_closep ? *((data)->last_closep) : -1), 1151 is_inf ? "INF " : "" 1152 ); 1153 1154 if (data->last_found) { 1155 int i; 1156 Perl_re_printf(aTHX_ 1157 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf, 1158 SvPVX_const(data->last_found), 1159 (IV)data->last_end, 1160 (IV)data->last_start_min, 1161 (IV)data->last_start_max 1162 ); 1163 1164 for (i = 0; i < 2; i++) { 1165 Perl_re_printf(aTHX_ 1166 " %s%s: '%s' @ %" IVdf "/%" IVdf, 1167 data->cur_is_floating == i ? "*" : "", 1168 i ? "Float" : "Fixed", 1169 SvPVX_const(data->substrs[i].str), 1170 (IV)data->substrs[i].min_offset, 1171 (IV)data->substrs[i].max_offset 1172 ); 1173 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]"); 1174 } 1175 } 1176 1177 Perl_re_printf( aTHX_ "\n"); 1178 }); 1179 } 1180 1181 1182 static void 1183 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state, 1184 regnode *scan, U32 depth, U32 flags) 1185 { 1186 GET_RE_DEBUG_FLAGS_DECL; 1187 1188 DEBUG_OPTIMISE_r({ 1189 regnode *Next; 1190 1191 if (!scan) 1192 return; 1193 Next = regnext(scan); 1194 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); 1195 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)", 1196 depth, 1197 str, 1198 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv), 1199 Next ? (REG_NODE_NUM(Next)) : 0 ); 1200 S_debug_show_study_flags(aTHX_ flags," [ ","]"); 1201 Perl_re_printf( aTHX_ "\n"); 1202 }); 1203 } 1204 1205 1206 # define DEBUG_STUDYDATA(where, data, depth, is_inf) \ 1207 S_debug_studydata(aTHX_ where, data, depth, is_inf) 1208 1209 # define DEBUG_PEEP(str, scan, depth, flags) \ 1210 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags) 1211 1212 #else 1213 # define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP 1214 # define DEBUG_PEEP(str, scan, depth, flags) NOOP 1215 #endif 1216 1217 1218 /* ========================================================= 1219 * BEGIN edit_distance stuff. 1220 * 1221 * This calculates how many single character changes of any type are needed to 1222 * transform a string into another one. It is taken from version 3.1 of 1223 * 1224 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS 1225 */ 1226 1227 /* Our unsorted dictionary linked list. */ 1228 /* Note we use UVs, not chars. */ 1229 1230 struct dictionary{ 1231 UV key; 1232 UV value; 1233 struct dictionary* next; 1234 }; 1235 typedef struct dictionary item; 1236 1237 1238 PERL_STATIC_INLINE item* 1239 push(UV key, item* curr) 1240 { 1241 item* head; 1242 Newx(head, 1, item); 1243 head->key = key; 1244 head->value = 0; 1245 head->next = curr; 1246 return head; 1247 } 1248 1249 1250 PERL_STATIC_INLINE item* 1251 find(item* head, UV key) 1252 { 1253 item* iterator = head; 1254 while (iterator){ 1255 if (iterator->key == key){ 1256 return iterator; 1257 } 1258 iterator = iterator->next; 1259 } 1260 1261 return NULL; 1262 } 1263 1264 PERL_STATIC_INLINE item* 1265 uniquePush(item* head, UV key) 1266 { 1267 item* iterator = head; 1268 1269 while (iterator){ 1270 if (iterator->key == key) { 1271 return head; 1272 } 1273 iterator = iterator->next; 1274 } 1275 1276 return push(key, head); 1277 } 1278 1279 PERL_STATIC_INLINE void 1280 dict_free(item* head) 1281 { 1282 item* iterator = head; 1283 1284 while (iterator) { 1285 item* temp = iterator; 1286 iterator = iterator->next; 1287 Safefree(temp); 1288 } 1289 1290 head = NULL; 1291 } 1292 1293 /* End of Dictionary Stuff */ 1294 1295 /* All calculations/work are done here */ 1296 STATIC int 1297 S_edit_distance(const UV* src, 1298 const UV* tgt, 1299 const STRLEN x, /* length of src[] */ 1300 const STRLEN y, /* length of tgt[] */ 1301 const SSize_t maxDistance 1302 ) 1303 { 1304 item *head = NULL; 1305 UV swapCount, swapScore, targetCharCount, i, j; 1306 UV *scores; 1307 UV score_ceil = x + y; 1308 1309 PERL_ARGS_ASSERT_EDIT_DISTANCE; 1310 1311 /* intialize matrix start values */ 1312 Newx(scores, ( (x + 2) * (y + 2)), UV); 1313 scores[0] = score_ceil; 1314 scores[1 * (y + 2) + 0] = score_ceil; 1315 scores[0 * (y + 2) + 1] = score_ceil; 1316 scores[1 * (y + 2) + 1] = 0; 1317 head = uniquePush(uniquePush(head, src[0]), tgt[0]); 1318 1319 /* work loops */ 1320 /* i = src index */ 1321 /* j = tgt index */ 1322 for (i=1;i<=x;i++) { 1323 if (i < x) 1324 head = uniquePush(head, src[i]); 1325 scores[(i+1) * (y + 2) + 1] = i; 1326 scores[(i+1) * (y + 2) + 0] = score_ceil; 1327 swapCount = 0; 1328 1329 for (j=1;j<=y;j++) { 1330 if (i == 1) { 1331 if(j < y) 1332 head = uniquePush(head, tgt[j]); 1333 scores[1 * (y + 2) + (j + 1)] = j; 1334 scores[0 * (y + 2) + (j + 1)] = score_ceil; 1335 } 1336 1337 targetCharCount = find(head, tgt[j-1])->value; 1338 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount; 1339 1340 if (src[i-1] != tgt[j-1]){ 1341 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1)); 1342 } 1343 else { 1344 swapCount = j; 1345 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore); 1346 } 1347 } 1348 1349 find(head, src[i-1])->value = i; 1350 } 1351 1352 { 1353 IV score = scores[(x+1) * (y + 2) + (y + 1)]; 1354 dict_free(head); 1355 Safefree(scores); 1356 return (maxDistance != 0 && maxDistance < score)?(-1):score; 1357 } 1358 } 1359 1360 /* END of edit_distance() stuff 1361 * ========================================================= */ 1362 1363 /* is c a control character for which we have a mnemonic? */ 1364 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 1365 1366 STATIC const char * 1367 S_cntrl_to_mnemonic(const U8 c) 1368 { 1369 /* Returns the mnemonic string that represents character 'c', if one 1370 * exists; NULL otherwise. The only ones that exist for the purposes of 1371 * this routine are a few control characters */ 1372 1373 switch (c) { 1374 case '\a': return "\\a"; 1375 case '\b': return "\\b"; 1376 case ESC_NATIVE: return "\\e"; 1377 case '\f': return "\\f"; 1378 case '\n': return "\\n"; 1379 case '\r': return "\\r"; 1380 case '\t': return "\\t"; 1381 } 1382 1383 return NULL; 1384 } 1385 1386 /* Mark that we cannot extend a found fixed substring at this point. 1387 Update the longest found anchored substring or the longest found 1388 floating substrings if needed. */ 1389 1390 STATIC void 1391 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, 1392 SSize_t *minlenp, int is_inf) 1393 { 1394 const STRLEN l = CHR_SVLEN(data->last_found); 1395 SV * const longest_sv = data->substrs[data->cur_is_floating].str; 1396 const STRLEN old_l = CHR_SVLEN(longest_sv); 1397 GET_RE_DEBUG_FLAGS_DECL; 1398 1399 PERL_ARGS_ASSERT_SCAN_COMMIT; 1400 1401 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { 1402 const U8 i = data->cur_is_floating; 1403 SvSetMagicSV(longest_sv, data->last_found); 1404 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min; 1405 1406 if (!i) /* fixed */ 1407 data->substrs[0].max_offset = data->substrs[0].min_offset; 1408 else { /* float */ 1409 data->substrs[1].max_offset = (l 1410 ? data->last_start_max 1411 : (data->pos_delta > SSize_t_MAX - data->pos_min 1412 ? SSize_t_MAX 1413 : data->pos_min + data->pos_delta)); 1414 if (is_inf 1415 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX) 1416 data->substrs[1].max_offset = SSize_t_MAX; 1417 } 1418 1419 if (data->flags & SF_BEFORE_EOL) 1420 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL); 1421 else 1422 data->substrs[i].flags &= ~SF_BEFORE_EOL; 1423 data->substrs[i].minlenp = minlenp; 1424 data->substrs[i].lookbehind = 0; 1425 } 1426 1427 SvCUR_set(data->last_found, 0); 1428 { 1429 SV * const sv = data->last_found; 1430 if (SvUTF8(sv) && SvMAGICAL(sv)) { 1431 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); 1432 if (mg) 1433 mg->mg_len = 0; 1434 } 1435 } 1436 data->last_end = -1; 1437 data->flags &= ~SF_BEFORE_EOL; 1438 DEBUG_STUDYDATA("commit", data, 0, is_inf); 1439 } 1440 1441 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion 1442 * list that describes which code points it matches */ 1443 1444 STATIC void 1445 S_ssc_anything(pTHX_ regnode_ssc *ssc) 1446 { 1447 /* Set the SSC 'ssc' to match an empty string or any code point */ 1448 1449 PERL_ARGS_ASSERT_SSC_ANYTHING; 1450 1451 assert(is_ANYOF_SYNTHETIC(ssc)); 1452 1453 /* mortalize so won't leak */ 1454 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX)); 1455 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */ 1456 } 1457 1458 STATIC int 1459 S_ssc_is_anything(const regnode_ssc *ssc) 1460 { 1461 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code 1462 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys 1463 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted 1464 * in any way, so there's no point in using it */ 1465 1466 UV start, end; 1467 bool ret; 1468 1469 PERL_ARGS_ASSERT_SSC_IS_ANYTHING; 1470 1471 assert(is_ANYOF_SYNTHETIC(ssc)); 1472 1473 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) { 1474 return FALSE; 1475 } 1476 1477 /* See if the list consists solely of the range 0 - Infinity */ 1478 invlist_iterinit(ssc->invlist); 1479 ret = invlist_iternext(ssc->invlist, &start, &end) 1480 && start == 0 1481 && end == UV_MAX; 1482 1483 invlist_iterfinish(ssc->invlist); 1484 1485 if (ret) { 1486 return TRUE; 1487 } 1488 1489 /* If e.g., both \w and \W are set, matches everything */ 1490 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 1491 int i; 1492 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { 1493 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { 1494 return TRUE; 1495 } 1496 } 1497 } 1498 1499 return FALSE; 1500 } 1501 1502 STATIC void 1503 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) 1504 { 1505 /* Initializes the SSC 'ssc'. This includes setting it to match an empty 1506 * string, any code point, or any posix class under locale */ 1507 1508 PERL_ARGS_ASSERT_SSC_INIT; 1509 1510 Zero(ssc, 1, regnode_ssc); 1511 set_ANYOF_SYNTHETIC(ssc); 1512 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP); 1513 ssc_anything(ssc); 1514 1515 /* If any portion of the regex is to operate under locale rules that aren't 1516 * fully known at compile time, initialization includes it. The reason 1517 * this isn't done for all regexes is that the optimizer was written under 1518 * the assumption that locale was all-or-nothing. Given the complexity and 1519 * lack of documentation in the optimizer, and that there are inadequate 1520 * test cases for locale, many parts of it may not work properly, it is 1521 * safest to avoid locale unless necessary. */ 1522 if (RExC_contains_locale) { 1523 ANYOF_POSIXL_SETALL(ssc); 1524 } 1525 else { 1526 ANYOF_POSIXL_ZERO(ssc); 1527 } 1528 } 1529 1530 STATIC int 1531 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, 1532 const regnode_ssc *ssc) 1533 { 1534 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only 1535 * to the list of code points matched, and locale posix classes; hence does 1536 * not check its flags) */ 1537 1538 UV start, end; 1539 bool ret; 1540 1541 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; 1542 1543 assert(is_ANYOF_SYNTHETIC(ssc)); 1544 1545 invlist_iterinit(ssc->invlist); 1546 ret = invlist_iternext(ssc->invlist, &start, &end) 1547 && start == 0 1548 && end == UV_MAX; 1549 1550 invlist_iterfinish(ssc->invlist); 1551 1552 if (! ret) { 1553 return FALSE; 1554 } 1555 1556 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { 1557 return FALSE; 1558 } 1559 1560 return TRUE; 1561 } 1562 1563 #define INVLIST_INDEX 0 1564 #define ONLY_LOCALE_MATCHES_INDEX 1 1565 #define DEFERRED_USER_DEFINED_INDEX 2 1566 1567 STATIC SV* 1568 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, 1569 const regnode_charclass* const node) 1570 { 1571 /* Returns a mortal inversion list defining which code points are matched 1572 * by 'node', which is of type ANYOF. Handles complementing the result if 1573 * appropriate. If some code points aren't knowable at this time, the 1574 * returned list must, and will, contain every code point that is a 1575 * possibility. */ 1576 1577 dVAR; 1578 SV* invlist = NULL; 1579 SV* only_utf8_locale_invlist = NULL; 1580 unsigned int i; 1581 const U32 n = ARG(node); 1582 bool new_node_has_latin1 = FALSE; 1583 const U8 flags = OP(node) == ANYOFH ? 0 : ANYOF_FLAGS(node); 1584 1585 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; 1586 1587 /* Look at the data structure created by S_set_ANYOF_arg() */ 1588 if (n != ANYOF_ONLY_HAS_BITMAP) { 1589 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); 1590 AV * const av = MUTABLE_AV(SvRV(rv)); 1591 SV **const ary = AvARRAY(av); 1592 assert(RExC_rxi->data->what[n] == 's'); 1593 1594 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) { 1595 1596 /* Here there are things that won't be known until runtime -- we 1597 * have to assume it could be anything */ 1598 invlist = sv_2mortal(_new_invlist(1)); 1599 return _add_range_to_invlist(invlist, 0, UV_MAX); 1600 } 1601 else if (ary[INVLIST_INDEX]) { 1602 1603 /* Use the node's inversion list */ 1604 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL)); 1605 } 1606 1607 /* Get the code points valid only under UTF-8 locales */ 1608 if ( (flags & ANYOFL_FOLD) 1609 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) 1610 { 1611 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX]; 1612 } 1613 } 1614 1615 if (! invlist) { 1616 invlist = sv_2mortal(_new_invlist(0)); 1617 } 1618 1619 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS 1620 * code points, and an inversion list for the others, but if there are code 1621 * points that should match only conditionally on the target string being 1622 * UTF-8, those are placed in the inversion list, and not the bitmap. 1623 * Since there are circumstances under which they could match, they are 1624 * included in the SSC. But if the ANYOF node is to be inverted, we have 1625 * to exclude them here, so that when we invert below, the end result 1626 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We 1627 * have to do this here before we add the unconditionally matched code 1628 * points */ 1629 if (flags & ANYOF_INVERT) { 1630 _invlist_intersection_complement_2nd(invlist, 1631 PL_UpperLatin1, 1632 &invlist); 1633 } 1634 1635 /* Add in the points from the bit map */ 1636 if (OP(node) != ANYOFH) { 1637 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { 1638 if (ANYOF_BITMAP_TEST(node, i)) { 1639 unsigned int start = i++; 1640 1641 for (; i < NUM_ANYOF_CODE_POINTS 1642 && ANYOF_BITMAP_TEST(node, i); ++i) 1643 { 1644 /* empty */ 1645 } 1646 invlist = _add_range_to_invlist(invlist, start, i-1); 1647 new_node_has_latin1 = TRUE; 1648 } 1649 } 1650 } 1651 1652 /* If this can match all upper Latin1 code points, have to add them 1653 * as well. But don't add them if inverting, as when that gets done below, 1654 * it would exclude all these characters, including the ones it shouldn't 1655 * that were added just above */ 1656 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD 1657 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) 1658 { 1659 _invlist_union(invlist, PL_UpperLatin1, &invlist); 1660 } 1661 1662 /* Similarly for these */ 1663 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { 1664 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist); 1665 } 1666 1667 if (flags & ANYOF_INVERT) { 1668 _invlist_invert(invlist); 1669 } 1670 else if (flags & ANYOFL_FOLD) { 1671 if (new_node_has_latin1) { 1672 1673 /* Under /li, any 0-255 could fold to any other 0-255, depending on 1674 * the locale. We can skip this if there are no 0-255 at all. */ 1675 _invlist_union(invlist, PL_Latin1, &invlist); 1676 1677 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I); 1678 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 1679 } 1680 else { 1681 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) { 1682 invlist = add_cp_to_invlist(invlist, 'I'); 1683 } 1684 if (_invlist_contains_cp(invlist, 1685 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) 1686 { 1687 invlist = add_cp_to_invlist(invlist, 'i'); 1688 } 1689 } 1690 } 1691 1692 /* Similarly add the UTF-8 locale possible matches. These have to be 1693 * deferred until after the non-UTF-8 locale ones are taken care of just 1694 * above, or it leads to wrong results under ANYOF_INVERT */ 1695 if (only_utf8_locale_invlist) { 1696 _invlist_union_maybe_complement_2nd(invlist, 1697 only_utf8_locale_invlist, 1698 flags & ANYOF_INVERT, 1699 &invlist); 1700 } 1701 1702 return invlist; 1703 } 1704 1705 /* These two functions currently do the exact same thing */ 1706 #define ssc_init_zero ssc_init 1707 1708 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) 1709 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) 1710 1711 /* 'AND' a given class with another one. Can create false positives. 'ssc' 1712 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be 1713 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */ 1714 1715 STATIC void 1716 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, 1717 const regnode_charclass *and_with) 1718 { 1719 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either 1720 * another SSC or a regular ANYOF class. Can create false positives. */ 1721 1722 SV* anded_cp_list; 1723 U8 and_with_flags = (OP(and_with) == ANYOFH) ? 0 : ANYOF_FLAGS(and_with); 1724 U8 anded_flags; 1725 1726 PERL_ARGS_ASSERT_SSC_AND; 1727 1728 assert(is_ANYOF_SYNTHETIC(ssc)); 1729 1730 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract 1731 * the code point inversion list and just the relevant flags */ 1732 if (is_ANYOF_SYNTHETIC(and_with)) { 1733 anded_cp_list = ((regnode_ssc *)and_with)->invlist; 1734 anded_flags = and_with_flags; 1735 1736 /* XXX This is a kludge around what appears to be deficiencies in the 1737 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, 1738 * there are paths through the optimizer where it doesn't get weeded 1739 * out when it should. And if we don't make some extra provision for 1740 * it like the code just below, it doesn't get added when it should. 1741 * This solution is to add it only when AND'ing, which is here, and 1742 * only when what is being AND'ed is the pristine, original node 1743 * matching anything. Thus it is like adding it to ssc_anything() but 1744 * only when the result is to be AND'ed. Probably the same solution 1745 * could be adopted for the same problem we have with /l matching, 1746 * which is solved differently in S_ssc_init(), and that would lead to 1747 * fewer false positives than that solution has. But if this solution 1748 * creates bugs, the consequences are only that a warning isn't raised 1749 * that should be; while the consequences for having /l bugs is 1750 * incorrect matches */ 1751 if (ssc_is_anything((regnode_ssc *)and_with)) { 1752 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; 1753 } 1754 } 1755 else { 1756 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); 1757 if (OP(and_with) == ANYOFD) { 1758 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS; 1759 } 1760 else { 1761 anded_flags = and_with_flags 1762 &( ANYOF_COMMON_FLAGS 1763 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER 1764 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP); 1765 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) { 1766 anded_flags &= 1767 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; 1768 } 1769 } 1770 } 1771 1772 ANYOF_FLAGS(ssc) &= anded_flags; 1773 1774 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. 1775 * C2 is the list of code points in 'and-with'; P2, its posix classes. 1776 * 'and_with' may be inverted. When not inverted, we have the situation of 1777 * computing: 1778 * (C1 | P1) & (C2 | P2) 1779 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) 1780 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) 1781 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) 1782 * <= ((C1 & C2) | P1 | P2) 1783 * Alternatively, the last few steps could be: 1784 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) 1785 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) 1786 * <= (C1 | C2 | (P1 & P2)) 1787 * We favor the second approach if either P1 or P2 is non-empty. This is 1788 * because these components are a barrier to doing optimizations, as what 1789 * they match cannot be known until the moment of matching as they are 1790 * dependent on the current locale, 'AND"ing them likely will reduce or 1791 * eliminate them. 1792 * But we can do better if we know that C1,P1 are in their initial state (a 1793 * frequent occurrence), each matching everything: 1794 * (<everything>) & (C2 | P2) = C2 | P2 1795 * Similarly, if C2,P2 are in their initial state (again a frequent 1796 * occurrence), the result is a no-op 1797 * (C1 | P1) & (<everything>) = C1 | P1 1798 * 1799 * Inverted, we have 1800 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) 1801 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) 1802 * <= (C1 & ~C2) | (P1 & ~P2) 1803 * */ 1804 1805 if ((and_with_flags & ANYOF_INVERT) 1806 && ! is_ANYOF_SYNTHETIC(and_with)) 1807 { 1808 unsigned int i; 1809 1810 ssc_intersection(ssc, 1811 anded_cp_list, 1812 FALSE /* Has already been inverted */ 1813 ); 1814 1815 /* If either P1 or P2 is empty, the intersection will be also; can skip 1816 * the loop */ 1817 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) { 1818 ANYOF_POSIXL_ZERO(ssc); 1819 } 1820 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 1821 1822 /* Note that the Posix class component P from 'and_with' actually 1823 * looks like: 1824 * P = Pa | Pb | ... | Pn 1825 * where each component is one posix class, such as in [\w\s]. 1826 * Thus 1827 * ~P = ~(Pa | Pb | ... | Pn) 1828 * = ~Pa & ~Pb & ... & ~Pn 1829 * <= ~Pa | ~Pb | ... | ~Pn 1830 * The last is something we can easily calculate, but unfortunately 1831 * is likely to have many false positives. We could do better 1832 * in some (but certainly not all) instances if two classes in 1833 * P have known relationships. For example 1834 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: 1835 * So 1836 * :lower: & :print: = :lower: 1837 * And similarly for classes that must be disjoint. For example, 1838 * since \s and \w can have no elements in common based on rules in 1839 * the POSIX standard, 1840 * \w & ^\S = nothing 1841 * Unfortunately, some vendor locales do not meet the Posix 1842 * standard, in particular almost everything by Microsoft. 1843 * The loop below just changes e.g., \w into \W and vice versa */ 1844 1845 regnode_charclass_posixl temp; 1846 int add = 1; /* To calculate the index of the complement */ 1847 1848 Zero(&temp, 1, regnode_charclass_posixl); 1849 ANYOF_POSIXL_ZERO(&temp); 1850 for (i = 0; i < ANYOF_MAX; i++) { 1851 assert(i % 2 != 0 1852 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) 1853 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); 1854 1855 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { 1856 ANYOF_POSIXL_SET(&temp, i + add); 1857 } 1858 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ 1859 } 1860 ANYOF_POSIXL_AND(&temp, ssc); 1861 1862 } /* else ssc already has no posixes */ 1863 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC 1864 in its initial state */ 1865 else if (! is_ANYOF_SYNTHETIC(and_with) 1866 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) 1867 { 1868 /* But if 'ssc' is in its initial state, the result is just 'and_with'; 1869 * copy it over 'ssc' */ 1870 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { 1871 if (is_ANYOF_SYNTHETIC(and_with)) { 1872 StructCopy(and_with, ssc, regnode_ssc); 1873 } 1874 else { 1875 ssc->invlist = anded_cp_list; 1876 ANYOF_POSIXL_ZERO(ssc); 1877 if (and_with_flags & ANYOF_MATCHES_POSIXL) { 1878 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); 1879 } 1880 } 1881 } 1882 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) 1883 || (and_with_flags & ANYOF_MATCHES_POSIXL)) 1884 { 1885 /* One or the other of P1, P2 is non-empty. */ 1886 if (and_with_flags & ANYOF_MATCHES_POSIXL) { 1887 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); 1888 } 1889 ssc_union(ssc, anded_cp_list, FALSE); 1890 } 1891 else { /* P1 = P2 = empty */ 1892 ssc_intersection(ssc, anded_cp_list, FALSE); 1893 } 1894 } 1895 } 1896 1897 STATIC void 1898 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, 1899 const regnode_charclass *or_with) 1900 { 1901 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either 1902 * another SSC or a regular ANYOF class. Can create false positives if 1903 * 'or_with' is to be inverted. */ 1904 1905 SV* ored_cp_list; 1906 U8 ored_flags; 1907 U8 or_with_flags = (OP(or_with) == ANYOFH) ? 0 : ANYOF_FLAGS(or_with); 1908 1909 PERL_ARGS_ASSERT_SSC_OR; 1910 1911 assert(is_ANYOF_SYNTHETIC(ssc)); 1912 1913 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract 1914 * the code point inversion list and just the relevant flags */ 1915 if (is_ANYOF_SYNTHETIC(or_with)) { 1916 ored_cp_list = ((regnode_ssc*) or_with)->invlist; 1917 ored_flags = or_with_flags; 1918 } 1919 else { 1920 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); 1921 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS; 1922 if (OP(or_with) != ANYOFD) { 1923 ored_flags 1924 |= or_with_flags 1925 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER 1926 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP); 1927 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) { 1928 ored_flags |= 1929 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; 1930 } 1931 } 1932 } 1933 1934 ANYOF_FLAGS(ssc) |= ored_flags; 1935 1936 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. 1937 * C2 is the list of code points in 'or-with'; P2, its posix classes. 1938 * 'or_with' may be inverted. When not inverted, we have the simple 1939 * situation of computing: 1940 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) 1941 * If P1|P2 yields a situation with both a class and its complement are 1942 * set, like having both \w and \W, this matches all code points, and we 1943 * can delete these from the P component of the ssc going forward. XXX We 1944 * might be able to delete all the P components, but I (khw) am not certain 1945 * about this, and it is better to be safe. 1946 * 1947 * Inverted, we have 1948 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) 1949 * <= (C1 | P1) | ~C2 1950 * <= (C1 | ~C2) | P1 1951 * (which results in actually simpler code than the non-inverted case) 1952 * */ 1953 1954 if ((or_with_flags & ANYOF_INVERT) 1955 && ! is_ANYOF_SYNTHETIC(or_with)) 1956 { 1957 /* We ignore P2, leaving P1 going forward */ 1958 } /* else Not inverted */ 1959 else if (or_with_flags & ANYOF_MATCHES_POSIXL) { 1960 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); 1961 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 1962 unsigned int i; 1963 for (i = 0; i < ANYOF_MAX; i += 2) { 1964 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) 1965 { 1966 ssc_match_all_cp(ssc); 1967 ANYOF_POSIXL_CLEAR(ssc, i); 1968 ANYOF_POSIXL_CLEAR(ssc, i+1); 1969 } 1970 } 1971 } 1972 } 1973 1974 ssc_union(ssc, 1975 ored_cp_list, 1976 FALSE /* Already has been inverted */ 1977 ); 1978 } 1979 1980 PERL_STATIC_INLINE void 1981 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) 1982 { 1983 PERL_ARGS_ASSERT_SSC_UNION; 1984 1985 assert(is_ANYOF_SYNTHETIC(ssc)); 1986 1987 _invlist_union_maybe_complement_2nd(ssc->invlist, 1988 invlist, 1989 invert2nd, 1990 &ssc->invlist); 1991 } 1992 1993 PERL_STATIC_INLINE void 1994 S_ssc_intersection(pTHX_ regnode_ssc *ssc, 1995 SV* const invlist, 1996 const bool invert2nd) 1997 { 1998 PERL_ARGS_ASSERT_SSC_INTERSECTION; 1999 2000 assert(is_ANYOF_SYNTHETIC(ssc)); 2001 2002 _invlist_intersection_maybe_complement_2nd(ssc->invlist, 2003 invlist, 2004 invert2nd, 2005 &ssc->invlist); 2006 } 2007 2008 PERL_STATIC_INLINE void 2009 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) 2010 { 2011 PERL_ARGS_ASSERT_SSC_ADD_RANGE; 2012 2013 assert(is_ANYOF_SYNTHETIC(ssc)); 2014 2015 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); 2016 } 2017 2018 PERL_STATIC_INLINE void 2019 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) 2020 { 2021 /* AND just the single code point 'cp' into the SSC 'ssc' */ 2022 2023 SV* cp_list = _new_invlist(2); 2024 2025 PERL_ARGS_ASSERT_SSC_CP_AND; 2026 2027 assert(is_ANYOF_SYNTHETIC(ssc)); 2028 2029 cp_list = add_cp_to_invlist(cp_list, cp); 2030 ssc_intersection(ssc, cp_list, 2031 FALSE /* Not inverted */ 2032 ); 2033 SvREFCNT_dec_NN(cp_list); 2034 } 2035 2036 PERL_STATIC_INLINE void 2037 S_ssc_clear_locale(regnode_ssc *ssc) 2038 { 2039 /* Set the SSC 'ssc' to not match any locale things */ 2040 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; 2041 2042 assert(is_ANYOF_SYNTHETIC(ssc)); 2043 2044 ANYOF_POSIXL_ZERO(ssc); 2045 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; 2046 } 2047 2048 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C 2049 2050 STATIC bool 2051 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) 2052 { 2053 /* The synthetic start class is used to hopefully quickly winnow down 2054 * places where a pattern could start a match in the target string. If it 2055 * doesn't really narrow things down that much, there isn't much point to 2056 * having the overhead of using it. This function uses some very crude 2057 * heuristics to decide if to use the ssc or not. 2058 * 2059 * It returns TRUE if 'ssc' rules out more than half what it considers to 2060 * be the "likely" possible matches, but of course it doesn't know what the 2061 * actual things being matched are going to be; these are only guesses 2062 * 2063 * For /l matches, it assumes that the only likely matches are going to be 2064 * in the 0-255 range, uniformly distributed, so half of that is 127 2065 * For /a and /d matches, it assumes that the likely matches will be just 2066 * the ASCII range, so half of that is 63 2067 * For /u and there isn't anything matching above the Latin1 range, it 2068 * assumes that that is the only range likely to be matched, and uses 2069 * half that as the cut-off: 127. If anything matches above Latin1, 2070 * it assumes that all of Unicode could match (uniformly), except for 2071 * non-Unicode code points and things in the General Category "Other" 2072 * (unassigned, private use, surrogates, controls and formats). This 2073 * is a much large number. */ 2074 2075 U32 count = 0; /* Running total of number of code points matched by 2076 'ssc' */ 2077 UV start, end; /* Start and end points of current range in inversion 2078 XXX outdated. UTF-8 locales are common, what about invert? list */ 2079 const U32 max_code_points = (LOC) 2080 ? 256 2081 : (( ! UNI_SEMANTICS 2082 || invlist_highest(ssc->invlist) < 256) 2083 ? 128 2084 : NON_OTHER_COUNT); 2085 const U32 max_match = max_code_points / 2; 2086 2087 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT; 2088 2089 invlist_iterinit(ssc->invlist); 2090 while (invlist_iternext(ssc->invlist, &start, &end)) { 2091 if (start >= max_code_points) { 2092 break; 2093 } 2094 end = MIN(end, max_code_points - 1); 2095 count += end - start + 1; 2096 if (count >= max_match) { 2097 invlist_iterfinish(ssc->invlist); 2098 return FALSE; 2099 } 2100 } 2101 2102 return TRUE; 2103 } 2104 2105 2106 STATIC void 2107 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) 2108 { 2109 /* The inversion list in the SSC is marked mortal; now we need a more 2110 * permanent copy, which is stored the same way that is done in a regular 2111 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit 2112 * map */ 2113 2114 SV* invlist = invlist_clone(ssc->invlist, NULL); 2115 2116 PERL_ARGS_ASSERT_SSC_FINALIZE; 2117 2118 assert(is_ANYOF_SYNTHETIC(ssc)); 2119 2120 /* The code in this file assumes that all but these flags aren't relevant 2121 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared 2122 * by the time we reach here */ 2123 assert(! (ANYOF_FLAGS(ssc) 2124 & ~( ANYOF_COMMON_FLAGS 2125 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER 2126 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP))); 2127 2128 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); 2129 2130 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL); 2131 2132 /* Make sure is clone-safe */ 2133 ssc->invlist = NULL; 2134 2135 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 2136 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL; 2137 OP(ssc) = ANYOFPOSIXL; 2138 } 2139 else if (RExC_contains_locale) { 2140 OP(ssc) = ANYOFL; 2141 } 2142 2143 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); 2144 } 2145 2146 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] 2147 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) 2148 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) 2149 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ 2150 ? (TRIE_LIST_CUR( idx ) - 1) \ 2151 : 0 ) 2152 2153 2154 #ifdef DEBUGGING 2155 /* 2156 dump_trie(trie,widecharmap,revcharmap) 2157 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) 2158 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) 2159 2160 These routines dump out a trie in a somewhat readable format. 2161 The _interim_ variants are used for debugging the interim 2162 tables that are used to generate the final compressed 2163 representation which is what dump_trie expects. 2164 2165 Part of the reason for their existence is to provide a form 2166 of documentation as to how the different representations function. 2167 2168 */ 2169 2170 /* 2171 Dumps the final compressed table form of the trie to Perl_debug_log. 2172 Used for debugging make_trie(). 2173 */ 2174 2175 STATIC void 2176 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, 2177 AV *revcharmap, U32 depth) 2178 { 2179 U32 state; 2180 SV *sv=sv_newmortal(); 2181 int colwidth= widecharmap ? 6 : 4; 2182 U16 word; 2183 GET_RE_DEBUG_FLAGS_DECL; 2184 2185 PERL_ARGS_ASSERT_DUMP_TRIE; 2186 2187 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ", 2188 depth+1, "Match","Base","Ofs" ); 2189 2190 for( state = 0 ; state < trie->uniquecharcount ; state++ ) { 2191 SV ** const tmp = av_fetch( revcharmap, state, 0); 2192 if ( tmp ) { 2193 Perl_re_printf( aTHX_ "%*s", 2194 colwidth, 2195 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 2196 PL_colors[0], PL_colors[1], 2197 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 2198 PERL_PV_ESCAPE_FIRSTCHAR 2199 ) 2200 ); 2201 } 2202 } 2203 Perl_re_printf( aTHX_ "\n"); 2204 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1); 2205 2206 for( state = 0 ; state < trie->uniquecharcount ; state++ ) 2207 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------"); 2208 Perl_re_printf( aTHX_ "\n"); 2209 2210 for( state = 1 ; state < trie->statecount ; state++ ) { 2211 const U32 base = trie->states[ state ].trans.base; 2212 2213 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state); 2214 2215 if ( trie->states[ state ].wordnum ) { 2216 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum ); 2217 } else { 2218 Perl_re_printf( aTHX_ "%6s", "" ); 2219 } 2220 2221 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base ); 2222 2223 if ( base ) { 2224 U32 ofs = 0; 2225 2226 while( ( base + ofs < trie->uniquecharcount ) || 2227 ( base + ofs - trie->uniquecharcount < trie->lasttrans 2228 && trie->trans[ base + ofs - trie->uniquecharcount ].check 2229 != state)) 2230 ofs++; 2231 2232 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs); 2233 2234 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 2235 if ( ( base + ofs >= trie->uniquecharcount ) 2236 && ( base + ofs - trie->uniquecharcount 2237 < trie->lasttrans ) 2238 && trie->trans[ base + ofs 2239 - trie->uniquecharcount ].check == state ) 2240 { 2241 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, 2242 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next 2243 ); 2244 } else { 2245 Perl_re_printf( aTHX_ "%*s", colwidth," ." ); 2246 } 2247 } 2248 2249 Perl_re_printf( aTHX_ "]"); 2250 2251 } 2252 Perl_re_printf( aTHX_ "\n" ); 2253 } 2254 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=", 2255 depth); 2256 for (word=1; word <= trie->wordcount; word++) { 2257 Perl_re_printf( aTHX_ " %d:(%d,%d)", 2258 (int)word, (int)(trie->wordinfo[word].prev), 2259 (int)(trie->wordinfo[word].len)); 2260 } 2261 Perl_re_printf( aTHX_ "\n" ); 2262 } 2263 /* 2264 Dumps a fully constructed but uncompressed trie in list form. 2265 List tries normally only are used for construction when the number of 2266 possible chars (trie->uniquecharcount) is very high. 2267 Used for debugging make_trie(). 2268 */ 2269 STATIC void 2270 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, 2271 HV *widecharmap, AV *revcharmap, U32 next_alloc, 2272 U32 depth) 2273 { 2274 U32 state; 2275 SV *sv=sv_newmortal(); 2276 int colwidth= widecharmap ? 6 : 4; 2277 GET_RE_DEBUG_FLAGS_DECL; 2278 2279 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; 2280 2281 /* print out the table precompression. */ 2282 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n", 2283 depth+1 ); 2284 Perl_re_indentf( aTHX_ "%s", 2285 depth+1, "------:-----+-----------------\n" ); 2286 2287 for( state=1 ; state < next_alloc ; state ++ ) { 2288 U16 charid; 2289 2290 Perl_re_indentf( aTHX_ " %4" UVXf " :", 2291 depth+1, (UV)state ); 2292 if ( ! trie->states[ state ].wordnum ) { 2293 Perl_re_printf( aTHX_ "%5s| ",""); 2294 } else { 2295 Perl_re_printf( aTHX_ "W%4x| ", 2296 trie->states[ state ].wordnum 2297 ); 2298 } 2299 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { 2300 SV ** const tmp = av_fetch( revcharmap, 2301 TRIE_LIST_ITEM(state, charid).forid, 0); 2302 if ( tmp ) { 2303 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ", 2304 colwidth, 2305 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 2306 colwidth, 2307 PL_colors[0], PL_colors[1], 2308 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) 2309 | PERL_PV_ESCAPE_FIRSTCHAR 2310 ) , 2311 TRIE_LIST_ITEM(state, charid).forid, 2312 (UV)TRIE_LIST_ITEM(state, charid).newstate 2313 ); 2314 if (!(charid % 10)) 2315 Perl_re_printf( aTHX_ "\n%*s| ", 2316 (int)((depth * 2) + 14), ""); 2317 } 2318 } 2319 Perl_re_printf( aTHX_ "\n"); 2320 } 2321 } 2322 2323 /* 2324 Dumps a fully constructed but uncompressed trie in table form. 2325 This is the normal DFA style state transition table, with a few 2326 twists to facilitate compression later. 2327 Used for debugging make_trie(). 2328 */ 2329 STATIC void 2330 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, 2331 HV *widecharmap, AV *revcharmap, U32 next_alloc, 2332 U32 depth) 2333 { 2334 U32 state; 2335 U16 charid; 2336 SV *sv=sv_newmortal(); 2337 int colwidth= widecharmap ? 6 : 4; 2338 GET_RE_DEBUG_FLAGS_DECL; 2339 2340 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; 2341 2342 /* 2343 print out the table precompression so that we can do a visual check 2344 that they are identical. 2345 */ 2346 2347 Perl_re_indentf( aTHX_ "Char : ", depth+1 ); 2348 2349 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 2350 SV ** const tmp = av_fetch( revcharmap, charid, 0); 2351 if ( tmp ) { 2352 Perl_re_printf( aTHX_ "%*s", 2353 colwidth, 2354 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 2355 PL_colors[0], PL_colors[1], 2356 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 2357 PERL_PV_ESCAPE_FIRSTCHAR 2358 ) 2359 ); 2360 } 2361 } 2362 2363 Perl_re_printf( aTHX_ "\n"); 2364 Perl_re_indentf( aTHX_ "State+-", depth+1 ); 2365 2366 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { 2367 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------"); 2368 } 2369 2370 Perl_re_printf( aTHX_ "\n" ); 2371 2372 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { 2373 2374 Perl_re_indentf( aTHX_ "%4" UVXf " : ", 2375 depth+1, 2376 (UV)TRIE_NODENUM( state ) ); 2377 2378 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 2379 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); 2380 if (v) 2381 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v ); 2382 else 2383 Perl_re_printf( aTHX_ "%*s", colwidth, "." ); 2384 } 2385 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { 2386 Perl_re_printf( aTHX_ " (%4" UVXf ")\n", 2387 (UV)trie->trans[ state ].check ); 2388 } else { 2389 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n", 2390 (UV)trie->trans[ state ].check, 2391 trie->states[ TRIE_NODENUM( state ) ].wordnum ); 2392 } 2393 } 2394 } 2395 2396 #endif 2397 2398 2399 /* make_trie(startbranch,first,last,tail,word_count,flags,depth) 2400 startbranch: the first branch in the whole branch sequence 2401 first : start branch of sequence of branch-exact nodes. 2402 May be the same as startbranch 2403 last : Thing following the last branch. 2404 May be the same as tail. 2405 tail : item following the branch sequence 2406 count : words in the sequence 2407 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/ 2408 depth : indent depth 2409 2410 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. 2411 2412 A trie is an N'ary tree where the branches are determined by digital 2413 decomposition of the key. IE, at the root node you look up the 1st character and 2414 follow that branch repeat until you find the end of the branches. Nodes can be 2415 marked as "accepting" meaning they represent a complete word. Eg: 2416 2417 /he|she|his|hers/ 2418 2419 would convert into the following structure. Numbers represent states, letters 2420 following numbers represent valid transitions on the letter from that state, if 2421 the number is in square brackets it represents an accepting state, otherwise it 2422 will be in parenthesis. 2423 2424 +-h->+-e->[3]-+-r->(8)-+-s->[9] 2425 | | 2426 | (2) 2427 | | 2428 (1) +-i->(6)-+-s->[7] 2429 | 2430 +-s->(3)-+-h->(4)-+-e->[5] 2431 2432 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) 2433 2434 This shows that when matching against the string 'hers' we will begin at state 1 2435 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, 2436 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which 2437 is also accepting. Thus we know that we can match both 'he' and 'hers' with a 2438 single traverse. We store a mapping from accepting to state to which word was 2439 matched, and then when we have multiple possibilities we try to complete the 2440 rest of the regex in the order in which they occurred in the alternation. 2441 2442 The only prior NFA like behaviour that would be changed by the TRIE support is 2443 the silent ignoring of duplicate alternations which are of the form: 2444 2445 / (DUPE|DUPE) X? (?{ ... }) Y /x 2446 2447 Thus EVAL blocks following a trie may be called a different number of times with 2448 and without the optimisation. With the optimisations dupes will be silently 2449 ignored. This inconsistent behaviour of EVAL type nodes is well established as 2450 the following demonstrates: 2451 2452 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ 2453 2454 which prints out 'word' three times, but 2455 2456 'words'=~/(word|word|word)(?{ print $1 })S/ 2457 2458 which doesnt print it out at all. This is due to other optimisations kicking in. 2459 2460 Example of what happens on a structural level: 2461 2462 The regexp /(ac|ad|ab)+/ will produce the following debug output: 2463 2464 1: CURLYM[1] {1,32767}(18) 2465 5: BRANCH(8) 2466 6: EXACT <ac>(16) 2467 8: BRANCH(11) 2468 9: EXACT <ad>(16) 2469 11: BRANCH(14) 2470 12: EXACT <ab>(16) 2471 16: SUCCEED(0) 2472 17: NOTHING(18) 2473 18: END(0) 2474 2475 This would be optimizable with startbranch=5, first=5, last=16, tail=16 2476 and should turn into: 2477 2478 1: CURLYM[1] {1,32767}(18) 2479 5: TRIE(16) 2480 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] 2481 <ac> 2482 <ad> 2483 <ab> 2484 16: SUCCEED(0) 2485 17: NOTHING(18) 2486 18: END(0) 2487 2488 Cases where tail != last would be like /(?foo|bar)baz/: 2489 2490 1: BRANCH(4) 2491 2: EXACT <foo>(8) 2492 4: BRANCH(7) 2493 5: EXACT <bar>(8) 2494 7: TAIL(8) 2495 8: EXACT <baz>(10) 2496 10: END(0) 2497 2498 which would be optimizable with startbranch=1, first=1, last=7, tail=8 2499 and would end up looking like: 2500 2501 1: TRIE(8) 2502 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] 2503 <foo> 2504 <bar> 2505 7: TAIL(8) 2506 8: EXACT <baz>(10) 2507 10: END(0) 2508 2509 d = uvchr_to_utf8_flags(d, uv, 0); 2510 2511 is the recommended Unicode-aware way of saying 2512 2513 *(d++) = uv; 2514 */ 2515 2516 #define TRIE_STORE_REVCHAR(val) \ 2517 STMT_START { \ 2518 if (UTF) { \ 2519 SV *zlopp = newSV(UTF8_MAXBYTES); \ 2520 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ 2521 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ 2522 SvCUR_set(zlopp, kapow - flrbbbbb); \ 2523 SvPOK_on(zlopp); \ 2524 SvUTF8_on(zlopp); \ 2525 av_push(revcharmap, zlopp); \ 2526 } else { \ 2527 char ooooff = (char)val; \ 2528 av_push(revcharmap, newSVpvn(&ooooff, 1)); \ 2529 } \ 2530 } STMT_END 2531 2532 /* This gets the next character from the input, folding it if not already 2533 * folded. */ 2534 #define TRIE_READ_CHAR STMT_START { \ 2535 wordlen++; \ 2536 if ( UTF ) { \ 2537 /* if it is UTF then it is either already folded, or does not need \ 2538 * folding */ \ 2539 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ 2540 } \ 2541 else if (folder == PL_fold_latin1) { \ 2542 /* This folder implies Unicode rules, which in the range expressible \ 2543 * by not UTF is the lower case, with the two exceptions, one of \ 2544 * which should have been taken care of before calling this */ \ 2545 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ 2546 uvc = toLOWER_L1(*uc); \ 2547 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ 2548 len = 1; \ 2549 } else { \ 2550 /* raw data, will be folded later if needed */ \ 2551 uvc = (U32)*uc; \ 2552 len = 1; \ 2553 } \ 2554 } STMT_END 2555 2556 2557 2558 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ 2559 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ 2560 U32 ging = TRIE_LIST_LEN( state ) * 2; \ 2561 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ 2562 TRIE_LIST_LEN( state ) = ging; \ 2563 } \ 2564 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ 2565 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ 2566 TRIE_LIST_CUR( state )++; \ 2567 } STMT_END 2568 2569 #define TRIE_LIST_NEW(state) STMT_START { \ 2570 Newx( trie->states[ state ].trans.list, \ 2571 4, reg_trie_trans_le ); \ 2572 TRIE_LIST_CUR( state ) = 1; \ 2573 TRIE_LIST_LEN( state ) = 4; \ 2574 } STMT_END 2575 2576 #define TRIE_HANDLE_WORD(state) STMT_START { \ 2577 U16 dupe= trie->states[ state ].wordnum; \ 2578 regnode * const noper_next = regnext( noper ); \ 2579 \ 2580 DEBUG_r({ \ 2581 /* store the word for dumping */ \ 2582 SV* tmp; \ 2583 if (OP(noper) != NOTHING) \ 2584 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ 2585 else \ 2586 tmp = newSVpvn_utf8( "", 0, UTF ); \ 2587 av_push( trie_words, tmp ); \ 2588 }); \ 2589 \ 2590 curword++; \ 2591 trie->wordinfo[curword].prev = 0; \ 2592 trie->wordinfo[curword].len = wordlen; \ 2593 trie->wordinfo[curword].accept = state; \ 2594 \ 2595 if ( noper_next < tail ) { \ 2596 if (!trie->jump) \ 2597 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ 2598 sizeof(U16) ); \ 2599 trie->jump[curword] = (U16)(noper_next - convert); \ 2600 if (!jumper) \ 2601 jumper = noper_next; \ 2602 if (!nextbranch) \ 2603 nextbranch= regnext(cur); \ 2604 } \ 2605 \ 2606 if ( dupe ) { \ 2607 /* It's a dupe. Pre-insert into the wordinfo[].prev */\ 2608 /* chain, so that when the bits of chain are later */\ 2609 /* linked together, the dups appear in the chain */\ 2610 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ 2611 trie->wordinfo[dupe].prev = curword; \ 2612 } else { \ 2613 /* we haven't inserted this word yet. */ \ 2614 trie->states[ state ].wordnum = curword; \ 2615 } \ 2616 } STMT_END 2617 2618 2619 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ 2620 ( ( base + charid >= ucharcount \ 2621 && base + charid < ubound \ 2622 && state == trie->trans[ base - ucharcount + charid ].check \ 2623 && trie->trans[ base - ucharcount + charid ].next ) \ 2624 ? trie->trans[ base - ucharcount + charid ].next \ 2625 : ( state==1 ? special : 0 ) \ 2626 ) 2627 2628 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \ 2629 STMT_START { \ 2630 TRIE_BITMAP_SET(trie, uvc); \ 2631 /* store the folded codepoint */ \ 2632 if ( folder ) \ 2633 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \ 2634 \ 2635 if ( !UTF ) { \ 2636 /* store first byte of utf8 representation of */ \ 2637 /* variant codepoints */ \ 2638 if (! UVCHR_IS_INVARIANT(uvc)) { \ 2639 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \ 2640 } \ 2641 } \ 2642 } STMT_END 2643 #define MADE_TRIE 1 2644 #define MADE_JUMP_TRIE 2 2645 #define MADE_EXACT_TRIE 4 2646 2647 STATIC I32 2648 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, 2649 regnode *first, regnode *last, regnode *tail, 2650 U32 word_count, U32 flags, U32 depth) 2651 { 2652 /* first pass, loop through and scan words */ 2653 reg_trie_data *trie; 2654 HV *widecharmap = NULL; 2655 AV *revcharmap = newAV(); 2656 regnode *cur; 2657 STRLEN len = 0; 2658 UV uvc = 0; 2659 U16 curword = 0; 2660 U32 next_alloc = 0; 2661 regnode *jumper = NULL; 2662 regnode *nextbranch = NULL; 2663 regnode *convert = NULL; 2664 U32 *prev_states; /* temp array mapping each state to previous one */ 2665 /* we just use folder as a flag in utf8 */ 2666 const U8 * folder = NULL; 2667 2668 /* in the below add_data call we are storing either 'tu' or 'tuaa' 2669 * which stands for one trie structure, one hash, optionally followed 2670 * by two arrays */ 2671 #ifdef DEBUGGING 2672 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa")); 2673 AV *trie_words = NULL; 2674 /* along with revcharmap, this only used during construction but both are 2675 * useful during debugging so we store them in the struct when debugging. 2676 */ 2677 #else 2678 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); 2679 STRLEN trie_charcount=0; 2680 #endif 2681 SV *re_trie_maxbuff; 2682 GET_RE_DEBUG_FLAGS_DECL; 2683 2684 PERL_ARGS_ASSERT_MAKE_TRIE; 2685 #ifndef DEBUGGING 2686 PERL_UNUSED_ARG(depth); 2687 #endif 2688 2689 switch (flags) { 2690 case EXACT: case EXACT_ONLY8: case EXACTL: break; 2691 case EXACTFAA: 2692 case EXACTFUP: 2693 case EXACTFU: 2694 case EXACTFLU8: folder = PL_fold_latin1; break; 2695 case EXACTF: folder = PL_fold; break; 2696 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); 2697 } 2698 2699 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); 2700 trie->refcount = 1; 2701 trie->startstate = 1; 2702 trie->wordcount = word_count; 2703 RExC_rxi->data->data[ data_slot ] = (void*)trie; 2704 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); 2705 if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL) 2706 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); 2707 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( 2708 trie->wordcount+1, sizeof(reg_trie_wordinfo)); 2709 2710 DEBUG_r({ 2711 trie_words = newAV(); 2712 }); 2713 2714 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD); 2715 assert(re_trie_maxbuff); 2716 if (!SvIOK(re_trie_maxbuff)) { 2717 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 2718 } 2719 DEBUG_TRIE_COMPILE_r({ 2720 Perl_re_indentf( aTHX_ 2721 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", 2722 depth+1, 2723 REG_NODE_NUM(startbranch), REG_NODE_NUM(first), 2724 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); 2725 }); 2726 2727 /* Find the node we are going to overwrite */ 2728 if ( first == startbranch && OP( last ) != BRANCH ) { 2729 /* whole branch chain */ 2730 convert = first; 2731 } else { 2732 /* branch sub-chain */ 2733 convert = NEXTOPER( first ); 2734 } 2735 2736 /* -- First loop and Setup -- 2737 2738 We first traverse the branches and scan each word to determine if it 2739 contains widechars, and how many unique chars there are, this is 2740 important as we have to build a table with at least as many columns as we 2741 have unique chars. 2742 2743 We use an array of integers to represent the character codes 0..255 2744 (trie->charmap) and we use a an HV* to store Unicode characters. We use 2745 the native representation of the character value as the key and IV's for 2746 the coded index. 2747 2748 *TODO* If we keep track of how many times each character is used we can 2749 remap the columns so that the table compression later on is more 2750 efficient in terms of memory by ensuring the most common value is in the 2751 middle and the least common are on the outside. IMO this would be better 2752 than a most to least common mapping as theres a decent chance the most 2753 common letter will share a node with the least common, meaning the node 2754 will not be compressible. With a middle is most common approach the worst 2755 case is when we have the least common nodes twice. 2756 2757 */ 2758 2759 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 2760 regnode *noper = NEXTOPER( cur ); 2761 const U8 *uc; 2762 const U8 *e; 2763 int foldlen = 0; 2764 U32 wordlen = 0; /* required init */ 2765 STRLEN minchars = 0; 2766 STRLEN maxchars = 0; 2767 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the 2768 bitmap?*/ 2769 2770 if (OP(noper) == NOTHING) { 2771 /* skip past a NOTHING at the start of an alternation 2772 * eg, /(?:)a|(?:b)/ should be the same as /a|b/ 2773 * 2774 * If the next node is not something we are supposed to process 2775 * we will just ignore it due to the condition guarding the 2776 * next block. 2777 */ 2778 2779 regnode *noper_next= regnext(noper); 2780 if (noper_next < tail) 2781 noper= noper_next; 2782 } 2783 2784 if ( noper < tail 2785 && ( OP(noper) == flags 2786 || (flags == EXACT && OP(noper) == EXACT_ONLY8) 2787 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8 2788 || OP(noper) == EXACTFUP)))) 2789 { 2790 uc= (U8*)STRING(noper); 2791 e= uc + STR_LEN(noper); 2792 } else { 2793 trie->minlen= 0; 2794 continue; 2795 } 2796 2797 2798 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ 2799 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte 2800 regardless of encoding */ 2801 if (OP( noper ) == EXACTFUP) { 2802 /* false positives are ok, so just set this */ 2803 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); 2804 } 2805 } 2806 2807 for ( ; uc < e ; uc += len ) { /* Look at each char in the current 2808 branch */ 2809 TRIE_CHARCOUNT(trie)++; 2810 TRIE_READ_CHAR; 2811 2812 /* TRIE_READ_CHAR returns the current character, or its fold if /i 2813 * is in effect. Under /i, this character can match itself, or 2814 * anything that folds to it. If not under /i, it can match just 2815 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN 2816 * all fold to k, and all are single characters. But some folds 2817 * expand to more than one character, so for example LATIN SMALL 2818 * LIGATURE FFI folds to the three character sequence 'ffi'. If 2819 * the string beginning at 'uc' is 'ffi', it could be matched by 2820 * three characters, or just by the one ligature character. (It 2821 * could also be matched by two characters: LATIN SMALL LIGATURE FF 2822 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). 2823 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also 2824 * match.) The trie needs to know the minimum and maximum number 2825 * of characters that could match so that it can use size alone to 2826 * quickly reject many match attempts. The max is simple: it is 2827 * the number of folded characters in this branch (since a fold is 2828 * never shorter than what folds to it. */ 2829 2830 maxchars++; 2831 2832 /* And the min is equal to the max if not under /i (indicated by 2833 * 'folder' being NULL), or there are no multi-character folds. If 2834 * there is a multi-character fold, the min is incremented just 2835 * once, for the character that folds to the sequence. Each 2836 * character in the sequence needs to be added to the list below of 2837 * characters in the trie, but we count only the first towards the 2838 * min number of characters needed. This is done through the 2839 * variable 'foldlen', which is returned by the macros that look 2840 * for these sequences as the number of bytes the sequence 2841 * occupies. Each time through the loop, we decrement 'foldlen' by 2842 * how many bytes the current char occupies. Only when it reaches 2843 * 0 do we increment 'minchars' or look for another multi-character 2844 * sequence. */ 2845 if (folder == NULL) { 2846 minchars++; 2847 } 2848 else if (foldlen > 0) { 2849 foldlen -= (UTF) ? UTF8SKIP(uc) : 1; 2850 } 2851 else { 2852 minchars++; 2853 2854 /* See if *uc is the beginning of a multi-character fold. If 2855 * so, we decrement the length remaining to look at, to account 2856 * for the current character this iteration. (We can use 'uc' 2857 * instead of the fold returned by TRIE_READ_CHAR because for 2858 * non-UTF, the latin1_safe macro is smart enough to account 2859 * for all the unfolded characters, and because for UTF, the 2860 * string will already have been folded earlier in the 2861 * compilation process */ 2862 if (UTF) { 2863 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { 2864 foldlen -= UTF8SKIP(uc); 2865 } 2866 } 2867 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { 2868 foldlen--; 2869 } 2870 } 2871 2872 /* The current character (and any potential folds) should be added 2873 * to the possible matching characters for this position in this 2874 * branch */ 2875 if ( uvc < 256 ) { 2876 if ( folder ) { 2877 U8 folded= folder[ (U8) uvc ]; 2878 if ( !trie->charmap[ folded ] ) { 2879 trie->charmap[ folded ]=( ++trie->uniquecharcount ); 2880 TRIE_STORE_REVCHAR( folded ); 2881 } 2882 } 2883 if ( !trie->charmap[ uvc ] ) { 2884 trie->charmap[ uvc ]=( ++trie->uniquecharcount ); 2885 TRIE_STORE_REVCHAR( uvc ); 2886 } 2887 if ( set_bit ) { 2888 /* store the codepoint in the bitmap, and its folded 2889 * equivalent. */ 2890 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder); 2891 set_bit = 0; /* We've done our bit :-) */ 2892 } 2893 } else { 2894 2895 /* XXX We could come up with the list of code points that fold 2896 * to this using PL_utf8_foldclosures, except not for 2897 * multi-char folds, as there may be multiple combinations 2898 * there that could work, which needs to wait until runtime to 2899 * resolve (The comment about LIGATURE FFI above is such an 2900 * example */ 2901 2902 SV** svpp; 2903 if ( !widecharmap ) 2904 widecharmap = newHV(); 2905 2906 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); 2907 2908 if ( !svpp ) 2909 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc ); 2910 2911 if ( !SvTRUE( *svpp ) ) { 2912 sv_setiv( *svpp, ++trie->uniquecharcount ); 2913 TRIE_STORE_REVCHAR(uvc); 2914 } 2915 } 2916 } /* end loop through characters in this branch of the trie */ 2917 2918 /* We take the min and max for this branch and combine to find the min 2919 * and max for all branches processed so far */ 2920 if( cur == first ) { 2921 trie->minlen = minchars; 2922 trie->maxlen = maxchars; 2923 } else if (minchars < trie->minlen) { 2924 trie->minlen = minchars; 2925 } else if (maxchars > trie->maxlen) { 2926 trie->maxlen = maxchars; 2927 } 2928 } /* end first pass */ 2929 DEBUG_TRIE_COMPILE_r( 2930 Perl_re_indentf( aTHX_ 2931 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", 2932 depth+1, 2933 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, 2934 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, 2935 (int)trie->minlen, (int)trie->maxlen ) 2936 ); 2937 2938 /* 2939 We now know what we are dealing with in terms of unique chars and 2940 string sizes so we can calculate how much memory a naive 2941 representation using a flat table will take. If it's over a reasonable 2942 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory 2943 conservative but potentially much slower representation using an array 2944 of lists. 2945 2946 At the end we convert both representations into the same compressed 2947 form that will be used in regexec.c for matching with. The latter 2948 is a form that cannot be used to construct with but has memory 2949 properties similar to the list form and access properties similar 2950 to the table form making it both suitable for fast searches and 2951 small enough that its feasable to store for the duration of a program. 2952 2953 See the comment in the code where the compressed table is produced 2954 inplace from the flat tabe representation for an explanation of how 2955 the compression works. 2956 2957 */ 2958 2959 2960 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); 2961 prev_states[1] = 0; 2962 2963 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) 2964 > SvIV(re_trie_maxbuff) ) 2965 { 2966 /* 2967 Second Pass -- Array Of Lists Representation 2968 2969 Each state will be represented by a list of charid:state records 2970 (reg_trie_trans_le) the first such element holds the CUR and LEN 2971 points of the allocated array. (See defines above). 2972 2973 We build the initial structure using the lists, and then convert 2974 it into the compressed table form which allows faster lookups 2975 (but cant be modified once converted). 2976 */ 2977 2978 STRLEN transcount = 1; 2979 2980 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n", 2981 depth+1)); 2982 2983 trie->states = (reg_trie_state *) 2984 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 2985 sizeof(reg_trie_state) ); 2986 TRIE_LIST_NEW(1); 2987 next_alloc = 2; 2988 2989 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 2990 2991 regnode *noper = NEXTOPER( cur ); 2992 U32 state = 1; /* required init */ 2993 U16 charid = 0; /* sanity init */ 2994 U32 wordlen = 0; /* required init */ 2995 2996 if (OP(noper) == NOTHING) { 2997 regnode *noper_next= regnext(noper); 2998 if (noper_next < tail) 2999 noper= noper_next; 3000 /* we will undo this assignment if noper does not 3001 * point at a trieable type in the else clause of 3002 * the following statement. */ 3003 } 3004 3005 if ( noper < tail 3006 && ( OP(noper) == flags 3007 || (flags == EXACT && OP(noper) == EXACT_ONLY8) 3008 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8 3009 || OP(noper) == EXACTFUP)))) 3010 { 3011 const U8 *uc= (U8*)STRING(noper); 3012 const U8 *e= uc + STR_LEN(noper); 3013 3014 for ( ; uc < e ; uc += len ) { 3015 3016 TRIE_READ_CHAR; 3017 3018 if ( uvc < 256 ) { 3019 charid = trie->charmap[ uvc ]; 3020 } else { 3021 SV** const svpp = hv_fetch( widecharmap, 3022 (char*)&uvc, 3023 sizeof( UV ), 3024 0); 3025 if ( !svpp ) { 3026 charid = 0; 3027 } else { 3028 charid=(U16)SvIV( *svpp ); 3029 } 3030 } 3031 /* charid is now 0 if we dont know the char read, or 3032 * nonzero if we do */ 3033 if ( charid ) { 3034 3035 U16 check; 3036 U32 newstate = 0; 3037 3038 charid--; 3039 if ( !trie->states[ state ].trans.list ) { 3040 TRIE_LIST_NEW( state ); 3041 } 3042 for ( check = 1; 3043 check <= TRIE_LIST_USED( state ); 3044 check++ ) 3045 { 3046 if ( TRIE_LIST_ITEM( state, check ).forid 3047 == charid ) 3048 { 3049 newstate = TRIE_LIST_ITEM( state, check ).newstate; 3050 break; 3051 } 3052 } 3053 if ( ! newstate ) { 3054 newstate = next_alloc++; 3055 prev_states[newstate] = state; 3056 TRIE_LIST_PUSH( state, charid, newstate ); 3057 transcount++; 3058 } 3059 state = newstate; 3060 } else { 3061 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); 3062 } 3063 } 3064 } else { 3065 /* If we end up here it is because we skipped past a NOTHING, but did not end up 3066 * on a trieable type. So we need to reset noper back to point at the first regop 3067 * in the branch before we call TRIE_HANDLE_WORD() 3068 */ 3069 noper= NEXTOPER(cur); 3070 } 3071 TRIE_HANDLE_WORD(state); 3072 3073 } /* end second pass */ 3074 3075 /* next alloc is the NEXT state to be allocated */ 3076 trie->statecount = next_alloc; 3077 trie->states = (reg_trie_state *) 3078 PerlMemShared_realloc( trie->states, 3079 next_alloc 3080 * sizeof(reg_trie_state) ); 3081 3082 /* and now dump it out before we compress it */ 3083 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, 3084 revcharmap, next_alloc, 3085 depth+1) 3086 ); 3087 3088 trie->trans = (reg_trie_trans *) 3089 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); 3090 { 3091 U32 state; 3092 U32 tp = 0; 3093 U32 zp = 0; 3094 3095 3096 for( state=1 ; state < next_alloc ; state ++ ) { 3097 U32 base=0; 3098 3099 /* 3100 DEBUG_TRIE_COMPILE_MORE_r( 3101 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp) 3102 ); 3103 */ 3104 3105 if (trie->states[state].trans.list) { 3106 U16 minid=TRIE_LIST_ITEM( state, 1).forid; 3107 U16 maxid=minid; 3108 U16 idx; 3109 3110 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 3111 const U16 forid = TRIE_LIST_ITEM( state, idx).forid; 3112 if ( forid < minid ) { 3113 minid=forid; 3114 } else if ( forid > maxid ) { 3115 maxid=forid; 3116 } 3117 } 3118 if ( transcount < tp + maxid - minid + 1) { 3119 transcount *= 2; 3120 trie->trans = (reg_trie_trans *) 3121 PerlMemShared_realloc( trie->trans, 3122 transcount 3123 * sizeof(reg_trie_trans) ); 3124 Zero( trie->trans + (transcount / 2), 3125 transcount / 2, 3126 reg_trie_trans ); 3127 } 3128 base = trie->uniquecharcount + tp - minid; 3129 if ( maxid == minid ) { 3130 U32 set = 0; 3131 for ( ; zp < tp ; zp++ ) { 3132 if ( ! trie->trans[ zp ].next ) { 3133 base = trie->uniquecharcount + zp - minid; 3134 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 3135 1).newstate; 3136 trie->trans[ zp ].check = state; 3137 set = 1; 3138 break; 3139 } 3140 } 3141 if ( !set ) { 3142 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 3143 1).newstate; 3144 trie->trans[ tp ].check = state; 3145 tp++; 3146 zp = tp; 3147 } 3148 } else { 3149 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 3150 const U32 tid = base 3151 - trie->uniquecharcount 3152 + TRIE_LIST_ITEM( state, idx ).forid; 3153 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, 3154 idx ).newstate; 3155 trie->trans[ tid ].check = state; 3156 } 3157 tp += ( maxid - minid + 1 ); 3158 } 3159 Safefree(trie->states[ state ].trans.list); 3160 } 3161 /* 3162 DEBUG_TRIE_COMPILE_MORE_r( 3163 Perl_re_printf( aTHX_ " base: %d\n",base); 3164 ); 3165 */ 3166 trie->states[ state ].trans.base=base; 3167 } 3168 trie->lasttrans = tp + 1; 3169 } 3170 } else { 3171 /* 3172 Second Pass -- Flat Table Representation. 3173 3174 we dont use the 0 slot of either trans[] or states[] so we add 1 to 3175 each. We know that we will need Charcount+1 trans at most to store 3176 the data (one row per char at worst case) So we preallocate both 3177 structures assuming worst case. 3178 3179 We then construct the trie using only the .next slots of the entry 3180 structs. 3181 3182 We use the .check field of the first entry of the node temporarily 3183 to make compression both faster and easier by keeping track of how 3184 many non zero fields are in the node. 3185 3186 Since trans are numbered from 1 any 0 pointer in the table is a FAIL 3187 transition. 3188 3189 There are two terms at use here: state as a TRIE_NODEIDX() which is 3190 a number representing the first entry of the node, and state as a 3191 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) 3192 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) 3193 if there are 2 entrys per node. eg: 3194 3195 A B A B 3196 1. 2 4 1. 3 7 3197 2. 0 3 3. 0 5 3198 3. 0 0 5. 0 0 3199 4. 0 0 7. 0 0 3200 3201 The table is internally in the right hand, idx form. However as we 3202 also have to deal with the states array which is indexed by nodenum 3203 we have to use TRIE_NODENUM() to convert. 3204 3205 */ 3206 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n", 3207 depth+1)); 3208 3209 trie->trans = (reg_trie_trans *) 3210 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) 3211 * trie->uniquecharcount + 1, 3212 sizeof(reg_trie_trans) ); 3213 trie->states = (reg_trie_state *) 3214 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 3215 sizeof(reg_trie_state) ); 3216 next_alloc = trie->uniquecharcount + 1; 3217 3218 3219 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 3220 3221 regnode *noper = NEXTOPER( cur ); 3222 3223 U32 state = 1; /* required init */ 3224 3225 U16 charid = 0; /* sanity init */ 3226 U32 accept_state = 0; /* sanity init */ 3227 3228 U32 wordlen = 0; /* required init */ 3229 3230 if (OP(noper) == NOTHING) { 3231 regnode *noper_next= regnext(noper); 3232 if (noper_next < tail) 3233 noper= noper_next; 3234 /* we will undo this assignment if noper does not 3235 * point at a trieable type in the else clause of 3236 * the following statement. */ 3237 } 3238 3239 if ( noper < tail 3240 && ( OP(noper) == flags 3241 || (flags == EXACT && OP(noper) == EXACT_ONLY8) 3242 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8 3243 || OP(noper) == EXACTFUP)))) 3244 { 3245 const U8 *uc= (U8*)STRING(noper); 3246 const U8 *e= uc + STR_LEN(noper); 3247 3248 for ( ; uc < e ; uc += len ) { 3249 3250 TRIE_READ_CHAR; 3251 3252 if ( uvc < 256 ) { 3253 charid = trie->charmap[ uvc ]; 3254 } else { 3255 SV* const * const svpp = hv_fetch( widecharmap, 3256 (char*)&uvc, 3257 sizeof( UV ), 3258 0); 3259 charid = svpp ? (U16)SvIV(*svpp) : 0; 3260 } 3261 if ( charid ) { 3262 charid--; 3263 if ( !trie->trans[ state + charid ].next ) { 3264 trie->trans[ state + charid ].next = next_alloc; 3265 trie->trans[ state ].check++; 3266 prev_states[TRIE_NODENUM(next_alloc)] 3267 = TRIE_NODENUM(state); 3268 next_alloc += trie->uniquecharcount; 3269 } 3270 state = trie->trans[ state + charid ].next; 3271 } else { 3272 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); 3273 } 3274 /* charid is now 0 if we dont know the char read, or 3275 * nonzero if we do */ 3276 } 3277 } else { 3278 /* If we end up here it is because we skipped past a NOTHING, but did not end up 3279 * on a trieable type. So we need to reset noper back to point at the first regop 3280 * in the branch before we call TRIE_HANDLE_WORD(). 3281 */ 3282 noper= NEXTOPER(cur); 3283 } 3284 accept_state = TRIE_NODENUM( state ); 3285 TRIE_HANDLE_WORD(accept_state); 3286 3287 } /* end second pass */ 3288 3289 /* and now dump it out before we compress it */ 3290 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, 3291 revcharmap, 3292 next_alloc, depth+1)); 3293 3294 { 3295 /* 3296 * Inplace compress the table.* 3297 3298 For sparse data sets the table constructed by the trie algorithm will 3299 be mostly 0/FAIL transitions or to put it another way mostly empty. 3300 (Note that leaf nodes will not contain any transitions.) 3301 3302 This algorithm compresses the tables by eliminating most such 3303 transitions, at the cost of a modest bit of extra work during lookup: 3304 3305 - Each states[] entry contains a .base field which indicates the 3306 index in the state[] array wheres its transition data is stored. 3307 3308 - If .base is 0 there are no valid transitions from that node. 3309 3310 - If .base is nonzero then charid is added to it to find an entry in 3311 the trans array. 3312 3313 -If trans[states[state].base+charid].check!=state then the 3314 transition is taken to be a 0/Fail transition. Thus if there are fail 3315 transitions at the front of the node then the .base offset will point 3316 somewhere inside the previous nodes data (or maybe even into a node 3317 even earlier), but the .check field determines if the transition is 3318 valid. 3319 3320 XXX - wrong maybe? 3321 The following process inplace converts the table to the compressed 3322 table: We first do not compress the root node 1,and mark all its 3323 .check pointers as 1 and set its .base pointer as 1 as well. This 3324 allows us to do a DFA construction from the compressed table later, 3325 and ensures that any .base pointers we calculate later are greater 3326 than 0. 3327 3328 - We set 'pos' to indicate the first entry of the second node. 3329 3330 - We then iterate over the columns of the node, finding the first and 3331 last used entry at l and m. We then copy l..m into pos..(pos+m-l), 3332 and set the .check pointers accordingly, and advance pos 3333 appropriately and repreat for the next node. Note that when we copy 3334 the next pointers we have to convert them from the original 3335 NODEIDX form to NODENUM form as the former is not valid post 3336 compression. 3337 3338 - If a node has no transitions used we mark its base as 0 and do not 3339 advance the pos pointer. 3340 3341 - If a node only has one transition we use a second pointer into the 3342 structure to fill in allocated fail transitions from other states. 3343 This pointer is independent of the main pointer and scans forward 3344 looking for null transitions that are allocated to a state. When it 3345 finds one it writes the single transition into the "hole". If the 3346 pointer doesnt find one the single transition is appended as normal. 3347 3348 - Once compressed we can Renew/realloc the structures to release the 3349 excess space. 3350 3351 See "Table-Compression Methods" in sec 3.9 of the Red Dragon, 3352 specifically Fig 3.47 and the associated pseudocode. 3353 3354 demq 3355 */ 3356 const U32 laststate = TRIE_NODENUM( next_alloc ); 3357 U32 state, charid; 3358 U32 pos = 0, zp=0; 3359 trie->statecount = laststate; 3360 3361 for ( state = 1 ; state < laststate ; state++ ) { 3362 U8 flag = 0; 3363 const U32 stateidx = TRIE_NODEIDX( state ); 3364 const U32 o_used = trie->trans[ stateidx ].check; 3365 U32 used = trie->trans[ stateidx ].check; 3366 trie->trans[ stateidx ].check = 0; 3367 3368 for ( charid = 0; 3369 used && charid < trie->uniquecharcount; 3370 charid++ ) 3371 { 3372 if ( flag || trie->trans[ stateidx + charid ].next ) { 3373 if ( trie->trans[ stateidx + charid ].next ) { 3374 if (o_used == 1) { 3375 for ( ; zp < pos ; zp++ ) { 3376 if ( ! trie->trans[ zp ].next ) { 3377 break; 3378 } 3379 } 3380 trie->states[ state ].trans.base 3381 = zp 3382 + trie->uniquecharcount 3383 - charid ; 3384 trie->trans[ zp ].next 3385 = SAFE_TRIE_NODENUM( trie->trans[ stateidx 3386 + charid ].next ); 3387 trie->trans[ zp ].check = state; 3388 if ( ++zp > pos ) pos = zp; 3389 break; 3390 } 3391 used--; 3392 } 3393 if ( !flag ) { 3394 flag = 1; 3395 trie->states[ state ].trans.base 3396 = pos + trie->uniquecharcount - charid ; 3397 } 3398 trie->trans[ pos ].next 3399 = SAFE_TRIE_NODENUM( 3400 trie->trans[ stateidx + charid ].next ); 3401 trie->trans[ pos ].check = state; 3402 pos++; 3403 } 3404 } 3405 } 3406 trie->lasttrans = pos + 1; 3407 trie->states = (reg_trie_state *) 3408 PerlMemShared_realloc( trie->states, laststate 3409 * sizeof(reg_trie_state) ); 3410 DEBUG_TRIE_COMPILE_MORE_r( 3411 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n", 3412 depth+1, 3413 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount 3414 + 1 ), 3415 (IV)next_alloc, 3416 (IV)pos, 3417 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); 3418 ); 3419 3420 } /* end table compress */ 3421 } 3422 DEBUG_TRIE_COMPILE_MORE_r( 3423 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n", 3424 depth+1, 3425 (UV)trie->statecount, 3426 (UV)trie->lasttrans) 3427 ); 3428 /* resize the trans array to remove unused space */ 3429 trie->trans = (reg_trie_trans *) 3430 PerlMemShared_realloc( trie->trans, trie->lasttrans 3431 * sizeof(reg_trie_trans) ); 3432 3433 { /* Modify the program and insert the new TRIE node */ 3434 U8 nodetype =(U8)(flags & 0xFF); 3435 char *str=NULL; 3436 3437 #ifdef DEBUGGING 3438 regnode *optimize = NULL; 3439 #ifdef RE_TRACK_PATTERN_OFFSETS 3440 3441 U32 mjd_offset = 0; 3442 U32 mjd_nodelen = 0; 3443 #endif /* RE_TRACK_PATTERN_OFFSETS */ 3444 #endif /* DEBUGGING */ 3445 /* 3446 This means we convert either the first branch or the first Exact, 3447 depending on whether the thing following (in 'last') is a branch 3448 or not and whther first is the startbranch (ie is it a sub part of 3449 the alternation or is it the whole thing.) 3450 Assuming its a sub part we convert the EXACT otherwise we convert 3451 the whole branch sequence, including the first. 3452 */ 3453 /* Find the node we are going to overwrite */ 3454 if ( first != startbranch || OP( last ) == BRANCH ) { 3455 /* branch sub-chain */ 3456 NEXT_OFF( first ) = (U16)(last - first); 3457 #ifdef RE_TRACK_PATTERN_OFFSETS 3458 DEBUG_r({ 3459 mjd_offset= Node_Offset((convert)); 3460 mjd_nodelen= Node_Length((convert)); 3461 }); 3462 #endif 3463 /* whole branch chain */ 3464 } 3465 #ifdef RE_TRACK_PATTERN_OFFSETS 3466 else { 3467 DEBUG_r({ 3468 const regnode *nop = NEXTOPER( convert ); 3469 mjd_offset= Node_Offset((nop)); 3470 mjd_nodelen= Node_Length((nop)); 3471 }); 3472 } 3473 DEBUG_OPTIMISE_r( 3474 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n", 3475 depth+1, 3476 (UV)mjd_offset, (UV)mjd_nodelen) 3477 ); 3478 #endif 3479 /* But first we check to see if there is a common prefix we can 3480 split out as an EXACT and put in front of the TRIE node. */ 3481 trie->startstate= 1; 3482 if ( trie->bitmap && !widecharmap && !trie->jump ) { 3483 /* we want to find the first state that has more than 3484 * one transition, if that state is not the first state 3485 * then we have a common prefix which we can remove. 3486 */ 3487 U32 state; 3488 for ( state = 1 ; state < trie->statecount-1 ; state++ ) { 3489 U32 ofs = 0; 3490 I32 first_ofs = -1; /* keeps track of the ofs of the first 3491 transition, -1 means none */ 3492 U32 count = 0; 3493 const U32 base = trie->states[ state ].trans.base; 3494 3495 /* does this state terminate an alternation? */ 3496 if ( trie->states[state].wordnum ) 3497 count = 1; 3498 3499 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 3500 if ( ( base + ofs >= trie->uniquecharcount ) && 3501 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && 3502 trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) 3503 { 3504 if ( ++count > 1 ) { 3505 /* we have more than one transition */ 3506 SV **tmp; 3507 U8 *ch; 3508 /* if this is the first state there is no common prefix 3509 * to extract, so we can exit */ 3510 if ( state == 1 ) break; 3511 tmp = av_fetch( revcharmap, ofs, 0); 3512 ch = (U8*)SvPV_nolen_const( *tmp ); 3513 3514 /* if we are on count 2 then we need to initialize the 3515 * bitmap, and store the previous char if there was one 3516 * in it*/ 3517 if ( count == 2 ) { 3518 /* clear the bitmap */ 3519 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); 3520 DEBUG_OPTIMISE_r( 3521 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [", 3522 depth+1, 3523 (UV)state)); 3524 if (first_ofs >= 0) { 3525 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0); 3526 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); 3527 3528 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder); 3529 DEBUG_OPTIMISE_r( 3530 Perl_re_printf( aTHX_ "%s", (char*)ch) 3531 ); 3532 } 3533 } 3534 /* store the current firstchar in the bitmap */ 3535 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder); 3536 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch)); 3537 } 3538 first_ofs = ofs; 3539 } 3540 } 3541 if ( count == 1 ) { 3542 /* This state has only one transition, its transition is part 3543 * of a common prefix - we need to concatenate the char it 3544 * represents to what we have so far. */ 3545 SV **tmp = av_fetch( revcharmap, first_ofs, 0); 3546 STRLEN len; 3547 char *ch = SvPV( *tmp, len ); 3548 DEBUG_OPTIMISE_r({ 3549 SV *sv=sv_newmortal(); 3550 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n", 3551 depth+1, 3552 (UV)state, (UV)first_ofs, 3553 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, 3554 PL_colors[0], PL_colors[1], 3555 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 3556 PERL_PV_ESCAPE_FIRSTCHAR 3557 ) 3558 ); 3559 }); 3560 if ( state==1 ) { 3561 OP( convert ) = nodetype; 3562 str=STRING(convert); 3563 STR_LEN(convert)=0; 3564 } 3565 STR_LEN(convert) += len; 3566 while (len--) 3567 *str++ = *ch++; 3568 } else { 3569 #ifdef DEBUGGING 3570 if (state>1) 3571 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n")); 3572 #endif 3573 break; 3574 } 3575 } 3576 trie->prefixlen = (state-1); 3577 if (str) { 3578 regnode *n = convert+NODE_SZ_STR(convert); 3579 NEXT_OFF(convert) = NODE_SZ_STR(convert); 3580 trie->startstate = state; 3581 trie->minlen -= (state - 1); 3582 trie->maxlen -= (state - 1); 3583 #ifdef DEBUGGING 3584 /* At least the UNICOS C compiler choked on this 3585 * being argument to DEBUG_r(), so let's just have 3586 * it right here. */ 3587 if ( 3588 #ifdef PERL_EXT_RE_BUILD 3589 1 3590 #else 3591 DEBUG_r_TEST 3592 #endif 3593 ) { 3594 regnode *fix = convert; 3595 U32 word = trie->wordcount; 3596 #ifdef RE_TRACK_PATTERN_OFFSETS 3597 mjd_nodelen++; 3598 #endif 3599 Set_Node_Offset_Length(convert, mjd_offset, state - 1); 3600 while( ++fix < n ) { 3601 Set_Node_Offset_Length(fix, 0, 0); 3602 } 3603 while (word--) { 3604 SV ** const tmp = av_fetch( trie_words, word, 0 ); 3605 if (tmp) { 3606 if ( STR_LEN(convert) <= SvCUR(*tmp) ) 3607 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); 3608 else 3609 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); 3610 } 3611 } 3612 } 3613 #endif 3614 if (trie->maxlen) { 3615 convert = n; 3616 } else { 3617 NEXT_OFF(convert) = (U16)(tail - convert); 3618 DEBUG_r(optimize= n); 3619 } 3620 } 3621 } 3622 if (!jumper) 3623 jumper = last; 3624 if ( trie->maxlen ) { 3625 NEXT_OFF( convert ) = (U16)(tail - convert); 3626 ARG_SET( convert, data_slot ); 3627 /* Store the offset to the first unabsorbed branch in 3628 jump[0], which is otherwise unused by the jump logic. 3629 We use this when dumping a trie and during optimisation. */ 3630 if (trie->jump) 3631 trie->jump[0] = (U16)(nextbranch - convert); 3632 3633 /* If the start state is not accepting (meaning there is no empty string/NOTHING) 3634 * and there is a bitmap 3635 * and the first "jump target" node we found leaves enough room 3636 * then convert the TRIE node into a TRIEC node, with the bitmap 3637 * embedded inline in the opcode - this is hypothetically faster. 3638 */ 3639 if ( !trie->states[trie->startstate].wordnum 3640 && trie->bitmap 3641 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) 3642 { 3643 OP( convert ) = TRIEC; 3644 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); 3645 PerlMemShared_free(trie->bitmap); 3646 trie->bitmap= NULL; 3647 } else 3648 OP( convert ) = TRIE; 3649 3650 /* store the type in the flags */ 3651 convert->flags = nodetype; 3652 DEBUG_r({ 3653 optimize = convert 3654 + NODE_STEP_REGNODE 3655 + regarglen[ OP( convert ) ]; 3656 }); 3657 /* XXX We really should free up the resource in trie now, 3658 as we won't use them - (which resources?) dmq */ 3659 } 3660 /* needed for dumping*/ 3661 DEBUG_r(if (optimize) { 3662 regnode *opt = convert; 3663 3664 while ( ++opt < optimize) { 3665 Set_Node_Offset_Length(opt, 0, 0); 3666 } 3667 /* 3668 Try to clean up some of the debris left after the 3669 optimisation. 3670 */ 3671 while( optimize < jumper ) { 3672 Track_Code( mjd_nodelen += Node_Length((optimize)); ); 3673 OP( optimize ) = OPTIMIZED; 3674 Set_Node_Offset_Length(optimize, 0, 0); 3675 optimize++; 3676 } 3677 Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen); 3678 }); 3679 } /* end node insert */ 3680 3681 /* Finish populating the prev field of the wordinfo array. Walk back 3682 * from each accept state until we find another accept state, and if 3683 * so, point the first word's .prev field at the second word. If the 3684 * second already has a .prev field set, stop now. This will be the 3685 * case either if we've already processed that word's accept state, 3686 * or that state had multiple words, and the overspill words were 3687 * already linked up earlier. 3688 */ 3689 { 3690 U16 word; 3691 U32 state; 3692 U16 prev; 3693 3694 for (word=1; word <= trie->wordcount; word++) { 3695 prev = 0; 3696 if (trie->wordinfo[word].prev) 3697 continue; 3698 state = trie->wordinfo[word].accept; 3699 while (state) { 3700 state = prev_states[state]; 3701 if (!state) 3702 break; 3703 prev = trie->states[state].wordnum; 3704 if (prev) 3705 break; 3706 } 3707 trie->wordinfo[word].prev = prev; 3708 } 3709 Safefree(prev_states); 3710 } 3711 3712 3713 /* and now dump out the compressed format */ 3714 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); 3715 3716 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; 3717 #ifdef DEBUGGING 3718 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; 3719 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; 3720 #else 3721 SvREFCNT_dec_NN(revcharmap); 3722 #endif 3723 return trie->jump 3724 ? MADE_JUMP_TRIE 3725 : trie->startstate>1 3726 ? MADE_EXACT_TRIE 3727 : MADE_TRIE; 3728 } 3729 3730 STATIC regnode * 3731 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) 3732 { 3733 /* The Trie is constructed and compressed now so we can build a fail array if 3734 * it's needed 3735 3736 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3737 3.32 in the 3738 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, 3739 Ullman 1985/88 3740 ISBN 0-201-10088-6 3741 3742 We find the fail state for each state in the trie, this state is the longest 3743 proper suffix of the current state's 'word' that is also a proper prefix of 3744 another word in our trie. State 1 represents the word '' and is thus the 3745 default fail state. This allows the DFA not to have to restart after its 3746 tried and failed a word at a given point, it simply continues as though it 3747 had been matching the other word in the first place. 3748 Consider 3749 'abcdgu'=~/abcdefg|cdgu/ 3750 When we get to 'd' we are still matching the first word, we would encounter 3751 'g' which would fail, which would bring us to the state representing 'd' in 3752 the second word where we would try 'g' and succeed, proceeding to match 3753 'cdgu'. 3754 */ 3755 /* add a fail transition */ 3756 const U32 trie_offset = ARG(source); 3757 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; 3758 U32 *q; 3759 const U32 ucharcount = trie->uniquecharcount; 3760 const U32 numstates = trie->statecount; 3761 const U32 ubound = trie->lasttrans + ucharcount; 3762 U32 q_read = 0; 3763 U32 q_write = 0; 3764 U32 charid; 3765 U32 base = trie->states[ 1 ].trans.base; 3766 U32 *fail; 3767 reg_ac_data *aho; 3768 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); 3769 regnode *stclass; 3770 GET_RE_DEBUG_FLAGS_DECL; 3771 3772 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; 3773 PERL_UNUSED_CONTEXT; 3774 #ifndef DEBUGGING 3775 PERL_UNUSED_ARG(depth); 3776 #endif 3777 3778 if ( OP(source) == TRIE ) { 3779 struct regnode_1 *op = (struct regnode_1 *) 3780 PerlMemShared_calloc(1, sizeof(struct regnode_1)); 3781 StructCopy(source, op, struct regnode_1); 3782 stclass = (regnode *)op; 3783 } else { 3784 struct regnode_charclass *op = (struct regnode_charclass *) 3785 PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); 3786 StructCopy(source, op, struct regnode_charclass); 3787 stclass = (regnode *)op; 3788 } 3789 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */ 3790 3791 ARG_SET( stclass, data_slot ); 3792 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); 3793 RExC_rxi->data->data[ data_slot ] = (void*)aho; 3794 aho->trie=trie_offset; 3795 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); 3796 Copy( trie->states, aho->states, numstates, reg_trie_state ); 3797 Newx( q, numstates, U32); 3798 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); 3799 aho->refcount = 1; 3800 fail = aho->fail; 3801 /* initialize fail[0..1] to be 1 so that we always have 3802 a valid final fail state */ 3803 fail[ 0 ] = fail[ 1 ] = 1; 3804 3805 for ( charid = 0; charid < ucharcount ; charid++ ) { 3806 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); 3807 if ( newstate ) { 3808 q[ q_write ] = newstate; 3809 /* set to point at the root */ 3810 fail[ q[ q_write++ ] ]=1; 3811 } 3812 } 3813 while ( q_read < q_write) { 3814 const U32 cur = q[ q_read++ % numstates ]; 3815 base = trie->states[ cur ].trans.base; 3816 3817 for ( charid = 0 ; charid < ucharcount ; charid++ ) { 3818 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); 3819 if (ch_state) { 3820 U32 fail_state = cur; 3821 U32 fail_base; 3822 do { 3823 fail_state = fail[ fail_state ]; 3824 fail_base = aho->states[ fail_state ].trans.base; 3825 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); 3826 3827 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); 3828 fail[ ch_state ] = fail_state; 3829 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) 3830 { 3831 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; 3832 } 3833 q[ q_write++ % numstates] = ch_state; 3834 } 3835 } 3836 } 3837 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop 3838 when we fail in state 1, this allows us to use the 3839 charclass scan to find a valid start char. This is based on the principle 3840 that theres a good chance the string being searched contains lots of stuff 3841 that cant be a start char. 3842 */ 3843 fail[ 0 ] = fail[ 1 ] = 0; 3844 DEBUG_TRIE_COMPILE_r({ 3845 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0", 3846 depth, (UV)numstates 3847 ); 3848 for( q_read=1; q_read<numstates; q_read++ ) { 3849 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]); 3850 } 3851 Perl_re_printf( aTHX_ "\n"); 3852 }); 3853 Safefree(q); 3854 /*RExC_seen |= REG_TRIEDFA_SEEN;*/ 3855 return stclass; 3856 } 3857 3858 3859 /* The below joins as many adjacent EXACTish nodes as possible into a single 3860 * one. The regop may be changed if the node(s) contain certain sequences that 3861 * require special handling. The joining is only done if: 3862 * 1) there is room in the current conglomerated node to entirely contain the 3863 * next one. 3864 * 2) they are compatible node types 3865 * 3866 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and 3867 * these get optimized out 3868 * 3869 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full 3870 * as possible, even if that means splitting an existing node so that its first 3871 * part is moved to the preceeding node. This would maximise the efficiency of 3872 * memEQ during matching. 3873 * 3874 * If a node is to match under /i (folded), the number of characters it matches 3875 * can be different than its character length if it contains a multi-character 3876 * fold. *min_subtract is set to the total delta number of characters of the 3877 * input nodes. 3878 * 3879 * And *unfolded_multi_char is set to indicate whether or not the node contains 3880 * an unfolded multi-char fold. This happens when it won't be known until 3881 * runtime whether the fold is valid or not; namely 3882 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the 3883 * target string being matched against turns out to be UTF-8 is that fold 3884 * valid; or 3885 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at 3886 * runtime. 3887 * (Multi-char folds whose components are all above the Latin1 range are not 3888 * run-time locale dependent, and have already been folded by the time this 3889 * function is called.) 3890 * 3891 * This is as good a place as any to discuss the design of handling these 3892 * multi-character fold sequences. It's been wrong in Perl for a very long 3893 * time. There are three code points in Unicode whose multi-character folds 3894 * were long ago discovered to mess things up. The previous designs for 3895 * dealing with these involved assigning a special node for them. This 3896 * approach doesn't always work, as evidenced by this example: 3897 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches 3898 * Both sides fold to "sss", but if the pattern is parsed to create a node that 3899 * would match just the \xDF, it won't be able to handle the case where a 3900 * successful match would have to cross the node's boundary. The new approach 3901 * that hopefully generally solves the problem generates an EXACTFUP node 3902 * that is "sss" in this case. 3903 * 3904 * It turns out that there are problems with all multi-character folds, and not 3905 * just these three. Now the code is general, for all such cases. The 3906 * approach taken is: 3907 * 1) This routine examines each EXACTFish node that could contain multi- 3908 * character folded sequences. Since a single character can fold into 3909 * such a sequence, the minimum match length for this node is less than 3910 * the number of characters in the node. This routine returns in 3911 * *min_subtract how many characters to subtract from the the actual 3912 * length of the string to get a real minimum match length; it is 0 if 3913 * there are no multi-char foldeds. This delta is used by the caller to 3914 * adjust the min length of the match, and the delta between min and max, 3915 * so that the optimizer doesn't reject these possibilities based on size 3916 * constraints. 3917 * 3918 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF) 3919 * under /u, we fold it to 'ss' in regatom(), and in this routine, after 3920 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8 3921 * EXACTFU nodes. The node type of such nodes is then changed to 3922 * EXACTFUP, indicating it is problematic, and needs careful handling. 3923 * (The procedures in step 1) above are sufficient to handle this case in 3924 * UTF-8 encoded nodes.) The reason this is problematic is that this is 3925 * the only case where there is a possible fold length change in non-UTF-8 3926 * patterns. By reserving a special node type for problematic cases, the 3927 * far more common regular EXACTFU nodes can be processed faster. 3928 * regexec.c takes advantage of this. 3929 * 3930 * EXACTFUP has been created as a grab-bag for (hopefully uncommon) 3931 * problematic cases. These all only occur when the pattern is not 3932 * UTF-8. In addition to the 'ss' sequence where there is a possible fold 3933 * length change, it handles the situation where the string cannot be 3934 * entirely folded. The strings in an EXACTFish node are folded as much 3935 * as possible during compilation in regcomp.c. This saves effort in 3936 * regex matching. By using an EXACTFUP node when it is not possible to 3937 * fully fold at compile time, regexec.c can know that everything in an 3938 * EXACTFU node is folded, so folding can be skipped at runtime. The only 3939 * case where folding in EXACTFU nodes can't be done at compile time is 3940 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This 3941 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes 3942 * handle two very different cases. Alternatively, there could have been 3943 * a node type where there are length changes, one for unfolded, and one 3944 * for both. If yet another special case needed to be created, the number 3945 * of required node types would have to go to 7. khw figures that even 3946 * though there are plenty of node types to spare, that the maintenance 3947 * cost wasn't worth the small speedup of doing it that way, especially 3948 * since he thinks the MICRO SIGN is rarely encountered in practice. 3949 * 3950 * There are other cases where folding isn't done at compile time, but 3951 * none of them are under /u, and hence not for EXACTFU nodes. The folds 3952 * in EXACTFL nodes aren't known until runtime, and vary as the locale 3953 * changes. Some folds in EXACTF depend on if the runtime target string 3954 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di 3955 * when no fold in it depends on the UTF-8ness of the target string.) 3956 * 3957 * 3) A problem remains for unfolded multi-char folds. (These occur when the 3958 * validity of the fold won't be known until runtime, and so must remain 3959 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA 3960 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot 3961 * be an EXACTF node with a UTF-8 pattern.) They also occur for various 3962 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) 3963 * The reason this is a problem is that the optimizer part of regexec.c 3964 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption 3965 * that a character in the pattern corresponds to at most a single 3966 * character in the target string. (And I do mean character, and not byte 3967 * here, unlike other parts of the documentation that have never been 3968 * updated to account for multibyte Unicode.) Sharp s in EXACTF and 3969 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA 3970 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in 3971 * EXACTFL nodes, violate the assumption, and they are the only instances 3972 * where it is violated. I'm reluctant to try to change the assumption, 3973 * as the code involved is impenetrable to me (khw), so instead the code 3974 * here punts. This routine examines EXACTFL nodes, and (when the pattern 3975 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a 3976 * boolean indicating whether or not the node contains such a fold. When 3977 * it is true, the caller sets a flag that later causes the optimizer in 3978 * this file to not set values for the floating and fixed string lengths, 3979 * and thus avoids the optimizer code in regexec.c that makes the invalid 3980 * assumption. Thus, there is no optimization based on string lengths for 3981 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern 3982 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the 3983 * assumption is wrong only in these cases is that all other non-UTF-8 3984 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to 3985 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in 3986 * EXACTF nodes because we don't know at compile time if it actually 3987 * matches 'ss' or not. For EXACTF nodes it will match iff the target 3988 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it 3989 * always matches; and EXACTFAA where it never does. In an EXACTFAA node 3990 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the 3991 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 3992 * string would require the pattern to be forced into UTF-8, the overhead 3993 * of which we want to avoid. Similarly the unfolded multi-char folds in 3994 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 3995 * locale.) 3996 * 3997 * Similarly, the code that generates tries doesn't currently handle 3998 * not-already-folded multi-char folds, and it looks like a pain to change 3999 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s 4000 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode, 4001 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people 4002 * using /iaa matching will be doing so almost entirely with ASCII 4003 * strings, so this should rarely be encountered in practice */ 4004 4005 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ 4006 if (PL_regkind[OP(scan)] == EXACT) \ 4007 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1) 4008 4009 STATIC U32 4010 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, 4011 UV *min_subtract, bool *unfolded_multi_char, 4012 U32 flags, regnode *val, U32 depth) 4013 { 4014 /* Merge several consecutive EXACTish nodes into one. */ 4015 4016 regnode *n = regnext(scan); 4017 U32 stringok = 1; 4018 regnode *next = scan + NODE_SZ_STR(scan); 4019 U32 merged = 0; 4020 U32 stopnow = 0; 4021 #ifdef DEBUGGING 4022 regnode *stop = scan; 4023 GET_RE_DEBUG_FLAGS_DECL; 4024 #else 4025 PERL_UNUSED_ARG(depth); 4026 #endif 4027 4028 PERL_ARGS_ASSERT_JOIN_EXACT; 4029 #ifndef EXPERIMENTAL_INPLACESCAN 4030 PERL_UNUSED_ARG(flags); 4031 PERL_UNUSED_ARG(val); 4032 #endif 4033 DEBUG_PEEP("join", scan, depth, 0); 4034 4035 assert(PL_regkind[OP(scan)] == EXACT); 4036 4037 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge 4038 * EXACT ones that are mergeable to the current one. */ 4039 while ( n 4040 && ( PL_regkind[OP(n)] == NOTHING 4041 || (stringok && PL_regkind[OP(n)] == EXACT)) 4042 && NEXT_OFF(n) 4043 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) 4044 { 4045 4046 if (OP(n) == TAIL || n > next) 4047 stringok = 0; 4048 if (PL_regkind[OP(n)] == NOTHING) { 4049 DEBUG_PEEP("skip:", n, depth, 0); 4050 NEXT_OFF(scan) += NEXT_OFF(n); 4051 next = n + NODE_STEP_REGNODE; 4052 #ifdef DEBUGGING 4053 if (stringok) 4054 stop = n; 4055 #endif 4056 n = regnext(n); 4057 } 4058 else if (stringok) { 4059 const unsigned int oldl = STR_LEN(scan); 4060 regnode * const nnext = regnext(n); 4061 4062 /* XXX I (khw) kind of doubt that this works on platforms (should 4063 * Perl ever run on one) where U8_MAX is above 255 because of lots 4064 * of other assumptions */ 4065 /* Don't join if the sum can't fit into a single node */ 4066 if (oldl + STR_LEN(n) > U8_MAX) 4067 break; 4068 4069 /* Joining something that requires UTF-8 with something that 4070 * doesn't, means the result requires UTF-8. */ 4071 if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) { 4072 OP(scan) = EXACT_ONLY8; 4073 } 4074 else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) { 4075 ; /* join is compatible, no need to change OP */ 4076 } 4077 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) { 4078 OP(scan) = EXACTFU_ONLY8; 4079 } 4080 else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) { 4081 ; /* join is compatible, no need to change OP */ 4082 } 4083 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) { 4084 ; /* join is compatible, no need to change OP */ 4085 } 4086 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) { 4087 4088 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated, 4089 * which can join with EXACTFU ones. We check for this case 4090 * here. These need to be resolved to either EXACTFU or 4091 * EXACTF at joining time. They have nothing in them that 4092 * would forbid them from being the more desirable EXACTFU 4093 * nodes except that they begin and/or end with a single [Ss]. 4094 * The reason this is problematic is because they could be 4095 * joined in this loop with an adjacent node that ends and/or 4096 * begins with [Ss] which would then form the sequence 'ss', 4097 * which matches differently under /di than /ui, in which case 4098 * EXACTFU can't be used. If the 'ss' sequence doesn't get 4099 * formed, the nodes get absorbed into any adjacent EXACTFU 4100 * node. And if the only adjacent node is EXACTF, they get 4101 * absorbed into that, under the theory that a longer node is 4102 * better than two shorter ones, even if one is EXACTFU. Note 4103 * that EXACTFU_ONLY8 is generated only for UTF-8 patterns, 4104 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */ 4105 4106 if (STRING(n)[STR_LEN(n)-1] == 's') { 4107 4108 /* Here the joined node would end with 's'. If the node 4109 * following the combination is an EXACTF one, it's better to 4110 * join this trailing edge 's' node with that one, leaving the 4111 * current one in 'scan' be the more desirable EXACTFU */ 4112 if (OP(nnext) == EXACTF) { 4113 break; 4114 } 4115 4116 OP(scan) = EXACTFU_S_EDGE; 4117 4118 } /* Otherwise, the beginning 's' of the 2nd node just 4119 becomes an interior 's' in 'scan' */ 4120 } 4121 else if (OP(scan) == EXACTF && OP(n) == EXACTF) { 4122 ; /* join is compatible, no need to change OP */ 4123 } 4124 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) { 4125 4126 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE 4127 * nodes. But the latter nodes can be also joined with EXACTFU 4128 * ones, and that is a better outcome, so if the node following 4129 * 'n' is EXACTFU, quit now so that those two can be joined 4130 * later */ 4131 if (OP(nnext) == EXACTFU) { 4132 break; 4133 } 4134 4135 /* The join is compatible, and the combined node will be 4136 * EXACTF. (These don't care if they begin or end with 's' */ 4137 } 4138 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) { 4139 if ( STRING(scan)[STR_LEN(scan)-1] == 's' 4140 && STRING(n)[0] == 's') 4141 { 4142 /* When combined, we have the sequence 'ss', which means we 4143 * have to remain /di */ 4144 OP(scan) = EXACTF; 4145 } 4146 } 4147 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) { 4148 if (STRING(n)[0] == 's') { 4149 ; /* Here the join is compatible and the combined node 4150 starts with 's', no need to change OP */ 4151 } 4152 else { /* Now the trailing 's' is in the interior */ 4153 OP(scan) = EXACTFU; 4154 } 4155 } 4156 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) { 4157 4158 /* The join is compatible, and the combined node will be 4159 * EXACTF. (These don't care if they begin or end with 's' */ 4160 OP(scan) = EXACTF; 4161 } 4162 else if (OP(scan) != OP(n)) { 4163 4164 /* The only other compatible joinings are the same node type */ 4165 break; 4166 } 4167 4168 DEBUG_PEEP("merg", n, depth, 0); 4169 merged++; 4170 4171 NEXT_OFF(scan) += NEXT_OFF(n); 4172 STR_LEN(scan) += STR_LEN(n); 4173 next = n + NODE_SZ_STR(n); 4174 /* Now we can overwrite *n : */ 4175 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); 4176 #ifdef DEBUGGING 4177 stop = next - 1; 4178 #endif 4179 n = nnext; 4180 if (stopnow) break; 4181 } 4182 4183 #ifdef EXPERIMENTAL_INPLACESCAN 4184 if (flags && !NEXT_OFF(n)) { 4185 DEBUG_PEEP("atch", val, depth, 0); 4186 if (reg_off_by_arg[OP(n)]) { 4187 ARG_SET(n, val - n); 4188 } 4189 else { 4190 NEXT_OFF(n) = val - n; 4191 } 4192 stopnow = 1; 4193 } 4194 #endif 4195 } 4196 4197 /* This temporary node can now be turned into EXACTFU, and must, as 4198 * regexec.c doesn't handle it */ 4199 if (OP(scan) == EXACTFU_S_EDGE) { 4200 OP(scan) = EXACTFU; 4201 } 4202 4203 *min_subtract = 0; 4204 *unfolded_multi_char = FALSE; 4205 4206 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We 4207 * can now analyze for sequences of problematic code points. (Prior to 4208 * this final joining, sequences could have been split over boundaries, and 4209 * hence missed). The sequences only happen in folding, hence for any 4210 * non-EXACT EXACTish node */ 4211 if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) { 4212 U8* s0 = (U8*) STRING(scan); 4213 U8* s = s0; 4214 U8* s_end = s0 + STR_LEN(scan); 4215 4216 int total_count_delta = 0; /* Total delta number of characters that 4217 multi-char folds expand to */ 4218 4219 /* One pass is made over the node's string looking for all the 4220 * possibilities. To avoid some tests in the loop, there are two main 4221 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and 4222 * non-UTF-8 */ 4223 if (UTF) { 4224 U8* folded = NULL; 4225 4226 if (OP(scan) == EXACTFL) { 4227 U8 *d; 4228 4229 /* An EXACTFL node would already have been changed to another 4230 * node type unless there is at least one character in it that 4231 * is problematic; likely a character whose fold definition 4232 * won't be known until runtime, and so has yet to be folded. 4233 * For all but the UTF-8 locale, folds are 1-1 in length, but 4234 * to handle the UTF-8 case, we need to create a temporary 4235 * folded copy using UTF-8 locale rules in order to analyze it. 4236 * This is because our macros that look to see if a sequence is 4237 * a multi-char fold assume everything is folded (otherwise the 4238 * tests in those macros would be too complicated and slow). 4239 * Note that here, the non-problematic folds will have already 4240 * been done, so we can just copy such characters. We actually 4241 * don't completely fold the EXACTFL string. We skip the 4242 * unfolded multi-char folds, as that would just create work 4243 * below to figure out the size they already are */ 4244 4245 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); 4246 d = folded; 4247 while (s < s_end) { 4248 STRLEN s_len = UTF8SKIP(s); 4249 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { 4250 Copy(s, d, s_len, U8); 4251 d += s_len; 4252 } 4253 else if (is_FOLDS_TO_MULTI_utf8(s)) { 4254 *unfolded_multi_char = TRUE; 4255 Copy(s, d, s_len, U8); 4256 d += s_len; 4257 } 4258 else if (isASCII(*s)) { 4259 *(d++) = toFOLD(*s); 4260 } 4261 else { 4262 STRLEN len; 4263 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL); 4264 d += len; 4265 } 4266 s += s_len; 4267 } 4268 4269 /* Point the remainder of the routine to look at our temporary 4270 * folded copy */ 4271 s = folded; 4272 s_end = d; 4273 } /* End of creating folded copy of EXACTFL string */ 4274 4275 /* Examine the string for a multi-character fold sequence. UTF-8 4276 * patterns have all characters pre-folded by the time this code is 4277 * executed */ 4278 while (s < s_end - 1) /* Can stop 1 before the end, as minimum 4279 length sequence we are looking for is 2 */ 4280 { 4281 int count = 0; /* How many characters in a multi-char fold */ 4282 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); 4283 if (! len) { /* Not a multi-char fold: get next char */ 4284 s += UTF8SKIP(s); 4285 continue; 4286 } 4287 4288 { /* Here is a generic multi-char fold. */ 4289 U8* multi_end = s + len; 4290 4291 /* Count how many characters are in it. In the case of 4292 * /aa, no folds which contain ASCII code points are 4293 * allowed, so check for those, and skip if found. */ 4294 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) { 4295 count = utf8_length(s, multi_end); 4296 s = multi_end; 4297 } 4298 else { 4299 while (s < multi_end) { 4300 if (isASCII(*s)) { 4301 s++; 4302 goto next_iteration; 4303 } 4304 else { 4305 s += UTF8SKIP(s); 4306 } 4307 count++; 4308 } 4309 } 4310 } 4311 4312 /* The delta is how long the sequence is minus 1 (1 is how long 4313 * the character that folds to the sequence is) */ 4314 total_count_delta += count - 1; 4315 next_iteration: ; 4316 } 4317 4318 /* We created a temporary folded copy of the string in EXACTFL 4319 * nodes. Therefore we need to be sure it doesn't go below zero, 4320 * as the real string could be shorter */ 4321 if (OP(scan) == EXACTFL) { 4322 int total_chars = utf8_length((U8*) STRING(scan), 4323 (U8*) STRING(scan) + STR_LEN(scan)); 4324 if (total_count_delta > total_chars) { 4325 total_count_delta = total_chars; 4326 } 4327 } 4328 4329 *min_subtract += total_count_delta; 4330 Safefree(folded); 4331 } 4332 else if (OP(scan) == EXACTFAA) { 4333 4334 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char 4335 * fold to the ASCII range (and there are no existing ones in the 4336 * upper latin1 range). But, as outlined in the comments preceding 4337 * this function, we need to flag any occurrences of the sharp s. 4338 * This character forbids trie formation (because of added 4339 * complexity) */ 4340 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 4341 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 4342 || UNICODE_DOT_DOT_VERSION > 0) 4343 while (s < s_end) { 4344 if (*s == LATIN_SMALL_LETTER_SHARP_S) { 4345 OP(scan) = EXACTFAA_NO_TRIE; 4346 *unfolded_multi_char = TRUE; 4347 break; 4348 } 4349 s++; 4350 } 4351 } 4352 else { 4353 4354 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char 4355 * folds that are all Latin1. As explained in the comments 4356 * preceding this function, we look also for the sharp s in EXACTF 4357 * and EXACTFL nodes; it can be in the final position. Otherwise 4358 * we can stop looking 1 byte earlier because have to find at least 4359 * two characters for a multi-fold */ 4360 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) 4361 ? s_end 4362 : s_end -1; 4363 4364 while (s < upper) { 4365 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); 4366 if (! len) { /* Not a multi-char fold. */ 4367 if (*s == LATIN_SMALL_LETTER_SHARP_S 4368 && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) 4369 { 4370 *unfolded_multi_char = TRUE; 4371 } 4372 s++; 4373 continue; 4374 } 4375 4376 if (len == 2 4377 && isALPHA_FOLD_EQ(*s, 's') 4378 && isALPHA_FOLD_EQ(*(s+1), 's')) 4379 { 4380 4381 /* EXACTF nodes need to know that the minimum length 4382 * changed so that a sharp s in the string can match this 4383 * ss in the pattern, but they remain EXACTF nodes, as they 4384 * won't match this unless the target string is is UTF-8, 4385 * which we don't know until runtime. EXACTFL nodes can't 4386 * transform into EXACTFU nodes */ 4387 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { 4388 OP(scan) = EXACTFUP; 4389 } 4390 } 4391 4392 *min_subtract += len - 1; 4393 s += len; 4394 } 4395 #endif 4396 } 4397 4398 if ( STR_LEN(scan) == 1 4399 && isALPHA_A(* STRING(scan)) 4400 && ( OP(scan) == EXACTFAA 4401 || ( OP(scan) == EXACTFU 4402 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan))))) 4403 { 4404 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */ 4405 4406 /* Replace a length 1 ASCII fold pair node with an ANYOFM node, 4407 * with the mask set to the complement of the bit that differs 4408 * between upper and lower case, and the lowest code point of the 4409 * pair (which the '&' forces) */ 4410 OP(scan) = ANYOFM; 4411 ARG_SET(scan, *STRING(scan) & mask); 4412 FLAGS(scan) = mask; 4413 } 4414 } 4415 4416 #ifdef DEBUGGING 4417 /* Allow dumping but overwriting the collection of skipped 4418 * ops and/or strings with fake optimized ops */ 4419 n = scan + NODE_SZ_STR(scan); 4420 while (n <= stop) { 4421 OP(n) = OPTIMIZED; 4422 FLAGS(n) = 0; 4423 NEXT_OFF(n) = 0; 4424 n++; 4425 } 4426 #endif 4427 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);}); 4428 return stopnow; 4429 } 4430 4431 /* REx optimizer. Converts nodes into quicker variants "in place". 4432 Finds fixed substrings. */ 4433 4434 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set 4435 to the position after last scanned or to NULL. */ 4436 4437 #define INIT_AND_WITHP \ 4438 assert(!and_withp); \ 4439 Newx(and_withp, 1, regnode_ssc); \ 4440 SAVEFREEPV(and_withp) 4441 4442 4443 static void 4444 S_unwind_scan_frames(pTHX_ const void *p) 4445 { 4446 scan_frame *f= (scan_frame *)p; 4447 do { 4448 scan_frame *n= f->next_frame; 4449 Safefree(f); 4450 f= n; 4451 } while (f); 4452 } 4453 4454 /* Follow the next-chain of the current node and optimize away 4455 all the NOTHINGs from it. 4456 */ 4457 STATIC void 4458 S_rck_elide_nothing(pTHX_ regnode *node) 4459 { 4460 dVAR; 4461 4462 PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING; 4463 4464 if (OP(node) != CURLYX) { 4465 const int max = (reg_off_by_arg[OP(node)] 4466 ? I32_MAX 4467 /* I32 may be smaller than U16 on CRAYs! */ 4468 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); 4469 int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node)); 4470 int noff; 4471 regnode *n = node; 4472 4473 /* Skip NOTHING and LONGJMP. */ 4474 while ( 4475 (n = regnext(n)) 4476 && ( 4477 (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) 4478 || ((OP(n) == LONGJMP) && (noff = ARG(n))) 4479 ) 4480 && off + noff < max 4481 ) { 4482 off += noff; 4483 } 4484 if (reg_off_by_arg[OP(node)]) 4485 ARG(node) = off; 4486 else 4487 NEXT_OFF(node) = off; 4488 } 4489 return; 4490 } 4491 4492 /* the return from this sub is the minimum length that could possibly match */ 4493 STATIC SSize_t 4494 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, 4495 SSize_t *minlenp, SSize_t *deltap, 4496 regnode *last, 4497 scan_data_t *data, 4498 I32 stopparen, 4499 U32 recursed_depth, 4500 regnode_ssc *and_withp, 4501 U32 flags, U32 depth, bool was_mutate_ok) 4502 /* scanp: Start here (read-write). */ 4503 /* deltap: Write maxlen-minlen here. */ 4504 /* last: Stop before this one. */ 4505 /* data: string data about the pattern */ 4506 /* stopparen: treat close N as END */ 4507 /* recursed: which subroutines have we recursed into */ 4508 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ 4509 { 4510 dVAR; 4511 /* There must be at least this number of characters to match */ 4512 SSize_t min = 0; 4513 I32 pars = 0, code; 4514 regnode *scan = *scanp, *next; 4515 SSize_t delta = 0; 4516 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); 4517 int is_inf_internal = 0; /* The studied chunk is infinite */ 4518 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; 4519 scan_data_t data_fake; 4520 SV *re_trie_maxbuff = NULL; 4521 regnode *first_non_open = scan; 4522 SSize_t stopmin = SSize_t_MAX; 4523 scan_frame *frame = NULL; 4524 GET_RE_DEBUG_FLAGS_DECL; 4525 4526 PERL_ARGS_ASSERT_STUDY_CHUNK; 4527 RExC_study_started= 1; 4528 4529 Zero(&data_fake, 1, scan_data_t); 4530 4531 if ( depth == 0 ) { 4532 while (first_non_open && OP(first_non_open) == OPEN) 4533 first_non_open=regnext(first_non_open); 4534 } 4535 4536 4537 fake_study_recurse: 4538 DEBUG_r( 4539 RExC_study_chunk_recursed_count++; 4540 ); 4541 DEBUG_OPTIMISE_MORE_r( 4542 { 4543 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p", 4544 depth, (long)stopparen, 4545 (unsigned long)RExC_study_chunk_recursed_count, 4546 (unsigned long)depth, (unsigned long)recursed_depth, 4547 scan, 4548 last); 4549 if (recursed_depth) { 4550 U32 i; 4551 U32 j; 4552 for ( j = 0 ; j < recursed_depth ; j++ ) { 4553 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) { 4554 if ( 4555 PAREN_TEST(RExC_study_chunk_recursed + 4556 ( j * RExC_study_chunk_recursed_bytes), i ) 4557 && ( 4558 !j || 4559 !PAREN_TEST(RExC_study_chunk_recursed + 4560 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i) 4561 ) 4562 ) { 4563 Perl_re_printf( aTHX_ " %d",(int)i); 4564 break; 4565 } 4566 } 4567 if ( j + 1 < recursed_depth ) { 4568 Perl_re_printf( aTHX_ ","); 4569 } 4570 } 4571 } 4572 Perl_re_printf( aTHX_ "\n"); 4573 } 4574 ); 4575 while ( scan && OP(scan) != END && scan < last ){ 4576 UV min_subtract = 0; /* How mmany chars to subtract from the minimum 4577 node length to get a real minimum (because 4578 the folded version may be shorter) */ 4579 bool unfolded_multi_char = FALSE; 4580 /* avoid mutating ops if we are anywhere within the recursed or 4581 * enframed handling for a GOSUB: the outermost level will handle it. 4582 */ 4583 bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub); 4584 /* Peephole optimizer: */ 4585 DEBUG_STUDYDATA("Peep", data, depth, is_inf); 4586 DEBUG_PEEP("Peep", scan, depth, flags); 4587 4588 4589 /* The reason we do this here is that we need to deal with things like 4590 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT 4591 * parsing code, as each (?:..) is handled by a different invocation of 4592 * reg() -- Yves 4593 */ 4594 if (mutate_ok) 4595 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); 4596 4597 /* Follow the next-chain of the current node and optimize 4598 away all the NOTHINGs from it. 4599 */ 4600 rck_elide_nothing(scan); 4601 4602 /* The principal pseudo-switch. Cannot be a switch, since we 4603 look into several different things. */ 4604 if ( OP(scan) == DEFINEP ) { 4605 SSize_t minlen = 0; 4606 SSize_t deltanext = 0; 4607 SSize_t fake_last_close = 0; 4608 I32 f = SCF_IN_DEFINE; 4609 4610 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 4611 scan = regnext(scan); 4612 assert( OP(scan) == IFTHEN ); 4613 DEBUG_PEEP("expect IFTHEN", scan, depth, flags); 4614 4615 data_fake.last_closep= &fake_last_close; 4616 minlen = *minlenp; 4617 next = regnext(scan); 4618 scan = NEXTOPER(NEXTOPER(scan)); 4619 DEBUG_PEEP("scan", scan, depth, flags); 4620 DEBUG_PEEP("next", next, depth, flags); 4621 4622 /* we suppose the run is continuous, last=next... 4623 * NOTE we dont use the return here! */ 4624 /* DEFINEP study_chunk() recursion */ 4625 (void)study_chunk(pRExC_state, &scan, &minlen, 4626 &deltanext, next, &data_fake, stopparen, 4627 recursed_depth, NULL, f, depth+1, mutate_ok); 4628 4629 scan = next; 4630 } else 4631 if ( 4632 OP(scan) == BRANCH || 4633 OP(scan) == BRANCHJ || 4634 OP(scan) == IFTHEN 4635 ) { 4636 next = regnext(scan); 4637 code = OP(scan); 4638 4639 /* The op(next)==code check below is to see if we 4640 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN" 4641 * IFTHEN is special as it might not appear in pairs. 4642 * Not sure whether BRANCH-BRANCHJ is possible, regardless 4643 * we dont handle it cleanly. */ 4644 if (OP(next) == code || code == IFTHEN) { 4645 /* NOTE - There is similar code to this block below for 4646 * handling TRIE nodes on a re-study. If you change stuff here 4647 * check there too. */ 4648 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; 4649 regnode_ssc accum; 4650 regnode * const startbranch=scan; 4651 4652 if (flags & SCF_DO_SUBSTR) { 4653 /* Cannot merge strings after this. */ 4654 scan_commit(pRExC_state, data, minlenp, is_inf); 4655 } 4656 4657 if (flags & SCF_DO_STCLASS) 4658 ssc_init_zero(pRExC_state, &accum); 4659 4660 while (OP(scan) == code) { 4661 SSize_t deltanext, minnext, fake; 4662 I32 f = 0; 4663 regnode_ssc this_class; 4664 4665 DEBUG_PEEP("Branch", scan, depth, flags); 4666 4667 num++; 4668 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 4669 if (data) { 4670 data_fake.whilem_c = data->whilem_c; 4671 data_fake.last_closep = data->last_closep; 4672 } 4673 else 4674 data_fake.last_closep = &fake; 4675 4676 data_fake.pos_delta = delta; 4677 next = regnext(scan); 4678 4679 scan = NEXTOPER(scan); /* everything */ 4680 if (code != BRANCH) /* everything but BRANCH */ 4681 scan = NEXTOPER(scan); 4682 4683 if (flags & SCF_DO_STCLASS) { 4684 ssc_init(pRExC_state, &this_class); 4685 data_fake.start_class = &this_class; 4686 f = SCF_DO_STCLASS_AND; 4687 } 4688 if (flags & SCF_WHILEM_VISITED_POS) 4689 f |= SCF_WHILEM_VISITED_POS; 4690 4691 /* we suppose the run is continuous, last=next...*/ 4692 /* recurse study_chunk() for each BRANCH in an alternation */ 4693 minnext = study_chunk(pRExC_state, &scan, minlenp, 4694 &deltanext, next, &data_fake, stopparen, 4695 recursed_depth, NULL, f, depth+1, 4696 mutate_ok); 4697 4698 if (min1 > minnext) 4699 min1 = minnext; 4700 if (deltanext == SSize_t_MAX) { 4701 is_inf = is_inf_internal = 1; 4702 max1 = SSize_t_MAX; 4703 } else if (max1 < minnext + deltanext) 4704 max1 = minnext + deltanext; 4705 scan = next; 4706 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 4707 pars++; 4708 if (data_fake.flags & SCF_SEEN_ACCEPT) { 4709 if ( stopmin > minnext) 4710 stopmin = min + min1; 4711 flags &= ~SCF_DO_SUBSTR; 4712 if (data) 4713 data->flags |= SCF_SEEN_ACCEPT; 4714 } 4715 if (data) { 4716 if (data_fake.flags & SF_HAS_EVAL) 4717 data->flags |= SF_HAS_EVAL; 4718 data->whilem_c = data_fake.whilem_c; 4719 } 4720 if (flags & SCF_DO_STCLASS) 4721 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); 4722 } 4723 if (code == IFTHEN && num < 2) /* Empty ELSE branch */ 4724 min1 = 0; 4725 if (flags & SCF_DO_SUBSTR) { 4726 data->pos_min += min1; 4727 if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) 4728 data->pos_delta = SSize_t_MAX; 4729 else 4730 data->pos_delta += max1 - min1; 4731 if (max1 != min1 || is_inf) 4732 data->cur_is_floating = 1; 4733 } 4734 min += min1; 4735 if (delta == SSize_t_MAX 4736 || SSize_t_MAX - delta - (max1 - min1) < 0) 4737 delta = SSize_t_MAX; 4738 else 4739 delta += max1 - min1; 4740 if (flags & SCF_DO_STCLASS_OR) { 4741 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); 4742 if (min1) { 4743 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 4744 flags &= ~SCF_DO_STCLASS; 4745 } 4746 } 4747 else if (flags & SCF_DO_STCLASS_AND) { 4748 if (min1) { 4749 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); 4750 flags &= ~SCF_DO_STCLASS; 4751 } 4752 else { 4753 /* Switch to OR mode: cache the old value of 4754 * data->start_class */ 4755 INIT_AND_WITHP; 4756 StructCopy(data->start_class, and_withp, regnode_ssc); 4757 flags &= ~SCF_DO_STCLASS_AND; 4758 StructCopy(&accum, data->start_class, regnode_ssc); 4759 flags |= SCF_DO_STCLASS_OR; 4760 } 4761 } 4762 4763 if (PERL_ENABLE_TRIE_OPTIMISATION 4764 && OP(startbranch) == BRANCH 4765 && mutate_ok 4766 ) { 4767 /* demq. 4768 4769 Assuming this was/is a branch we are dealing with: 'scan' 4770 now points at the item that follows the branch sequence, 4771 whatever it is. We now start at the beginning of the 4772 sequence and look for subsequences of 4773 4774 BRANCH->EXACT=>x1 4775 BRANCH->EXACT=>x2 4776 tail 4777 4778 which would be constructed from a pattern like 4779 /A|LIST|OF|WORDS/ 4780 4781 If we can find such a subsequence we need to turn the first 4782 element into a trie and then add the subsequent branch exact 4783 strings to the trie. 4784 4785 We have two cases 4786 4787 1. patterns where the whole set of branches can be 4788 converted. 4789 4790 2. patterns where only a subset can be converted. 4791 4792 In case 1 we can replace the whole set with a single regop 4793 for the trie. In case 2 we need to keep the start and end 4794 branches so 4795 4796 'BRANCH EXACT; BRANCH EXACT; BRANCH X' 4797 becomes BRANCH TRIE; BRANCH X; 4798 4799 There is an additional case, that being where there is a 4800 common prefix, which gets split out into an EXACT like node 4801 preceding the TRIE node. 4802 4803 If x(1..n)==tail then we can do a simple trie, if not we make 4804 a "jump" trie, such that when we match the appropriate word 4805 we "jump" to the appropriate tail node. Essentially we turn 4806 a nested if into a case structure of sorts. 4807 4808 */ 4809 4810 int made=0; 4811 if (!re_trie_maxbuff) { 4812 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); 4813 if (!SvIOK(re_trie_maxbuff)) 4814 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 4815 } 4816 if ( SvIV(re_trie_maxbuff)>=0 ) { 4817 regnode *cur; 4818 regnode *first = (regnode *)NULL; 4819 regnode *last = (regnode *)NULL; 4820 regnode *tail = scan; 4821 U8 trietype = 0; 4822 U32 count=0; 4823 4824 /* var tail is used because there may be a TAIL 4825 regop in the way. Ie, the exacts will point to the 4826 thing following the TAIL, but the last branch will 4827 point at the TAIL. So we advance tail. If we 4828 have nested (?:) we may have to move through several 4829 tails. 4830 */ 4831 4832 while ( OP( tail ) == TAIL ) { 4833 /* this is the TAIL generated by (?:) */ 4834 tail = regnext( tail ); 4835 } 4836 4837 4838 DEBUG_TRIE_COMPILE_r({ 4839 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state); 4840 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n", 4841 depth+1, 4842 "Looking for TRIE'able sequences. Tail node is ", 4843 (UV) REGNODE_OFFSET(tail), 4844 SvPV_nolen_const( RExC_mysv ) 4845 ); 4846 }); 4847 4848 /* 4849 4850 Step through the branches 4851 cur represents each branch, 4852 noper is the first thing to be matched as part 4853 of that branch 4854 noper_next is the regnext() of that node. 4855 4856 We normally handle a case like this 4857 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also 4858 support building with NOJUMPTRIE, which restricts 4859 the trie logic to structures like /FOO|BAR/. 4860 4861 If noper is a trieable nodetype then the branch is 4862 a possible optimization target. If we are building 4863 under NOJUMPTRIE then we require that noper_next is 4864 the same as scan (our current position in the regex 4865 program). 4866 4867 Once we have two or more consecutive such branches 4868 we can create a trie of the EXACT's contents and 4869 stitch it in place into the program. 4870 4871 If the sequence represents all of the branches in 4872 the alternation we replace the entire thing with a 4873 single TRIE node. 4874 4875 Otherwise when it is a subsequence we need to 4876 stitch it in place and replace only the relevant 4877 branches. This means the first branch has to remain 4878 as it is used by the alternation logic, and its 4879 next pointer, and needs to be repointed at the item 4880 on the branch chain following the last branch we 4881 have optimized away. 4882 4883 This could be either a BRANCH, in which case the 4884 subsequence is internal, or it could be the item 4885 following the branch sequence in which case the 4886 subsequence is at the end (which does not 4887 necessarily mean the first node is the start of the 4888 alternation). 4889 4890 TRIE_TYPE(X) is a define which maps the optype to a 4891 trietype. 4892 4893 optype | trietype 4894 ----------------+----------- 4895 NOTHING | NOTHING 4896 EXACT | EXACT 4897 EXACT_ONLY8 | EXACT 4898 EXACTFU | EXACTFU 4899 EXACTFU_ONLY8 | EXACTFU 4900 EXACTFUP | EXACTFU 4901 EXACTFAA | EXACTFAA 4902 EXACTL | EXACTL 4903 EXACTFLU8 | EXACTFLU8 4904 4905 4906 */ 4907 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \ 4908 ? NOTHING \ 4909 : ( EXACT == (X) || EXACT_ONLY8 == (X) ) \ 4910 ? EXACT \ 4911 : ( EXACTFU == (X) \ 4912 || EXACTFU_ONLY8 == (X) \ 4913 || EXACTFUP == (X) ) \ 4914 ? EXACTFU \ 4915 : ( EXACTFAA == (X) ) \ 4916 ? EXACTFAA \ 4917 : ( EXACTL == (X) ) \ 4918 ? EXACTL \ 4919 : ( EXACTFLU8 == (X) ) \ 4920 ? EXACTFLU8 \ 4921 : 0 ) 4922 4923 /* dont use tail as the end marker for this traverse */ 4924 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { 4925 regnode * const noper = NEXTOPER( cur ); 4926 U8 noper_type = OP( noper ); 4927 U8 noper_trietype = TRIE_TYPE( noper_type ); 4928 #if defined(DEBUGGING) || defined(NOJUMPTRIE) 4929 regnode * const noper_next = regnext( noper ); 4930 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; 4931 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0; 4932 #endif 4933 4934 DEBUG_TRIE_COMPILE_r({ 4935 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); 4936 Perl_re_indentf( aTHX_ "- %d:%s (%d)", 4937 depth+1, 4938 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) ); 4939 4940 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state); 4941 Perl_re_printf( aTHX_ " -> %d:%s", 4942 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv)); 4943 4944 if ( noper_next ) { 4945 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state); 4946 Perl_re_printf( aTHX_ "\t=> %d:%s\t", 4947 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv)); 4948 } 4949 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n", 4950 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), 4951 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] 4952 ); 4953 }); 4954 4955 /* Is noper a trieable nodetype that can be merged 4956 * with the current trie (if there is one)? */ 4957 if ( noper_trietype 4958 && 4959 ( 4960 ( noper_trietype == NOTHING ) 4961 || ( trietype == NOTHING ) 4962 || ( trietype == noper_trietype ) 4963 ) 4964 #ifdef NOJUMPTRIE 4965 && noper_next >= tail 4966 #endif 4967 && count < U16_MAX) 4968 { 4969 /* Handle mergable triable node Either we are 4970 * the first node in a new trieable sequence, 4971 * in which case we do some bookkeeping, 4972 * otherwise we update the end pointer. */ 4973 if ( !first ) { 4974 first = cur; 4975 if ( noper_trietype == NOTHING ) { 4976 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE) 4977 regnode * const noper_next = regnext( noper ); 4978 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; 4979 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; 4980 #endif 4981 4982 if ( noper_next_trietype ) { 4983 trietype = noper_next_trietype; 4984 } else if (noper_next_type) { 4985 /* a NOTHING regop is 1 regop wide. 4986 * We need at least two for a trie 4987 * so we can't merge this in */ 4988 first = NULL; 4989 } 4990 } else { 4991 trietype = noper_trietype; 4992 } 4993 } else { 4994 if ( trietype == NOTHING ) 4995 trietype = noper_trietype; 4996 last = cur; 4997 } 4998 if (first) 4999 count++; 5000 } /* end handle mergable triable node */ 5001 else { 5002 /* handle unmergable node - 5003 * noper may either be a triable node which can 5004 * not be tried together with the current trie, 5005 * or a non triable node */ 5006 if ( last ) { 5007 /* If last is set and trietype is not 5008 * NOTHING then we have found at least two 5009 * triable branch sequences in a row of a 5010 * similar trietype so we can turn them 5011 * into a trie. If/when we allow NOTHING to 5012 * start a trie sequence this condition 5013 * will be required, and it isn't expensive 5014 * so we leave it in for now. */ 5015 if ( trietype && trietype != NOTHING ) 5016 make_trie( pRExC_state, 5017 startbranch, first, cur, tail, 5018 count, trietype, depth+1 ); 5019 last = NULL; /* note: we clear/update 5020 first, trietype etc below, 5021 so we dont do it here */ 5022 } 5023 if ( noper_trietype 5024 #ifdef NOJUMPTRIE 5025 && noper_next >= tail 5026 #endif 5027 ){ 5028 /* noper is triable, so we can start a new 5029 * trie sequence */ 5030 count = 1; 5031 first = cur; 5032 trietype = noper_trietype; 5033 } else if (first) { 5034 /* if we already saw a first but the 5035 * current node is not triable then we have 5036 * to reset the first information. */ 5037 count = 0; 5038 first = NULL; 5039 trietype = 0; 5040 } 5041 } /* end handle unmergable node */ 5042 } /* loop over branches */ 5043 DEBUG_TRIE_COMPILE_r({ 5044 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); 5045 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ", 5046 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); 5047 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n", 5048 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), 5049 PL_reg_name[trietype] 5050 ); 5051 5052 }); 5053 if ( last && trietype ) { 5054 if ( trietype != NOTHING ) { 5055 /* the last branch of the sequence was part of 5056 * a trie, so we have to construct it here 5057 * outside of the loop */ 5058 made= make_trie( pRExC_state, startbranch, 5059 first, scan, tail, count, 5060 trietype, depth+1 ); 5061 #ifdef TRIE_STUDY_OPT 5062 if ( ((made == MADE_EXACT_TRIE && 5063 startbranch == first) 5064 || ( first_non_open == first )) && 5065 depth==0 ) { 5066 flags |= SCF_TRIE_RESTUDY; 5067 if ( startbranch == first 5068 && scan >= tail ) 5069 { 5070 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; 5071 } 5072 } 5073 #endif 5074 } else { 5075 /* at this point we know whatever we have is a 5076 * NOTHING sequence/branch AND if 'startbranch' 5077 * is 'first' then we can turn the whole thing 5078 * into a NOTHING 5079 */ 5080 if ( startbranch == first ) { 5081 regnode *opt; 5082 /* the entire thing is a NOTHING sequence, 5083 * something like this: (?:|) So we can 5084 * turn it into a plain NOTHING op. */ 5085 DEBUG_TRIE_COMPILE_r({ 5086 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); 5087 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n", 5088 depth+1, 5089 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); 5090 5091 }); 5092 OP(startbranch)= NOTHING; 5093 NEXT_OFF(startbranch)= tail - startbranch; 5094 for ( opt= startbranch + 1; opt < tail ; opt++ ) 5095 OP(opt)= OPTIMIZED; 5096 } 5097 } 5098 } /* end if ( last) */ 5099 } /* TRIE_MAXBUF is non zero */ 5100 5101 } /* do trie */ 5102 5103 } 5104 else if ( code == BRANCHJ ) { /* single branch is optimized. */ 5105 scan = NEXTOPER(NEXTOPER(scan)); 5106 } else /* single branch is optimized. */ 5107 scan = NEXTOPER(scan); 5108 continue; 5109 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) { 5110 I32 paren = 0; 5111 regnode *start = NULL; 5112 regnode *end = NULL; 5113 U32 my_recursed_depth= recursed_depth; 5114 5115 if (OP(scan) != SUSPEND) { /* GOSUB */ 5116 /* Do setup, note this code has side effects beyond 5117 * the rest of this block. Specifically setting 5118 * RExC_recurse[] must happen at least once during 5119 * study_chunk(). */ 5120 paren = ARG(scan); 5121 RExC_recurse[ARG2L(scan)] = scan; 5122 start = REGNODE_p(RExC_open_parens[paren]); 5123 end = REGNODE_p(RExC_close_parens[paren]); 5124 5125 /* NOTE we MUST always execute the above code, even 5126 * if we do nothing with a GOSUB */ 5127 if ( 5128 ( flags & SCF_IN_DEFINE ) 5129 || 5130 ( 5131 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF)) 5132 && 5133 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 ) 5134 ) 5135 ) { 5136 /* no need to do anything here if we are in a define. */ 5137 /* or we are after some kind of infinite construct 5138 * so we can skip recursing into this item. 5139 * Since it is infinite we will not change the maxlen 5140 * or delta, and if we miss something that might raise 5141 * the minlen it will merely pessimise a little. 5142 * 5143 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/ 5144 * might result in a minlen of 1 and not of 4, 5145 * but this doesn't make us mismatch, just try a bit 5146 * harder than we should. 5147 * */ 5148 scan= regnext(scan); 5149 continue; 5150 } 5151 5152 if ( 5153 !recursed_depth 5154 || 5155 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) 5156 ) { 5157 /* it is quite possible that there are more efficient ways 5158 * to do this. We maintain a bitmap per level of recursion 5159 * of which patterns we have entered so we can detect if a 5160 * pattern creates a possible infinite loop. When we 5161 * recurse down a level we copy the previous levels bitmap 5162 * down. When we are at recursion level 0 we zero the top 5163 * level bitmap. It would be nice to implement a different 5164 * more efficient way of doing this. In particular the top 5165 * level bitmap may be unnecessary. 5166 */ 5167 if (!recursed_depth) { 5168 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); 5169 } else { 5170 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), 5171 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), 5172 RExC_study_chunk_recursed_bytes, U8); 5173 } 5174 /* we havent recursed into this paren yet, so recurse into it */ 5175 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf); 5176 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); 5177 my_recursed_depth= recursed_depth + 1; 5178 } else { 5179 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf); 5180 /* some form of infinite recursion, assume infinite length 5181 * */ 5182 if (flags & SCF_DO_SUBSTR) { 5183 scan_commit(pRExC_state, data, minlenp, is_inf); 5184 data->cur_is_floating = 1; 5185 } 5186 is_inf = is_inf_internal = 1; 5187 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 5188 ssc_anything(data->start_class); 5189 flags &= ~SCF_DO_STCLASS; 5190 5191 start= NULL; /* reset start so we dont recurse later on. */ 5192 } 5193 } else { 5194 paren = stopparen; 5195 start = scan + 2; 5196 end = regnext(scan); 5197 } 5198 if (start) { 5199 scan_frame *newframe; 5200 assert(end); 5201 if (!RExC_frame_last) { 5202 Newxz(newframe, 1, scan_frame); 5203 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe); 5204 RExC_frame_head= newframe; 5205 RExC_frame_count++; 5206 } else if (!RExC_frame_last->next_frame) { 5207 Newxz(newframe, 1, scan_frame); 5208 RExC_frame_last->next_frame= newframe; 5209 newframe->prev_frame= RExC_frame_last; 5210 RExC_frame_count++; 5211 } else { 5212 newframe= RExC_frame_last->next_frame; 5213 } 5214 RExC_frame_last= newframe; 5215 5216 newframe->next_regnode = regnext(scan); 5217 newframe->last_regnode = last; 5218 newframe->stopparen = stopparen; 5219 newframe->prev_recursed_depth = recursed_depth; 5220 newframe->this_prev_frame= frame; 5221 newframe->in_gosub = ( 5222 (frame && frame->in_gosub) || OP(scan) == GOSUB 5223 ); 5224 5225 DEBUG_STUDYDATA("frame-new", data, depth, is_inf); 5226 DEBUG_PEEP("fnew", scan, depth, flags); 5227 5228 frame = newframe; 5229 scan = start; 5230 stopparen = paren; 5231 last = end; 5232 depth = depth + 1; 5233 recursed_depth= my_recursed_depth; 5234 5235 continue; 5236 } 5237 } 5238 else if ( OP(scan) == EXACT 5239 || OP(scan) == EXACT_ONLY8 5240 || OP(scan) == EXACTL) 5241 { 5242 SSize_t l = STR_LEN(scan); 5243 UV uc; 5244 assert(l); 5245 if (UTF) { 5246 const U8 * const s = (U8*)STRING(scan); 5247 uc = utf8_to_uvchr_buf(s, s + l, NULL); 5248 l = utf8_length(s, s + l); 5249 } else { 5250 uc = *((U8*)STRING(scan)); 5251 } 5252 min += l; 5253 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ 5254 /* The code below prefers earlier match for fixed 5255 offset, later match for variable offset. */ 5256 if (data->last_end == -1) { /* Update the start info. */ 5257 data->last_start_min = data->pos_min; 5258 data->last_start_max = is_inf 5259 ? SSize_t_MAX : data->pos_min + data->pos_delta; 5260 } 5261 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); 5262 if (UTF) 5263 SvUTF8_on(data->last_found); 5264 { 5265 SV * const sv = data->last_found; 5266 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 5267 mg_find(sv, PERL_MAGIC_utf8) : NULL; 5268 if (mg && mg->mg_len >= 0) 5269 mg->mg_len += utf8_length((U8*)STRING(scan), 5270 (U8*)STRING(scan)+STR_LEN(scan)); 5271 } 5272 data->last_end = data->pos_min + l; 5273 data->pos_min += l; /* As in the first entry. */ 5274 data->flags &= ~SF_BEFORE_EOL; 5275 } 5276 5277 /* ANDing the code point leaves at most it, and not in locale, and 5278 * can't match null string */ 5279 if (flags & SCF_DO_STCLASS_AND) { 5280 ssc_cp_and(data->start_class, uc); 5281 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 5282 ssc_clear_locale(data->start_class); 5283 } 5284 else if (flags & SCF_DO_STCLASS_OR) { 5285 ssc_add_cp(data->start_class, uc); 5286 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 5287 5288 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ 5289 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 5290 } 5291 flags &= ~SCF_DO_STCLASS; 5292 } 5293 else if (PL_regkind[OP(scan)] == EXACT) { 5294 /* But OP != EXACT!, so is EXACTFish */ 5295 SSize_t l = STR_LEN(scan); 5296 const U8 * s = (U8*)STRING(scan); 5297 5298 /* Search for fixed substrings supports EXACT only. */ 5299 if (flags & SCF_DO_SUBSTR) { 5300 assert(data); 5301 scan_commit(pRExC_state, data, minlenp, is_inf); 5302 } 5303 if (UTF) { 5304 l = utf8_length(s, s + l); 5305 } 5306 if (unfolded_multi_char) { 5307 RExC_seen |= REG_UNFOLDED_MULTI_SEEN; 5308 } 5309 min += l - min_subtract; 5310 assert (min >= 0); 5311 delta += min_subtract; 5312 if (flags & SCF_DO_SUBSTR) { 5313 data->pos_min += l - min_subtract; 5314 if (data->pos_min < 0) { 5315 data->pos_min = 0; 5316 } 5317 data->pos_delta += min_subtract; 5318 if (min_subtract) { 5319 data->cur_is_floating = 1; /* float */ 5320 } 5321 } 5322 5323 if (flags & SCF_DO_STCLASS) { 5324 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan); 5325 5326 assert(EXACTF_invlist); 5327 if (flags & SCF_DO_STCLASS_AND) { 5328 if (OP(scan) != EXACTFL) 5329 ssc_clear_locale(data->start_class); 5330 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 5331 ANYOF_POSIXL_ZERO(data->start_class); 5332 ssc_intersection(data->start_class, EXACTF_invlist, FALSE); 5333 } 5334 else { /* SCF_DO_STCLASS_OR */ 5335 ssc_union(data->start_class, EXACTF_invlist, FALSE); 5336 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 5337 5338 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ 5339 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 5340 } 5341 flags &= ~SCF_DO_STCLASS; 5342 SvREFCNT_dec(EXACTF_invlist); 5343 } 5344 } 5345 else if (REGNODE_VARIES(OP(scan))) { 5346 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; 5347 I32 fl = 0, f = flags; 5348 regnode * const oscan = scan; 5349 regnode_ssc this_class; 5350 regnode_ssc *oclass = NULL; 5351 I32 next_is_eval = 0; 5352 5353 switch (PL_regkind[OP(scan)]) { 5354 case WHILEM: /* End of (?:...)* . */ 5355 scan = NEXTOPER(scan); 5356 goto finish; 5357 case PLUS: 5358 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { 5359 next = NEXTOPER(scan); 5360 if ( OP(next) == EXACT 5361 || OP(next) == EXACT_ONLY8 5362 || OP(next) == EXACTL 5363 || (flags & SCF_DO_STCLASS)) 5364 { 5365 mincount = 1; 5366 maxcount = REG_INFTY; 5367 next = regnext(scan); 5368 scan = NEXTOPER(scan); 5369 goto do_curly; 5370 } 5371 } 5372 if (flags & SCF_DO_SUBSTR) 5373 data->pos_min++; 5374 min++; 5375 /* FALLTHROUGH */ 5376 case STAR: 5377 next = NEXTOPER(scan); 5378 5379 /* This temporary node can now be turned into EXACTFU, and 5380 * must, as regexec.c doesn't handle it */ 5381 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) { 5382 OP(next) = EXACTFU; 5383 } 5384 5385 if ( STR_LEN(next) == 1 5386 && isALPHA_A(* STRING(next)) 5387 && ( OP(next) == EXACTFAA 5388 || ( OP(next) == EXACTFU 5389 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))) 5390 && mutate_ok 5391 ) { 5392 /* These differ in just one bit */ 5393 U8 mask = ~ ('A' ^ 'a'); 5394 5395 assert(isALPHA_A(* STRING(next))); 5396 5397 /* Then replace it by an ANYOFM node, with 5398 * the mask set to the complement of the 5399 * bit that differs between upper and lower 5400 * case, and the lowest code point of the 5401 * pair (which the '&' forces) */ 5402 OP(next) = ANYOFM; 5403 ARG_SET(next, *STRING(next) & mask); 5404 FLAGS(next) = mask; 5405 } 5406 5407 if (flags & SCF_DO_STCLASS) { 5408 mincount = 0; 5409 maxcount = REG_INFTY; 5410 next = regnext(scan); 5411 scan = NEXTOPER(scan); 5412 goto do_curly; 5413 } 5414 if (flags & SCF_DO_SUBSTR) { 5415 scan_commit(pRExC_state, data, minlenp, is_inf); 5416 /* Cannot extend fixed substrings */ 5417 data->cur_is_floating = 1; /* float */ 5418 } 5419 is_inf = is_inf_internal = 1; 5420 scan = regnext(scan); 5421 goto optimize_curly_tail; 5422 case CURLY: 5423 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) 5424 && (scan->flags == stopparen)) 5425 { 5426 mincount = 1; 5427 maxcount = 1; 5428 } else { 5429 mincount = ARG1(scan); 5430 maxcount = ARG2(scan); 5431 } 5432 next = regnext(scan); 5433 if (OP(scan) == CURLYX) { 5434 I32 lp = (data ? *(data->last_closep) : 0); 5435 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); 5436 } 5437 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 5438 next_is_eval = (OP(scan) == EVAL); 5439 do_curly: 5440 if (flags & SCF_DO_SUBSTR) { 5441 if (mincount == 0) 5442 scan_commit(pRExC_state, data, minlenp, is_inf); 5443 /* Cannot extend fixed substrings */ 5444 pos_before = data->pos_min; 5445 } 5446 if (data) { 5447 fl = data->flags; 5448 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); 5449 if (is_inf) 5450 data->flags |= SF_IS_INF; 5451 } 5452 if (flags & SCF_DO_STCLASS) { 5453 ssc_init(pRExC_state, &this_class); 5454 oclass = data->start_class; 5455 data->start_class = &this_class; 5456 f |= SCF_DO_STCLASS_AND; 5457 f &= ~SCF_DO_STCLASS_OR; 5458 } 5459 /* Exclude from super-linear cache processing any {n,m} 5460 regops for which the combination of input pos and regex 5461 pos is not enough information to determine if a match 5462 will be possible. 5463 5464 For example, in the regex /foo(bar\s*){4,8}baz/ with the 5465 regex pos at the \s*, the prospects for a match depend not 5466 only on the input position but also on how many (bar\s*) 5467 repeats into the {4,8} we are. */ 5468 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) 5469 f &= ~SCF_WHILEM_VISITED_POS; 5470 5471 /* This will finish on WHILEM, setting scan, or on NULL: */ 5472 /* recurse study_chunk() on loop bodies */ 5473 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, 5474 last, data, stopparen, recursed_depth, NULL, 5475 (mincount == 0 5476 ? (f & ~SCF_DO_SUBSTR) 5477 : f) 5478 , depth+1, mutate_ok); 5479 5480 if (flags & SCF_DO_STCLASS) 5481 data->start_class = oclass; 5482 if (mincount == 0 || minnext == 0) { 5483 if (flags & SCF_DO_STCLASS_OR) { 5484 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); 5485 } 5486 else if (flags & SCF_DO_STCLASS_AND) { 5487 /* Switch to OR mode: cache the old value of 5488 * data->start_class */ 5489 INIT_AND_WITHP; 5490 StructCopy(data->start_class, and_withp, regnode_ssc); 5491 flags &= ~SCF_DO_STCLASS_AND; 5492 StructCopy(&this_class, data->start_class, regnode_ssc); 5493 flags |= SCF_DO_STCLASS_OR; 5494 ANYOF_FLAGS(data->start_class) 5495 |= SSC_MATCHES_EMPTY_STRING; 5496 } 5497 } else { /* Non-zero len */ 5498 if (flags & SCF_DO_STCLASS_OR) { 5499 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); 5500 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 5501 } 5502 else if (flags & SCF_DO_STCLASS_AND) 5503 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); 5504 flags &= ~SCF_DO_STCLASS; 5505 } 5506 if (!scan) /* It was not CURLYX, but CURLY. */ 5507 scan = next; 5508 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR) 5509 /* ? quantifier ok, except for (?{ ... }) */ 5510 && (next_is_eval || !(mincount == 0 && maxcount == 1)) 5511 && (minnext == 0) && (deltanext == 0) 5512 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) 5513 && maxcount <= REG_INFTY/3) /* Complement check for big 5514 count */ 5515 { 5516 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP), 5517 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), 5518 "Quantifier unexpected on zero-length expression " 5519 "in regex m/%" UTF8f "/", 5520 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp, 5521 RExC_precomp))); 5522 } 5523 5524 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext ) 5525 || min >= SSize_t_MAX - minnext * mincount ) 5526 { 5527 FAIL("Regexp out of space"); 5528 } 5529 5530 min += minnext * mincount; 5531 is_inf_internal |= deltanext == SSize_t_MAX 5532 || (maxcount == REG_INFTY && minnext + deltanext > 0); 5533 is_inf |= is_inf_internal; 5534 if (is_inf) { 5535 delta = SSize_t_MAX; 5536 } else { 5537 delta += (minnext + deltanext) * maxcount 5538 - minnext * mincount; 5539 } 5540 /* Try powerful optimization CURLYX => CURLYN. */ 5541 if ( OP(oscan) == CURLYX && data 5542 && data->flags & SF_IN_PAR 5543 && !(data->flags & SF_HAS_EVAL) 5544 && !deltanext && minnext == 1 5545 && mutate_ok 5546 ) { 5547 /* Try to optimize to CURLYN. */ 5548 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; 5549 regnode * const nxt1 = nxt; 5550 #ifdef DEBUGGING 5551 regnode *nxt2; 5552 #endif 5553 5554 /* Skip open. */ 5555 nxt = regnext(nxt); 5556 if (!REGNODE_SIMPLE(OP(nxt)) 5557 && !(PL_regkind[OP(nxt)] == EXACT 5558 && STR_LEN(nxt) == 1)) 5559 goto nogo; 5560 #ifdef DEBUGGING 5561 nxt2 = nxt; 5562 #endif 5563 nxt = regnext(nxt); 5564 if (OP(nxt) != CLOSE) 5565 goto nogo; 5566 if (RExC_open_parens) { 5567 5568 /*open->CURLYM*/ 5569 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan); 5570 5571 /*close->while*/ 5572 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2; 5573 } 5574 /* Now we know that nxt2 is the only contents: */ 5575 oscan->flags = (U8)ARG(nxt); 5576 OP(oscan) = CURLYN; 5577 OP(nxt1) = NOTHING; /* was OPEN. */ 5578 5579 #ifdef DEBUGGING 5580 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 5581 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ 5582 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ 5583 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 5584 OP(nxt + 1) = OPTIMIZED; /* was count. */ 5585 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ 5586 #endif 5587 } 5588 nogo: 5589 5590 /* Try optimization CURLYX => CURLYM. */ 5591 if ( OP(oscan) == CURLYX && data 5592 && !(data->flags & SF_HAS_PAR) 5593 && !(data->flags & SF_HAS_EVAL) 5594 && !deltanext /* atom is fixed width */ 5595 && minnext != 0 /* CURLYM can't handle zero width */ 5596 /* Nor characters whose fold at run-time may be 5597 * multi-character */ 5598 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) 5599 && mutate_ok 5600 ) { 5601 /* XXXX How to optimize if data == 0? */ 5602 /* Optimize to a simpler form. */ 5603 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ 5604 regnode *nxt2; 5605 5606 OP(oscan) = CURLYM; 5607 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ 5608 && (OP(nxt2) != WHILEM)) 5609 nxt = nxt2; 5610 OP(nxt2) = SUCCEED; /* Whas WHILEM */ 5611 /* Need to optimize away parenths. */ 5612 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { 5613 /* Set the parenth number. */ 5614 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ 5615 5616 oscan->flags = (U8)ARG(nxt); 5617 if (RExC_open_parens) { 5618 /*open->CURLYM*/ 5619 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan); 5620 5621 /*close->NOTHING*/ 5622 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2) 5623 + 1; 5624 } 5625 OP(nxt1) = OPTIMIZED; /* was OPEN. */ 5626 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 5627 5628 #ifdef DEBUGGING 5629 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 5630 OP(nxt + 1) = OPTIMIZED; /* was count. */ 5631 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ 5632 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ 5633 #endif 5634 #if 0 5635 while ( nxt1 && (OP(nxt1) != WHILEM)) { 5636 regnode *nnxt = regnext(nxt1); 5637 if (nnxt == nxt) { 5638 if (reg_off_by_arg[OP(nxt1)]) 5639 ARG_SET(nxt1, nxt2 - nxt1); 5640 else if (nxt2 - nxt1 < U16_MAX) 5641 NEXT_OFF(nxt1) = nxt2 - nxt1; 5642 else 5643 OP(nxt) = NOTHING; /* Cannot beautify */ 5644 } 5645 nxt1 = nnxt; 5646 } 5647 #endif 5648 /* Optimize again: */ 5649 /* recurse study_chunk() on optimised CURLYX => CURLYM */ 5650 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, 5651 NULL, stopparen, recursed_depth, NULL, 0, 5652 depth+1, mutate_ok); 5653 } 5654 else 5655 oscan->flags = 0; 5656 } 5657 else if ((OP(oscan) == CURLYX) 5658 && (flags & SCF_WHILEM_VISITED_POS) 5659 /* See the comment on a similar expression above. 5660 However, this time it's not a subexpression 5661 we care about, but the expression itself. */ 5662 && (maxcount == REG_INFTY) 5663 && data) { 5664 /* This stays as CURLYX, we can put the count/of pair. */ 5665 /* Find WHILEM (as in regexec.c) */ 5666 regnode *nxt = oscan + NEXT_OFF(oscan); 5667 5668 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ 5669 nxt += ARG(nxt); 5670 nxt = PREVOPER(nxt); 5671 if (nxt->flags & 0xf) { 5672 /* we've already set whilem count on this node */ 5673 } else if (++data->whilem_c < 16) { 5674 assert(data->whilem_c <= RExC_whilem_seen); 5675 nxt->flags = (U8)(data->whilem_c 5676 | (RExC_whilem_seen << 4)); /* On WHILEM */ 5677 } 5678 } 5679 if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) 5680 pars++; 5681 if (flags & SCF_DO_SUBSTR) { 5682 SV *last_str = NULL; 5683 STRLEN last_chrs = 0; 5684 int counted = mincount != 0; 5685 5686 if (data->last_end > 0 && mincount != 0) { /* Ends with a 5687 string. */ 5688 SSize_t b = pos_before >= data->last_start_min 5689 ? pos_before : data->last_start_min; 5690 STRLEN l; 5691 const char * const s = SvPV_const(data->last_found, l); 5692 SSize_t old = b - data->last_start_min; 5693 assert(old >= 0); 5694 5695 if (UTF) 5696 old = utf8_hop_forward((U8*)s, old, 5697 (U8 *) SvEND(data->last_found)) 5698 - (U8*)s; 5699 l -= old; 5700 /* Get the added string: */ 5701 last_str = newSVpvn_utf8(s + old, l, UTF); 5702 last_chrs = UTF ? utf8_length((U8*)(s + old), 5703 (U8*)(s + old + l)) : l; 5704 if (deltanext == 0 && pos_before == b) { 5705 /* What was added is a constant string */ 5706 if (mincount > 1) { 5707 5708 SvGROW(last_str, (mincount * l) + 1); 5709 repeatcpy(SvPVX(last_str) + l, 5710 SvPVX_const(last_str), l, 5711 mincount - 1); 5712 SvCUR_set(last_str, SvCUR(last_str) * mincount); 5713 /* Add additional parts. */ 5714 SvCUR_set(data->last_found, 5715 SvCUR(data->last_found) - l); 5716 sv_catsv(data->last_found, last_str); 5717 { 5718 SV * sv = data->last_found; 5719 MAGIC *mg = 5720 SvUTF8(sv) && SvMAGICAL(sv) ? 5721 mg_find(sv, PERL_MAGIC_utf8) : NULL; 5722 if (mg && mg->mg_len >= 0) 5723 mg->mg_len += last_chrs * (mincount-1); 5724 } 5725 last_chrs *= mincount; 5726 data->last_end += l * (mincount - 1); 5727 } 5728 } else { 5729 /* start offset must point into the last copy */ 5730 data->last_start_min += minnext * (mincount - 1); 5731 data->last_start_max = 5732 is_inf 5733 ? SSize_t_MAX 5734 : data->last_start_max + 5735 (maxcount - 1) * (minnext + data->pos_delta); 5736 } 5737 } 5738 /* It is counted once already... */ 5739 data->pos_min += minnext * (mincount - counted); 5740 #if 0 5741 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf 5742 " SSize_t_MAX=%" UVuf " minnext=%" UVuf 5743 " maxcount=%" UVuf " mincount=%" UVuf "\n", 5744 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, 5745 (UV)mincount); 5746 if (deltanext != SSize_t_MAX) 5747 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", 5748 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount 5749 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); 5750 #endif 5751 if (deltanext == SSize_t_MAX 5752 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) 5753 data->pos_delta = SSize_t_MAX; 5754 else 5755 data->pos_delta += - counted * deltanext + 5756 (minnext + deltanext) * maxcount - minnext * mincount; 5757 if (mincount != maxcount) { 5758 /* Cannot extend fixed substrings found inside 5759 the group. */ 5760 scan_commit(pRExC_state, data, minlenp, is_inf); 5761 if (mincount && last_str) { 5762 SV * const sv = data->last_found; 5763 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 5764 mg_find(sv, PERL_MAGIC_utf8) : NULL; 5765 5766 if (mg) 5767 mg->mg_len = -1; 5768 sv_setsv(sv, last_str); 5769 data->last_end = data->pos_min; 5770 data->last_start_min = data->pos_min - last_chrs; 5771 data->last_start_max = is_inf 5772 ? SSize_t_MAX 5773 : data->pos_min + data->pos_delta - last_chrs; 5774 } 5775 data->cur_is_floating = 1; /* float */ 5776 } 5777 SvREFCNT_dec(last_str); 5778 } 5779 if (data && (fl & SF_HAS_EVAL)) 5780 data->flags |= SF_HAS_EVAL; 5781 optimize_curly_tail: 5782 rck_elide_nothing(oscan); 5783 continue; 5784 5785 default: 5786 #ifdef DEBUGGING 5787 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", 5788 OP(scan)); 5789 #endif 5790 case REF: 5791 case CLUMP: 5792 if (flags & SCF_DO_SUBSTR) { 5793 /* Cannot expect anything... */ 5794 scan_commit(pRExC_state, data, minlenp, is_inf); 5795 data->cur_is_floating = 1; /* float */ 5796 } 5797 is_inf = is_inf_internal = 1; 5798 if (flags & SCF_DO_STCLASS_OR) { 5799 if (OP(scan) == CLUMP) { 5800 /* Actually is any start char, but very few code points 5801 * aren't start characters */ 5802 ssc_match_all_cp(data->start_class); 5803 } 5804 else { 5805 ssc_anything(data->start_class); 5806 } 5807 } 5808 flags &= ~SCF_DO_STCLASS; 5809 break; 5810 } 5811 } 5812 else if (OP(scan) == LNBREAK) { 5813 if (flags & SCF_DO_STCLASS) { 5814 if (flags & SCF_DO_STCLASS_AND) { 5815 ssc_intersection(data->start_class, 5816 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); 5817 ssc_clear_locale(data->start_class); 5818 ANYOF_FLAGS(data->start_class) 5819 &= ~SSC_MATCHES_EMPTY_STRING; 5820 } 5821 else if (flags & SCF_DO_STCLASS_OR) { 5822 ssc_union(data->start_class, 5823 PL_XPosix_ptrs[_CC_VERTSPACE], 5824 FALSE); 5825 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 5826 5827 /* See commit msg for 5828 * 749e076fceedeb708a624933726e7989f2302f6a */ 5829 ANYOF_FLAGS(data->start_class) 5830 &= ~SSC_MATCHES_EMPTY_STRING; 5831 } 5832 flags &= ~SCF_DO_STCLASS; 5833 } 5834 min++; 5835 if (delta != SSize_t_MAX) 5836 delta++; /* Because of the 2 char string cr-lf */ 5837 if (flags & SCF_DO_SUBSTR) { 5838 /* Cannot expect anything... */ 5839 scan_commit(pRExC_state, data, minlenp, is_inf); 5840 data->pos_min += 1; 5841 if (data->pos_delta != SSize_t_MAX) { 5842 data->pos_delta += 1; 5843 } 5844 data->cur_is_floating = 1; /* float */ 5845 } 5846 } 5847 else if (REGNODE_SIMPLE(OP(scan))) { 5848 5849 if (flags & SCF_DO_SUBSTR) { 5850 scan_commit(pRExC_state, data, minlenp, is_inf); 5851 data->pos_min++; 5852 } 5853 min++; 5854 if (flags & SCF_DO_STCLASS) { 5855 bool invert = 0; 5856 SV* my_invlist = NULL; 5857 U8 namedclass; 5858 5859 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ 5860 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 5861 5862 /* Some of the logic below assumes that switching 5863 locale on will only add false positives. */ 5864 switch (OP(scan)) { 5865 5866 default: 5867 #ifdef DEBUGGING 5868 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", 5869 OP(scan)); 5870 #endif 5871 case SANY: 5872 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 5873 ssc_match_all_cp(data->start_class); 5874 break; 5875 5876 case REG_ANY: 5877 { 5878 SV* REG_ANY_invlist = _new_invlist(2); 5879 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, 5880 '\n'); 5881 if (flags & SCF_DO_STCLASS_OR) { 5882 ssc_union(data->start_class, 5883 REG_ANY_invlist, 5884 TRUE /* TRUE => invert, hence all but \n 5885 */ 5886 ); 5887 } 5888 else if (flags & SCF_DO_STCLASS_AND) { 5889 ssc_intersection(data->start_class, 5890 REG_ANY_invlist, 5891 TRUE /* TRUE => invert */ 5892 ); 5893 ssc_clear_locale(data->start_class); 5894 } 5895 SvREFCNT_dec_NN(REG_ANY_invlist); 5896 } 5897 break; 5898 5899 case ANYOFD: 5900 case ANYOFL: 5901 case ANYOFPOSIXL: 5902 case ANYOFH: 5903 case ANYOF: 5904 if (flags & SCF_DO_STCLASS_AND) 5905 ssc_and(pRExC_state, data->start_class, 5906 (regnode_charclass *) scan); 5907 else 5908 ssc_or(pRExC_state, data->start_class, 5909 (regnode_charclass *) scan); 5910 break; 5911 5912 case NANYOFM: 5913 case ANYOFM: 5914 { 5915 SV* cp_list = get_ANYOFM_contents(scan); 5916 5917 if (flags & SCF_DO_STCLASS_OR) { 5918 ssc_union(data->start_class, cp_list, invert); 5919 } 5920 else if (flags & SCF_DO_STCLASS_AND) { 5921 ssc_intersection(data->start_class, cp_list, invert); 5922 } 5923 5924 SvREFCNT_dec_NN(cp_list); 5925 break; 5926 } 5927 5928 case NPOSIXL: 5929 invert = 1; 5930 /* FALLTHROUGH */ 5931 5932 case POSIXL: 5933 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; 5934 if (flags & SCF_DO_STCLASS_AND) { 5935 bool was_there = cBOOL( 5936 ANYOF_POSIXL_TEST(data->start_class, 5937 namedclass)); 5938 ANYOF_POSIXL_ZERO(data->start_class); 5939 if (was_there) { /* Do an AND */ 5940 ANYOF_POSIXL_SET(data->start_class, namedclass); 5941 } 5942 /* No individual code points can now match */ 5943 data->start_class->invlist 5944 = sv_2mortal(_new_invlist(0)); 5945 } 5946 else { 5947 int complement = namedclass + ((invert) ? -1 : 1); 5948 5949 assert(flags & SCF_DO_STCLASS_OR); 5950 5951 /* If the complement of this class was already there, 5952 * the result is that they match all code points, 5953 * (\d + \D == everything). Remove the classes from 5954 * future consideration. Locale is not relevant in 5955 * this case */ 5956 if (ANYOF_POSIXL_TEST(data->start_class, complement)) { 5957 ssc_match_all_cp(data->start_class); 5958 ANYOF_POSIXL_CLEAR(data->start_class, namedclass); 5959 ANYOF_POSIXL_CLEAR(data->start_class, complement); 5960 } 5961 else { /* The usual case; just add this class to the 5962 existing set */ 5963 ANYOF_POSIXL_SET(data->start_class, namedclass); 5964 } 5965 } 5966 break; 5967 5968 case NPOSIXA: /* For these, we always know the exact set of 5969 what's matched */ 5970 invert = 1; 5971 /* FALLTHROUGH */ 5972 case POSIXA: 5973 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL); 5974 goto join_posix_and_ascii; 5975 5976 case NPOSIXD: 5977 case NPOSIXU: 5978 invert = 1; 5979 /* FALLTHROUGH */ 5980 case POSIXD: 5981 case POSIXU: 5982 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL); 5983 5984 /* NPOSIXD matches all upper Latin1 code points unless the 5985 * target string being matched is UTF-8, which is 5986 * unknowable until match time. Since we are going to 5987 * invert, we want to get rid of all of them so that the 5988 * inversion will match all */ 5989 if (OP(scan) == NPOSIXD) { 5990 _invlist_subtract(my_invlist, PL_UpperLatin1, 5991 &my_invlist); 5992 } 5993 5994 join_posix_and_ascii: 5995 5996 if (flags & SCF_DO_STCLASS_AND) { 5997 ssc_intersection(data->start_class, my_invlist, invert); 5998 ssc_clear_locale(data->start_class); 5999 } 6000 else { 6001 assert(flags & SCF_DO_STCLASS_OR); 6002 ssc_union(data->start_class, my_invlist, invert); 6003 } 6004 SvREFCNT_dec(my_invlist); 6005 } 6006 if (flags & SCF_DO_STCLASS_OR) 6007 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 6008 flags &= ~SCF_DO_STCLASS; 6009 } 6010 } 6011 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { 6012 data->flags |= (OP(scan) == MEOL 6013 ? SF_BEFORE_MEOL 6014 : SF_BEFORE_SEOL); 6015 scan_commit(pRExC_state, data, minlenp, is_inf); 6016 6017 } 6018 else if ( PL_regkind[OP(scan)] == BRANCHJ 6019 /* Lookbehind, or need to calculate parens/evals/stclass: */ 6020 && (scan->flags || data || (flags & SCF_DO_STCLASS)) 6021 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) 6022 { 6023 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY 6024 || OP(scan) == UNLESSM ) 6025 { 6026 /* Negative Lookahead/lookbehind 6027 In this case we can't do fixed string optimisation. 6028 */ 6029 6030 SSize_t deltanext, minnext, fake = 0; 6031 regnode *nscan; 6032 regnode_ssc intrnl; 6033 int f = 0; 6034 6035 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 6036 if (data) { 6037 data_fake.whilem_c = data->whilem_c; 6038 data_fake.last_closep = data->last_closep; 6039 } 6040 else 6041 data_fake.last_closep = &fake; 6042 data_fake.pos_delta = delta; 6043 if ( flags & SCF_DO_STCLASS && !scan->flags 6044 && OP(scan) == IFMATCH ) { /* Lookahead */ 6045 ssc_init(pRExC_state, &intrnl); 6046 data_fake.start_class = &intrnl; 6047 f |= SCF_DO_STCLASS_AND; 6048 } 6049 if (flags & SCF_WHILEM_VISITED_POS) 6050 f |= SCF_WHILEM_VISITED_POS; 6051 next = regnext(scan); 6052 nscan = NEXTOPER(NEXTOPER(scan)); 6053 6054 /* recurse study_chunk() for lookahead body */ 6055 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, 6056 last, &data_fake, stopparen, 6057 recursed_depth, NULL, f, depth+1, 6058 mutate_ok); 6059 if (scan->flags) { 6060 if ( deltanext < 0 6061 || deltanext > (I32) U8_MAX 6062 || minnext > (I32)U8_MAX 6063 || minnext + deltanext > (I32)U8_MAX) 6064 { 6065 FAIL2("Lookbehind longer than %" UVuf " not implemented", 6066 (UV)U8_MAX); 6067 } 6068 6069 /* The 'next_off' field has been repurposed to count the 6070 * additional starting positions to try beyond the initial 6071 * one. (This leaves it at 0 for non-variable length 6072 * matches to avoid breakage for those not using this 6073 * extension) */ 6074 if (deltanext) { 6075 scan->next_off = deltanext; 6076 ckWARNexperimental(RExC_parse, 6077 WARN_EXPERIMENTAL__VLB, 6078 "Variable length lookbehind is experimental"); 6079 } 6080 scan->flags = (U8)minnext + deltanext; 6081 } 6082 if (data) { 6083 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 6084 pars++; 6085 if (data_fake.flags & SF_HAS_EVAL) 6086 data->flags |= SF_HAS_EVAL; 6087 data->whilem_c = data_fake.whilem_c; 6088 } 6089 if (f & SCF_DO_STCLASS_AND) { 6090 if (flags & SCF_DO_STCLASS_OR) { 6091 /* OR before, AND after: ideally we would recurse with 6092 * data_fake to get the AND applied by study of the 6093 * remainder of the pattern, and then derecurse; 6094 * *** HACK *** for now just treat as "no information". 6095 * See [perl #56690]. 6096 */ 6097 ssc_init(pRExC_state, data->start_class); 6098 } else { 6099 /* AND before and after: combine and continue. These 6100 * assertions are zero-length, so can match an EMPTY 6101 * string */ 6102 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); 6103 ANYOF_FLAGS(data->start_class) 6104 |= SSC_MATCHES_EMPTY_STRING; 6105 } 6106 } 6107 } 6108 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY 6109 else { 6110 /* Positive Lookahead/lookbehind 6111 In this case we can do fixed string optimisation, 6112 but we must be careful about it. Note in the case of 6113 lookbehind the positions will be offset by the minimum 6114 length of the pattern, something we won't know about 6115 until after the recurse. 6116 */ 6117 SSize_t deltanext, fake = 0; 6118 regnode *nscan; 6119 regnode_ssc intrnl; 6120 int f = 0; 6121 /* We use SAVEFREEPV so that when the full compile 6122 is finished perl will clean up the allocated 6123 minlens when it's all done. This way we don't 6124 have to worry about freeing them when we know 6125 they wont be used, which would be a pain. 6126 */ 6127 SSize_t *minnextp; 6128 Newx( minnextp, 1, SSize_t ); 6129 SAVEFREEPV(minnextp); 6130 6131 if (data) { 6132 StructCopy(data, &data_fake, scan_data_t); 6133 if ((flags & SCF_DO_SUBSTR) && data->last_found) { 6134 f |= SCF_DO_SUBSTR; 6135 if (scan->flags) 6136 scan_commit(pRExC_state, &data_fake, minlenp, is_inf); 6137 data_fake.last_found=newSVsv(data->last_found); 6138 } 6139 } 6140 else 6141 data_fake.last_closep = &fake; 6142 data_fake.flags = 0; 6143 data_fake.substrs[0].flags = 0; 6144 data_fake.substrs[1].flags = 0; 6145 data_fake.pos_delta = delta; 6146 if (is_inf) 6147 data_fake.flags |= SF_IS_INF; 6148 if ( flags & SCF_DO_STCLASS && !scan->flags 6149 && OP(scan) == IFMATCH ) { /* Lookahead */ 6150 ssc_init(pRExC_state, &intrnl); 6151 data_fake.start_class = &intrnl; 6152 f |= SCF_DO_STCLASS_AND; 6153 } 6154 if (flags & SCF_WHILEM_VISITED_POS) 6155 f |= SCF_WHILEM_VISITED_POS; 6156 next = regnext(scan); 6157 nscan = NEXTOPER(NEXTOPER(scan)); 6158 6159 /* positive lookahead study_chunk() recursion */ 6160 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, 6161 &deltanext, last, &data_fake, 6162 stopparen, recursed_depth, NULL, 6163 f, depth+1, mutate_ok); 6164 if (scan->flags) { 6165 assert(0); /* This code has never been tested since this 6166 is normally not compiled */ 6167 if ( deltanext < 0 6168 || deltanext > (I32) U8_MAX 6169 || *minnextp > (I32)U8_MAX 6170 || *minnextp + deltanext > (I32)U8_MAX) 6171 { 6172 FAIL2("Lookbehind longer than %" UVuf " not implemented", 6173 (UV)U8_MAX); 6174 } 6175 6176 if (deltanext) { 6177 scan->next_off = deltanext; 6178 } 6179 scan->flags = (U8)*minnextp + deltanext; 6180 } 6181 6182 *minnextp += min; 6183 6184 if (f & SCF_DO_STCLASS_AND) { 6185 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); 6186 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING; 6187 } 6188 if (data) { 6189 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 6190 pars++; 6191 if (data_fake.flags & SF_HAS_EVAL) 6192 data->flags |= SF_HAS_EVAL; 6193 data->whilem_c = data_fake.whilem_c; 6194 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { 6195 int i; 6196 if (RExC_rx->minlen<*minnextp) 6197 RExC_rx->minlen=*minnextp; 6198 scan_commit(pRExC_state, &data_fake, minnextp, is_inf); 6199 SvREFCNT_dec_NN(data_fake.last_found); 6200 6201 for (i = 0; i < 2; i++) { 6202 if (data_fake.substrs[i].minlenp != minlenp) { 6203 data->substrs[i].min_offset = 6204 data_fake.substrs[i].min_offset; 6205 data->substrs[i].max_offset = 6206 data_fake.substrs[i].max_offset; 6207 data->substrs[i].minlenp = 6208 data_fake.substrs[i].minlenp; 6209 data->substrs[i].lookbehind += scan->flags; 6210 } 6211 } 6212 } 6213 } 6214 } 6215 #endif 6216 } 6217 6218 else if (OP(scan) == OPEN) { 6219 if (stopparen != (I32)ARG(scan)) 6220 pars++; 6221 } 6222 else if (OP(scan) == CLOSE) { 6223 if (stopparen == (I32)ARG(scan)) { 6224 break; 6225 } 6226 if ((I32)ARG(scan) == is_par) { 6227 next = regnext(scan); 6228 6229 if ( next && (OP(next) != WHILEM) && next < last) 6230 is_par = 0; /* Disable optimization */ 6231 } 6232 if (data) 6233 *(data->last_closep) = ARG(scan); 6234 } 6235 else if (OP(scan) == EVAL) { 6236 if (data) 6237 data->flags |= SF_HAS_EVAL; 6238 } 6239 else if ( PL_regkind[OP(scan)] == ENDLIKE ) { 6240 if (flags & SCF_DO_SUBSTR) { 6241 scan_commit(pRExC_state, data, minlenp, is_inf); 6242 flags &= ~SCF_DO_SUBSTR; 6243 } 6244 if (data && OP(scan)==ACCEPT) { 6245 data->flags |= SCF_SEEN_ACCEPT; 6246 if (stopmin > min) 6247 stopmin = min; 6248 } 6249 } 6250 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ 6251 { 6252 if (flags & SCF_DO_SUBSTR) { 6253 scan_commit(pRExC_state, data, minlenp, is_inf); 6254 data->cur_is_floating = 1; /* float */ 6255 } 6256 is_inf = is_inf_internal = 1; 6257 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 6258 ssc_anything(data->start_class); 6259 flags &= ~SCF_DO_STCLASS; 6260 } 6261 else if (OP(scan) == GPOS) { 6262 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && 6263 !(delta || is_inf || (data && data->pos_delta))) 6264 { 6265 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) 6266 RExC_rx->intflags |= PREGf_ANCH_GPOS; 6267 if (RExC_rx->gofs < (STRLEN)min) 6268 RExC_rx->gofs = min; 6269 } else { 6270 RExC_rx->intflags |= PREGf_GPOS_FLOAT; 6271 RExC_rx->gofs = 0; 6272 } 6273 } 6274 #ifdef TRIE_STUDY_OPT 6275 #ifdef FULL_TRIE_STUDY 6276 else if (PL_regkind[OP(scan)] == TRIE) { 6277 /* NOTE - There is similar code to this block above for handling 6278 BRANCH nodes on the initial study. If you change stuff here 6279 check there too. */ 6280 regnode *trie_node= scan; 6281 regnode *tail= regnext(scan); 6282 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; 6283 SSize_t max1 = 0, min1 = SSize_t_MAX; 6284 regnode_ssc accum; 6285 6286 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ 6287 /* Cannot merge strings after this. */ 6288 scan_commit(pRExC_state, data, minlenp, is_inf); 6289 } 6290 if (flags & SCF_DO_STCLASS) 6291 ssc_init_zero(pRExC_state, &accum); 6292 6293 if (!trie->jump) { 6294 min1= trie->minlen; 6295 max1= trie->maxlen; 6296 } else { 6297 const regnode *nextbranch= NULL; 6298 U32 word; 6299 6300 for ( word=1 ; word <= trie->wordcount ; word++) 6301 { 6302 SSize_t deltanext=0, minnext=0, f = 0, fake; 6303 regnode_ssc this_class; 6304 6305 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 6306 if (data) { 6307 data_fake.whilem_c = data->whilem_c; 6308 data_fake.last_closep = data->last_closep; 6309 } 6310 else 6311 data_fake.last_closep = &fake; 6312 data_fake.pos_delta = delta; 6313 if (flags & SCF_DO_STCLASS) { 6314 ssc_init(pRExC_state, &this_class); 6315 data_fake.start_class = &this_class; 6316 f = SCF_DO_STCLASS_AND; 6317 } 6318 if (flags & SCF_WHILEM_VISITED_POS) 6319 f |= SCF_WHILEM_VISITED_POS; 6320 6321 if (trie->jump[word]) { 6322 if (!nextbranch) 6323 nextbranch = trie_node + trie->jump[0]; 6324 scan= trie_node + trie->jump[word]; 6325 /* We go from the jump point to the branch that follows 6326 it. Note this means we need the vestigal unused 6327 branches even though they arent otherwise used. */ 6328 /* optimise study_chunk() for TRIE */ 6329 minnext = study_chunk(pRExC_state, &scan, minlenp, 6330 &deltanext, (regnode *)nextbranch, &data_fake, 6331 stopparen, recursed_depth, NULL, f, depth+1, 6332 mutate_ok); 6333 } 6334 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) 6335 nextbranch= regnext((regnode*)nextbranch); 6336 6337 if (min1 > (SSize_t)(minnext + trie->minlen)) 6338 min1 = minnext + trie->minlen; 6339 if (deltanext == SSize_t_MAX) { 6340 is_inf = is_inf_internal = 1; 6341 max1 = SSize_t_MAX; 6342 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) 6343 max1 = minnext + deltanext + trie->maxlen; 6344 6345 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 6346 pars++; 6347 if (data_fake.flags & SCF_SEEN_ACCEPT) { 6348 if ( stopmin > min + min1) 6349 stopmin = min + min1; 6350 flags &= ~SCF_DO_SUBSTR; 6351 if (data) 6352 data->flags |= SCF_SEEN_ACCEPT; 6353 } 6354 if (data) { 6355 if (data_fake.flags & SF_HAS_EVAL) 6356 data->flags |= SF_HAS_EVAL; 6357 data->whilem_c = data_fake.whilem_c; 6358 } 6359 if (flags & SCF_DO_STCLASS) 6360 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); 6361 } 6362 } 6363 if (flags & SCF_DO_SUBSTR) { 6364 data->pos_min += min1; 6365 data->pos_delta += max1 - min1; 6366 if (max1 != min1 || is_inf) 6367 data->cur_is_floating = 1; /* float */ 6368 } 6369 min += min1; 6370 if (delta != SSize_t_MAX) { 6371 if (SSize_t_MAX - (max1 - min1) >= delta) 6372 delta += max1 - min1; 6373 else 6374 delta = SSize_t_MAX; 6375 } 6376 if (flags & SCF_DO_STCLASS_OR) { 6377 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); 6378 if (min1) { 6379 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 6380 flags &= ~SCF_DO_STCLASS; 6381 } 6382 } 6383 else if (flags & SCF_DO_STCLASS_AND) { 6384 if (min1) { 6385 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); 6386 flags &= ~SCF_DO_STCLASS; 6387 } 6388 else { 6389 /* Switch to OR mode: cache the old value of 6390 * data->start_class */ 6391 INIT_AND_WITHP; 6392 StructCopy(data->start_class, and_withp, regnode_ssc); 6393 flags &= ~SCF_DO_STCLASS_AND; 6394 StructCopy(&accum, data->start_class, regnode_ssc); 6395 flags |= SCF_DO_STCLASS_OR; 6396 } 6397 } 6398 scan= tail; 6399 continue; 6400 } 6401 #else 6402 else if (PL_regkind[OP(scan)] == TRIE) { 6403 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; 6404 U8*bang=NULL; 6405 6406 min += trie->minlen; 6407 delta += (trie->maxlen - trie->minlen); 6408 flags &= ~SCF_DO_STCLASS; /* xxx */ 6409 if (flags & SCF_DO_SUBSTR) { 6410 /* Cannot expect anything... */ 6411 scan_commit(pRExC_state, data, minlenp, is_inf); 6412 data->pos_min += trie->minlen; 6413 data->pos_delta += (trie->maxlen - trie->minlen); 6414 if (trie->maxlen != trie->minlen) 6415 data->cur_is_floating = 1; /* float */ 6416 } 6417 if (trie->jump) /* no more substrings -- for now /grr*/ 6418 flags &= ~SCF_DO_SUBSTR; 6419 } 6420 #endif /* old or new */ 6421 #endif /* TRIE_STUDY_OPT */ 6422 6423 /* Else: zero-length, ignore. */ 6424 scan = regnext(scan); 6425 } 6426 6427 finish: 6428 if (frame) { 6429 /* we need to unwind recursion. */ 6430 depth = depth - 1; 6431 6432 DEBUG_STUDYDATA("frame-end", data, depth, is_inf); 6433 DEBUG_PEEP("fend", scan, depth, flags); 6434 6435 /* restore previous context */ 6436 last = frame->last_regnode; 6437 scan = frame->next_regnode; 6438 stopparen = frame->stopparen; 6439 recursed_depth = frame->prev_recursed_depth; 6440 6441 RExC_frame_last = frame->prev_frame; 6442 frame = frame->this_prev_frame; 6443 goto fake_study_recurse; 6444 } 6445 6446 assert(!frame); 6447 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf); 6448 6449 *scanp = scan; 6450 *deltap = is_inf_internal ? SSize_t_MAX : delta; 6451 6452 if (flags & SCF_DO_SUBSTR && is_inf) 6453 data->pos_delta = SSize_t_MAX - data->pos_min; 6454 if (is_par > (I32)U8_MAX) 6455 is_par = 0; 6456 if (is_par && pars==1 && data) { 6457 data->flags |= SF_IN_PAR; 6458 data->flags &= ~SF_HAS_PAR; 6459 } 6460 else if (pars && data) { 6461 data->flags |= SF_HAS_PAR; 6462 data->flags &= ~SF_IN_PAR; 6463 } 6464 if (flags & SCF_DO_STCLASS_OR) 6465 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 6466 if (flags & SCF_TRIE_RESTUDY) 6467 data->flags |= SCF_TRIE_RESTUDY; 6468 6469 DEBUG_STUDYDATA("post-fin", data, depth, is_inf); 6470 6471 { 6472 SSize_t final_minlen= min < stopmin ? min : stopmin; 6473 6474 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) { 6475 if (final_minlen > SSize_t_MAX - delta) 6476 RExC_maxlen = SSize_t_MAX; 6477 else if (RExC_maxlen < final_minlen + delta) 6478 RExC_maxlen = final_minlen + delta; 6479 } 6480 return final_minlen; 6481 } 6482 NOT_REACHED; /* NOTREACHED */ 6483 } 6484 6485 STATIC U32 6486 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) 6487 { 6488 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; 6489 6490 PERL_ARGS_ASSERT_ADD_DATA; 6491 6492 Renewc(RExC_rxi->data, 6493 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), 6494 char, struct reg_data); 6495 if(count) 6496 Renew(RExC_rxi->data->what, count + n, U8); 6497 else 6498 Newx(RExC_rxi->data->what, n, U8); 6499 RExC_rxi->data->count = count + n; 6500 Copy(s, RExC_rxi->data->what + count, n, U8); 6501 return count; 6502 } 6503 6504 /*XXX: todo make this not included in a non debugging perl, but appears to be 6505 * used anyway there, in 'use re' */ 6506 #ifndef PERL_IN_XSUB_RE 6507 void 6508 Perl_reginitcolors(pTHX) 6509 { 6510 const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); 6511 if (s) { 6512 char *t = savepv(s); 6513 int i = 0; 6514 PL_colors[0] = t; 6515 while (++i < 6) { 6516 t = strchr(t, '\t'); 6517 if (t) { 6518 *t = '\0'; 6519 PL_colors[i] = ++t; 6520 } 6521 else 6522 PL_colors[i] = t = (char *)""; 6523 } 6524 } else { 6525 int i = 0; 6526 while (i < 6) 6527 PL_colors[i++] = (char *)""; 6528 } 6529 PL_colorset = 1; 6530 } 6531 #endif 6532 6533 6534 #ifdef TRIE_STUDY_OPT 6535 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \ 6536 STMT_START { \ 6537 if ( \ 6538 (data.flags & SCF_TRIE_RESTUDY) \ 6539 && ! restudied++ \ 6540 ) { \ 6541 dOsomething; \ 6542 goto reStudy; \ 6543 } \ 6544 } STMT_END 6545 #else 6546 #define CHECK_RESTUDY_GOTO_butfirst 6547 #endif 6548 6549 /* 6550 * pregcomp - compile a regular expression into internal code 6551 * 6552 * Decides which engine's compiler to call based on the hint currently in 6553 * scope 6554 */ 6555 6556 #ifndef PERL_IN_XSUB_RE 6557 6558 /* return the currently in-scope regex engine (or the default if none) */ 6559 6560 regexp_engine const * 6561 Perl_current_re_engine(pTHX) 6562 { 6563 if (IN_PERL_COMPILETIME) { 6564 HV * const table = GvHV(PL_hintgv); 6565 SV **ptr; 6566 6567 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) 6568 return &PL_core_reg_engine; 6569 ptr = hv_fetchs(table, "regcomp", FALSE); 6570 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) 6571 return &PL_core_reg_engine; 6572 return INT2PTR(regexp_engine*, SvIV(*ptr)); 6573 } 6574 else { 6575 SV *ptr; 6576 if (!PL_curcop->cop_hints_hash) 6577 return &PL_core_reg_engine; 6578 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); 6579 if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) 6580 return &PL_core_reg_engine; 6581 return INT2PTR(regexp_engine*, SvIV(ptr)); 6582 } 6583 } 6584 6585 6586 REGEXP * 6587 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) 6588 { 6589 regexp_engine const *eng = current_re_engine(); 6590 GET_RE_DEBUG_FLAGS_DECL; 6591 6592 PERL_ARGS_ASSERT_PREGCOMP; 6593 6594 /* Dispatch a request to compile a regexp to correct regexp engine. */ 6595 DEBUG_COMPILE_r({ 6596 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n", 6597 PTR2UV(eng)); 6598 }); 6599 return CALLREGCOMP_ENG(eng, pattern, flags); 6600 } 6601 #endif 6602 6603 /* public(ish) entry point for the perl core's own regex compiling code. 6604 * It's actually a wrapper for Perl_re_op_compile that only takes an SV 6605 * pattern rather than a list of OPs, and uses the internal engine rather 6606 * than the current one */ 6607 6608 REGEXP * 6609 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) 6610 { 6611 SV *pat = pattern; /* defeat constness! */ 6612 PERL_ARGS_ASSERT_RE_COMPILE; 6613 return Perl_re_op_compile(aTHX_ &pat, 1, NULL, 6614 #ifdef PERL_IN_XSUB_RE 6615 &my_reg_engine, 6616 #else 6617 &PL_core_reg_engine, 6618 #endif 6619 NULL, NULL, rx_flags, 0); 6620 } 6621 6622 6623 static void 6624 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs) 6625 { 6626 int n; 6627 6628 if (--cbs->refcnt > 0) 6629 return; 6630 for (n = 0; n < cbs->count; n++) { 6631 REGEXP *rx = cbs->cb[n].src_regex; 6632 if (rx) { 6633 cbs->cb[n].src_regex = NULL; 6634 SvREFCNT_dec_NN(rx); 6635 } 6636 } 6637 Safefree(cbs->cb); 6638 Safefree(cbs); 6639 } 6640 6641 6642 static struct reg_code_blocks * 6643 S_alloc_code_blocks(pTHX_ int ncode) 6644 { 6645 struct reg_code_blocks *cbs; 6646 Newx(cbs, 1, struct reg_code_blocks); 6647 cbs->count = ncode; 6648 cbs->refcnt = 1; 6649 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs); 6650 if (ncode) 6651 Newx(cbs->cb, ncode, struct reg_code_block); 6652 else 6653 cbs->cb = NULL; 6654 return cbs; 6655 } 6656 6657 6658 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code 6659 * blocks, recalculate the indices. Update pat_p and plen_p in-place to 6660 * point to the realloced string and length. 6661 * 6662 * This is essentially a copy of Perl_bytes_to_utf8() with the code index 6663 * stuff added */ 6664 6665 static void 6666 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, 6667 char **pat_p, STRLEN *plen_p, int num_code_blocks) 6668 { 6669 U8 *const src = (U8*)*pat_p; 6670 U8 *dst, *d; 6671 int n=0; 6672 STRLEN s = 0; 6673 bool do_end = 0; 6674 GET_RE_DEBUG_FLAGS_DECL; 6675 6676 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 6677 "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); 6678 6679 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ 6680 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8); 6681 d = dst; 6682 6683 while (s < *plen_p) { 6684 append_utf8_from_native_byte(src[s], &d); 6685 6686 if (n < num_code_blocks) { 6687 assert(pRExC_state->code_blocks); 6688 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) { 6689 pRExC_state->code_blocks->cb[n].start = d - dst - 1; 6690 assert(*(d - 1) == '('); 6691 do_end = 1; 6692 } 6693 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) { 6694 pRExC_state->code_blocks->cb[n].end = d - dst - 1; 6695 assert(*(d - 1) == ')'); 6696 do_end = 0; 6697 n++; 6698 } 6699 } 6700 s++; 6701 } 6702 *d = '\0'; 6703 *plen_p = d - dst; 6704 *pat_p = (char*) dst; 6705 SAVEFREEPV(*pat_p); 6706 RExC_orig_utf8 = RExC_utf8 = 1; 6707 } 6708 6709 6710 6711 /* S_concat_pat(): concatenate a list of args to the pattern string pat, 6712 * while recording any code block indices, and handling overloading, 6713 * nested qr// objects etc. If pat is null, it will allocate a new 6714 * string, or just return the first arg, if there's only one. 6715 * 6716 * Returns the malloced/updated pat. 6717 * patternp and pat_count is the array of SVs to be concatted; 6718 * oplist is the optional list of ops that generated the SVs; 6719 * recompile_p is a pointer to a boolean that will be set if 6720 * the regex will need to be recompiled. 6721 * delim, if non-null is an SV that will be inserted between each element 6722 */ 6723 6724 static SV* 6725 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, 6726 SV *pat, SV ** const patternp, int pat_count, 6727 OP *oplist, bool *recompile_p, SV *delim) 6728 { 6729 SV **svp; 6730 int n = 0; 6731 bool use_delim = FALSE; 6732 bool alloced = FALSE; 6733 6734 /* if we know we have at least two args, create an empty string, 6735 * then concatenate args to that. For no args, return an empty string */ 6736 if (!pat && pat_count != 1) { 6737 pat = newSVpvs(""); 6738 SAVEFREESV(pat); 6739 alloced = TRUE; 6740 } 6741 6742 for (svp = patternp; svp < patternp + pat_count; svp++) { 6743 SV *sv; 6744 SV *rx = NULL; 6745 STRLEN orig_patlen = 0; 6746 bool code = 0; 6747 SV *msv = use_delim ? delim : *svp; 6748 if (!msv) msv = &PL_sv_undef; 6749 6750 /* if we've got a delimiter, we go round the loop twice for each 6751 * svp slot (except the last), using the delimiter the second 6752 * time round */ 6753 if (use_delim) { 6754 svp--; 6755 use_delim = FALSE; 6756 } 6757 else if (delim) 6758 use_delim = TRUE; 6759 6760 if (SvTYPE(msv) == SVt_PVAV) { 6761 /* we've encountered an interpolated array within 6762 * the pattern, e.g. /...@a..../. Expand the list of elements, 6763 * then recursively append elements. 6764 * The code in this block is based on S_pushav() */ 6765 6766 AV *const av = (AV*)msv; 6767 const SSize_t maxarg = AvFILL(av) + 1; 6768 SV **array; 6769 6770 if (oplist) { 6771 assert(oplist->op_type == OP_PADAV 6772 || oplist->op_type == OP_RV2AV); 6773 oplist = OpSIBLING(oplist); 6774 } 6775 6776 if (SvRMAGICAL(av)) { 6777 SSize_t i; 6778 6779 Newx(array, maxarg, SV*); 6780 SAVEFREEPV(array); 6781 for (i=0; i < maxarg; i++) { 6782 SV ** const svp = av_fetch(av, i, FALSE); 6783 array[i] = svp ? *svp : &PL_sv_undef; 6784 } 6785 } 6786 else 6787 array = AvARRAY(av); 6788 6789 pat = S_concat_pat(aTHX_ pRExC_state, pat, 6790 array, maxarg, NULL, recompile_p, 6791 /* $" */ 6792 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV)))); 6793 6794 continue; 6795 } 6796 6797 6798 /* we make the assumption here that each op in the list of 6799 * op_siblings maps to one SV pushed onto the stack, 6800 * except for code blocks, with have both an OP_NULL and 6801 * and OP_CONST. 6802 * This allows us to match up the list of SVs against the 6803 * list of OPs to find the next code block. 6804 * 6805 * Note that PUSHMARK PADSV PADSV .. 6806 * is optimised to 6807 * PADRANGE PADSV PADSV .. 6808 * so the alignment still works. */ 6809 6810 if (oplist) { 6811 if (oplist->op_type == OP_NULL 6812 && (oplist->op_flags & OPf_SPECIAL)) 6813 { 6814 assert(n < pRExC_state->code_blocks->count); 6815 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0; 6816 pRExC_state->code_blocks->cb[n].block = oplist; 6817 pRExC_state->code_blocks->cb[n].src_regex = NULL; 6818 n++; 6819 code = 1; 6820 oplist = OpSIBLING(oplist); /* skip CONST */ 6821 assert(oplist); 6822 } 6823 oplist = OpSIBLING(oplist);; 6824 } 6825 6826 /* apply magic and QR overloading to arg */ 6827 6828 SvGETMAGIC(msv); 6829 if (SvROK(msv) && SvAMAGIC(msv)) { 6830 SV *sv = AMG_CALLunary(msv, regexp_amg); 6831 if (sv) { 6832 if (SvROK(sv)) 6833 sv = SvRV(sv); 6834 if (SvTYPE(sv) != SVt_REGEXP) 6835 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP"); 6836 msv = sv; 6837 } 6838 } 6839 6840 /* try concatenation overload ... */ 6841 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) && 6842 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) 6843 { 6844 sv_setsv(pat, sv); 6845 /* overloading involved: all bets are off over literal 6846 * code. Pretend we haven't seen it */ 6847 if (n) 6848 pRExC_state->code_blocks->count -= n; 6849 n = 0; 6850 } 6851 else { 6852 /* ... or failing that, try "" overload */ 6853 while (SvAMAGIC(msv) 6854 && (sv = AMG_CALLunary(msv, string_amg)) 6855 && sv != msv 6856 && !( SvROK(msv) 6857 && SvROK(sv) 6858 && SvRV(msv) == SvRV(sv)) 6859 ) { 6860 msv = sv; 6861 SvGETMAGIC(msv); 6862 } 6863 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) 6864 msv = SvRV(msv); 6865 6866 if (pat) { 6867 /* this is a partially unrolled 6868 * sv_catsv_nomg(pat, msv); 6869 * that allows us to adjust code block indices if 6870 * needed */ 6871 STRLEN dlen; 6872 char *dst = SvPV_force_nomg(pat, dlen); 6873 orig_patlen = dlen; 6874 if (SvUTF8(msv) && !SvUTF8(pat)) { 6875 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); 6876 sv_setpvn(pat, dst, dlen); 6877 SvUTF8_on(pat); 6878 } 6879 sv_catsv_nomg(pat, msv); 6880 rx = msv; 6881 } 6882 else { 6883 /* We have only one SV to process, but we need to verify 6884 * it is properly null terminated or we will fail asserts 6885 * later. In theory we probably shouldn't get such SV's, 6886 * but if we do we should handle it gracefully. */ 6887 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) { 6888 /* not a string, or a string with a trailing null */ 6889 pat = msv; 6890 } else { 6891 /* a string with no trailing null, we need to copy it 6892 * so it has a trailing null */ 6893 pat = sv_2mortal(newSVsv(msv)); 6894 } 6895 } 6896 6897 if (code) 6898 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1; 6899 } 6900 6901 /* extract any code blocks within any embedded qr//'s */ 6902 if (rx && SvTYPE(rx) == SVt_REGEXP 6903 && RX_ENGINE((REGEXP*)rx)->op_comp) 6904 { 6905 6906 RXi_GET_DECL(ReANY((REGEXP *)rx), ri); 6907 if (ri->code_blocks && ri->code_blocks->count) { 6908 int i; 6909 /* the presence of an embedded qr// with code means 6910 * we should always recompile: the text of the 6911 * qr// may not have changed, but it may be a 6912 * different closure than last time */ 6913 *recompile_p = 1; 6914 if (pRExC_state->code_blocks) { 6915 int new_count = pRExC_state->code_blocks->count 6916 + ri->code_blocks->count; 6917 Renew(pRExC_state->code_blocks->cb, 6918 new_count, struct reg_code_block); 6919 pRExC_state->code_blocks->count = new_count; 6920 } 6921 else 6922 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ 6923 ri->code_blocks->count); 6924 6925 for (i=0; i < ri->code_blocks->count; i++) { 6926 struct reg_code_block *src, *dst; 6927 STRLEN offset = orig_patlen 6928 + ReANY((REGEXP *)rx)->pre_prefix; 6929 assert(n < pRExC_state->code_blocks->count); 6930 src = &ri->code_blocks->cb[i]; 6931 dst = &pRExC_state->code_blocks->cb[n]; 6932 dst->start = src->start + offset; 6933 dst->end = src->end + offset; 6934 dst->block = src->block; 6935 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) 6936 src->src_regex 6937 ? src->src_regex 6938 : (REGEXP*)rx); 6939 n++; 6940 } 6941 } 6942 } 6943 } 6944 /* avoid calling magic multiple times on a single element e.g. =~ $qr */ 6945 if (alloced) 6946 SvSETMAGIC(pat); 6947 6948 return pat; 6949 } 6950 6951 6952 6953 /* see if there are any run-time code blocks in the pattern. 6954 * False positives are allowed */ 6955 6956 static bool 6957 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, 6958 char *pat, STRLEN plen) 6959 { 6960 int n = 0; 6961 STRLEN s; 6962 6963 PERL_UNUSED_CONTEXT; 6964 6965 for (s = 0; s < plen; s++) { 6966 if ( pRExC_state->code_blocks 6967 && n < pRExC_state->code_blocks->count 6968 && s == pRExC_state->code_blocks->cb[n].start) 6969 { 6970 s = pRExC_state->code_blocks->cb[n].end; 6971 n++; 6972 continue; 6973 } 6974 /* TODO ideally should handle [..], (#..), /#.../x to reduce false 6975 * positives here */ 6976 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' && 6977 (pat[s+2] == '{' 6978 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{')) 6979 ) 6980 return 1; 6981 } 6982 return 0; 6983 } 6984 6985 /* Handle run-time code blocks. We will already have compiled any direct 6986 * or indirect literal code blocks. Now, take the pattern 'pat' and make a 6987 * copy of it, but with any literal code blocks blanked out and 6988 * appropriate chars escaped; then feed it into 6989 * 6990 * eval "qr'modified_pattern'" 6991 * 6992 * For example, 6993 * 6994 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno 6995 * 6996 * becomes 6997 * 6998 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno' 6999 * 7000 * After eval_sv()-ing that, grab any new code blocks from the returned qr 7001 * and merge them with any code blocks of the original regexp. 7002 * 7003 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; 7004 * instead, just save the qr and return FALSE; this tells our caller that 7005 * the original pattern needs upgrading to utf8. 7006 */ 7007 7008 static bool 7009 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, 7010 char *pat, STRLEN plen) 7011 { 7012 SV *qr; 7013 7014 GET_RE_DEBUG_FLAGS_DECL; 7015 7016 if (pRExC_state->runtime_code_qr) { 7017 /* this is the second time we've been called; this should 7018 * only happen if the main pattern got upgraded to utf8 7019 * during compilation; re-use the qr we compiled first time 7020 * round (which should be utf8 too) 7021 */ 7022 qr = pRExC_state->runtime_code_qr; 7023 pRExC_state->runtime_code_qr = NULL; 7024 assert(RExC_utf8 && SvUTF8(qr)); 7025 } 7026 else { 7027 int n = 0; 7028 STRLEN s; 7029 char *p, *newpat; 7030 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */ 7031 SV *sv, *qr_ref; 7032 dSP; 7033 7034 /* determine how many extra chars we need for ' and \ escaping */ 7035 for (s = 0; s < plen; s++) { 7036 if (pat[s] == '\'' || pat[s] == '\\') 7037 newlen++; 7038 } 7039 7040 Newx(newpat, newlen, char); 7041 p = newpat; 7042 *p++ = 'q'; *p++ = 'r'; *p++ = '\''; 7043 7044 for (s = 0; s < plen; s++) { 7045 if ( pRExC_state->code_blocks 7046 && n < pRExC_state->code_blocks->count 7047 && s == pRExC_state->code_blocks->cb[n].start) 7048 { 7049 /* blank out literal code block so that they aren't 7050 * recompiled: eg change from/to: 7051 * /(?{xyz})/ 7052 * /(?=====)/ 7053 * and 7054 * /(??{xyz})/ 7055 * /(?======)/ 7056 * and 7057 * /(?(?{xyz}))/ 7058 * /(?(?=====))/ 7059 */ 7060 assert(pat[s] == '('); 7061 assert(pat[s+1] == '?'); 7062 *p++ = '('; 7063 *p++ = '?'; 7064 s += 2; 7065 while (s < pRExC_state->code_blocks->cb[n].end) { 7066 *p++ = '='; 7067 s++; 7068 } 7069 *p++ = ')'; 7070 n++; 7071 continue; 7072 } 7073 if (pat[s] == '\'' || pat[s] == '\\') 7074 *p++ = '\\'; 7075 *p++ = pat[s]; 7076 } 7077 *p++ = '\''; 7078 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) { 7079 *p++ = 'x'; 7080 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) { 7081 *p++ = 'x'; 7082 } 7083 } 7084 *p++ = '\0'; 7085 DEBUG_COMPILE_r({ 7086 Perl_re_printf( aTHX_ 7087 "%sre-parsing pattern for runtime code:%s %s\n", 7088 PL_colors[4], PL_colors[5], newpat); 7089 }); 7090 7091 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); 7092 Safefree(newpat); 7093 7094 ENTER; 7095 SAVETMPS; 7096 save_re_context(); 7097 PUSHSTACKi(PERLSI_REQUIRE); 7098 /* G_RE_REPARSING causes the toker to collapse \\ into \ when 7099 * parsing qr''; normally only q'' does this. It also alters 7100 * hints handling */ 7101 eval_sv(sv, G_SCALAR|G_RE_REPARSING); 7102 SvREFCNT_dec_NN(sv); 7103 SPAGAIN; 7104 qr_ref = POPs; 7105 PUTBACK; 7106 { 7107 SV * const errsv = ERRSV; 7108 if (SvTRUE_NN(errsv)) 7109 /* use croak_sv ? */ 7110 Perl_croak_nocontext("%" SVf, SVfARG(errsv)); 7111 } 7112 assert(SvROK(qr_ref)); 7113 qr = SvRV(qr_ref); 7114 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); 7115 /* the leaving below frees the tmp qr_ref. 7116 * Give qr a life of its own */ 7117 SvREFCNT_inc(qr); 7118 POPSTACK; 7119 FREETMPS; 7120 LEAVE; 7121 7122 } 7123 7124 if (!RExC_utf8 && SvUTF8(qr)) { 7125 /* first time through; the pattern got upgraded; save the 7126 * qr for the next time through */ 7127 assert(!pRExC_state->runtime_code_qr); 7128 pRExC_state->runtime_code_qr = qr; 7129 return 0; 7130 } 7131 7132 7133 /* extract any code blocks within the returned qr// */ 7134 7135 7136 /* merge the main (r1) and run-time (r2) code blocks into one */ 7137 { 7138 RXi_GET_DECL(ReANY((REGEXP *)qr), r2); 7139 struct reg_code_block *new_block, *dst; 7140 RExC_state_t * const r1 = pRExC_state; /* convenient alias */ 7141 int i1 = 0, i2 = 0; 7142 int r1c, r2c; 7143 7144 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */ 7145 { 7146 SvREFCNT_dec_NN(qr); 7147 return 1; 7148 } 7149 7150 if (!r1->code_blocks) 7151 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0); 7152 7153 r1c = r1->code_blocks->count; 7154 r2c = r2->code_blocks->count; 7155 7156 Newx(new_block, r1c + r2c, struct reg_code_block); 7157 7158 dst = new_block; 7159 7160 while (i1 < r1c || i2 < r2c) { 7161 struct reg_code_block *src; 7162 bool is_qr = 0; 7163 7164 if (i1 == r1c) { 7165 src = &r2->code_blocks->cb[i2++]; 7166 is_qr = 1; 7167 } 7168 else if (i2 == r2c) 7169 src = &r1->code_blocks->cb[i1++]; 7170 else if ( r1->code_blocks->cb[i1].start 7171 < r2->code_blocks->cb[i2].start) 7172 { 7173 src = &r1->code_blocks->cb[i1++]; 7174 assert(src->end < r2->code_blocks->cb[i2].start); 7175 } 7176 else { 7177 assert( r1->code_blocks->cb[i1].start 7178 > r2->code_blocks->cb[i2].start); 7179 src = &r2->code_blocks->cb[i2++]; 7180 is_qr = 1; 7181 assert(src->end < r1->code_blocks->cb[i1].start); 7182 } 7183 7184 assert(pat[src->start] == '('); 7185 assert(pat[src->end] == ')'); 7186 dst->start = src->start; 7187 dst->end = src->end; 7188 dst->block = src->block; 7189 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) 7190 : src->src_regex; 7191 dst++; 7192 } 7193 r1->code_blocks->count += r2c; 7194 Safefree(r1->code_blocks->cb); 7195 r1->code_blocks->cb = new_block; 7196 } 7197 7198 SvREFCNT_dec_NN(qr); 7199 return 1; 7200 } 7201 7202 7203 STATIC bool 7204 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, 7205 struct reg_substr_datum *rsd, 7206 struct scan_data_substrs *sub, 7207 STRLEN longest_length) 7208 { 7209 /* This is the common code for setting up the floating and fixed length 7210 * string data extracted from Perl_re_op_compile() below. Returns a boolean 7211 * as to whether succeeded or not */ 7212 7213 I32 t; 7214 SSize_t ml; 7215 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL); 7216 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL); 7217 7218 if (! (longest_length 7219 || (eol /* Can't have SEOL and MULTI */ 7220 && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) 7221 ) 7222 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ 7223 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) 7224 { 7225 return FALSE; 7226 } 7227 7228 /* copy the information about the longest from the reg_scan_data 7229 over to the program. */ 7230 if (SvUTF8(sub->str)) { 7231 rsd->substr = NULL; 7232 rsd->utf8_substr = sub->str; 7233 } else { 7234 rsd->substr = sub->str; 7235 rsd->utf8_substr = NULL; 7236 } 7237 /* end_shift is how many chars that must be matched that 7238 follow this item. We calculate it ahead of time as once the 7239 lookbehind offset is added in we lose the ability to correctly 7240 calculate it.*/ 7241 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length; 7242 rsd->end_shift = ml - sub->min_offset 7243 - longest_length 7244 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL 7245 * intead? - DAPM 7246 + (SvTAIL(sub->str) != 0) 7247 */ 7248 + sub->lookbehind; 7249 7250 t = (eol/* Can't have SEOL and MULTI */ 7251 && (! meol || (RExC_flags & RXf_PMf_MULTILINE))); 7252 fbm_compile(sub->str, t ? FBMcf_TAIL : 0); 7253 7254 return TRUE; 7255 } 7256 7257 STATIC void 7258 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx) 7259 { 7260 /* Calculates and sets in the compiled pattern 'Rx' the string to compile, 7261 * properly wrapped with the right modifiers */ 7262 7263 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); 7264 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags) 7265 != REGEX_DEPENDS_CHARSET); 7266 7267 /* The caret is output if there are any defaults: if not all the STD 7268 * flags are set, or if no character set specifier is needed */ 7269 bool has_default = 7270 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) 7271 || ! has_charset); 7272 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) 7273 == REG_RUN_ON_COMMENT_SEEN); 7274 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD) 7275 >> RXf_PMf_STD_PMMOD_SHIFT); 7276 const char *fptr = STD_PAT_MODS; /*"msixxn"*/ 7277 char *p; 7278 STRLEN pat_len = RExC_precomp_end - RExC_precomp; 7279 7280 /* We output all the necessary flags; we never output a minus, as all 7281 * those are defaults, so are 7282 * covered by the caret */ 7283 const STRLEN wraplen = pat_len + has_p + has_runon 7284 + has_default /* If needs a caret */ 7285 + PL_bitcount[reganch] /* 1 char for each set standard flag */ 7286 7287 /* If needs a character set specifier */ 7288 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) 7289 + (sizeof("(?:)") - 1); 7290 7291 PERL_ARGS_ASSERT_SET_REGEX_PV; 7292 7293 /* make sure PL_bitcount bounds not exceeded */ 7294 assert(sizeof(STD_PAT_MODS) <= 8); 7295 7296 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */ 7297 SvPOK_on(Rx); 7298 if (RExC_utf8) 7299 SvFLAGS(Rx) |= SVf_UTF8; 7300 *p++='('; *p++='?'; 7301 7302 /* If a default, cover it using the caret */ 7303 if (has_default) { 7304 *p++= DEFAULT_PAT_MOD; 7305 } 7306 if (has_charset) { 7307 STRLEN len; 7308 const char* name; 7309 7310 name = get_regex_charset_name(RExC_rx->extflags, &len); 7311 if strEQ(name, DEPENDS_PAT_MODS) { /* /d under UTF-8 => /u */ 7312 assert(RExC_utf8); 7313 name = UNICODE_PAT_MODS; 7314 len = sizeof(UNICODE_PAT_MODS) - 1; 7315 } 7316 Copy(name, p, len, char); 7317 p += len; 7318 } 7319 if (has_p) 7320 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ 7321 { 7322 char ch; 7323 while((ch = *fptr++)) { 7324 if(reganch & 1) 7325 *p++ = ch; 7326 reganch >>= 1; 7327 } 7328 } 7329 7330 *p++ = ':'; 7331 Copy(RExC_precomp, p, pat_len, char); 7332 assert ((RX_WRAPPED(Rx) - p) < 16); 7333 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx); 7334 p += pat_len; 7335 7336 /* Adding a trailing \n causes this to compile properly: 7337 my $R = qr / A B C # D E/x; /($R)/ 7338 Otherwise the parens are considered part of the comment */ 7339 if (has_runon) 7340 *p++ = '\n'; 7341 *p++ = ')'; 7342 *p = 0; 7343 SvCUR_set(Rx, p - RX_WRAPPED(Rx)); 7344 } 7345 7346 /* 7347 * Perl_re_op_compile - the perl internal RE engine's function to compile a 7348 * regular expression into internal code. 7349 * The pattern may be passed either as: 7350 * a list of SVs (patternp plus pat_count) 7351 * a list of OPs (expr) 7352 * If both are passed, the SV list is used, but the OP list indicates 7353 * which SVs are actually pre-compiled code blocks 7354 * 7355 * The SVs in the list have magic and qr overloading applied to them (and 7356 * the list may be modified in-place with replacement SVs in the latter 7357 * case). 7358 * 7359 * If the pattern hasn't changed from old_re, then old_re will be 7360 * returned. 7361 * 7362 * eng is the current engine. If that engine has an op_comp method, then 7363 * handle directly (i.e. we assume that op_comp was us); otherwise, just 7364 * do the initial concatenation of arguments and pass on to the external 7365 * engine. 7366 * 7367 * If is_bare_re is not null, set it to a boolean indicating whether the 7368 * arg list reduced (after overloading) to a single bare regex which has 7369 * been returned (i.e. /$qr/). 7370 * 7371 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. 7372 * 7373 * pm_flags contains the PMf_* flags, typically based on those from the 7374 * pm_flags field of the related PMOP. Currently we're only interested in 7375 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL. 7376 * 7377 * For many years this code had an initial sizing pass that calculated 7378 * (sometimes incorrectly, leading to security holes) the size needed for the 7379 * compiled pattern. That was changed by commit 7380 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a 7381 * node at a time, as parsing goes along. Patches welcome to fix any obsolete 7382 * references to this sizing pass. 7383 * 7384 * Now, an initial crude guess as to the size needed is made, based on the 7385 * length of the pattern. Patches welcome to improve that guess. That amount 7386 * of space is malloc'd and then immediately freed, and then clawed back node 7387 * by node. This design is to minimze, to the extent possible, memory churn 7388 * when doing the the reallocs. 7389 * 7390 * A separate parentheses counting pass may be needed in some cases. 7391 * (Previously the sizing pass did this.) Patches welcome to reduce the number 7392 * of these cases. 7393 * 7394 * The existence of a sizing pass necessitated design decisions that are no 7395 * longer needed. There are potential areas of simplification. 7396 * 7397 * Beware that the optimization-preparation code in here knows about some 7398 * of the structure of the compiled regexp. [I'll say.] 7399 */ 7400 7401 REGEXP * 7402 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, 7403 OP *expr, const regexp_engine* eng, REGEXP *old_re, 7404 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags) 7405 { 7406 dVAR; 7407 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */ 7408 STRLEN plen; 7409 char *exp; 7410 regnode *scan; 7411 I32 flags; 7412 SSize_t minlen = 0; 7413 U32 rx_flags; 7414 SV *pat; 7415 SV** new_patternp = patternp; 7416 7417 /* these are all flags - maybe they should be turned 7418 * into a single int with different bit masks */ 7419 I32 sawlookahead = 0; 7420 I32 sawplus = 0; 7421 I32 sawopen = 0; 7422 I32 sawminmod = 0; 7423 7424 regex_charset initial_charset = get_regex_charset(orig_rx_flags); 7425 bool recompile = 0; 7426 bool runtime_code = 0; 7427 scan_data_t data; 7428 RExC_state_t RExC_state; 7429 RExC_state_t * const pRExC_state = &RExC_state; 7430 #ifdef TRIE_STUDY_OPT 7431 int restudied = 0; 7432 RExC_state_t copyRExC_state; 7433 #endif 7434 GET_RE_DEBUG_FLAGS_DECL; 7435 7436 PERL_ARGS_ASSERT_RE_OP_COMPILE; 7437 7438 DEBUG_r(if (!PL_colorset) reginitcolors()); 7439 7440 /* Initialize these here instead of as-needed, as is quick and avoids 7441 * having to test them each time otherwise */ 7442 if (! PL_InBitmap) { 7443 #ifdef DEBUGGING 7444 char * dump_len_string; 7445 #endif 7446 7447 /* This is calculated here, because the Perl program that generates the 7448 * static global ones doesn't currently have access to 7449 * NUM_ANYOF_CODE_POINTS */ 7450 PL_InBitmap = _new_invlist(2); 7451 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0, 7452 NUM_ANYOF_CODE_POINTS - 1); 7453 #ifdef DEBUGGING 7454 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN"); 7455 if ( ! dump_len_string 7456 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL)) 7457 { 7458 PL_dump_re_max_len = 60; /* A reasonable default */ 7459 } 7460 #endif 7461 } 7462 7463 pRExC_state->warn_text = NULL; 7464 pRExC_state->unlexed_names = NULL; 7465 pRExC_state->code_blocks = NULL; 7466 7467 if (is_bare_re) 7468 *is_bare_re = FALSE; 7469 7470 if (expr && (expr->op_type == OP_LIST || 7471 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { 7472 /* allocate code_blocks if needed */ 7473 OP *o; 7474 int ncode = 0; 7475 7476 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) 7477 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) 7478 ncode++; /* count of DO blocks */ 7479 7480 if (ncode) 7481 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode); 7482 } 7483 7484 if (!pat_count) { 7485 /* compile-time pattern with just OP_CONSTs and DO blocks */ 7486 7487 int n; 7488 OP *o; 7489 7490 /* find how many CONSTs there are */ 7491 assert(expr); 7492 n = 0; 7493 if (expr->op_type == OP_CONST) 7494 n = 1; 7495 else 7496 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) { 7497 if (o->op_type == OP_CONST) 7498 n++; 7499 } 7500 7501 /* fake up an SV array */ 7502 7503 assert(!new_patternp); 7504 Newx(new_patternp, n, SV*); 7505 SAVEFREEPV(new_patternp); 7506 pat_count = n; 7507 7508 n = 0; 7509 if (expr->op_type == OP_CONST) 7510 new_patternp[n] = cSVOPx_sv(expr); 7511 else 7512 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) { 7513 if (o->op_type == OP_CONST) 7514 new_patternp[n++] = cSVOPo_sv; 7515 } 7516 7517 } 7518 7519 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 7520 "Assembling pattern from %d elements%s\n", pat_count, 7521 orig_rx_flags & RXf_SPLIT ? " for split" : "")); 7522 7523 /* set expr to the first arg op */ 7524 7525 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count 7526 && expr->op_type != OP_CONST) 7527 { 7528 expr = cLISTOPx(expr)->op_first; 7529 assert( expr->op_type == OP_PUSHMARK 7530 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK) 7531 || expr->op_type == OP_PADRANGE); 7532 expr = OpSIBLING(expr); 7533 } 7534 7535 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count, 7536 expr, &recompile, NULL); 7537 7538 /* handle bare (possibly after overloading) regex: foo =~ $re */ 7539 { 7540 SV *re = pat; 7541 if (SvROK(re)) 7542 re = SvRV(re); 7543 if (SvTYPE(re) == SVt_REGEXP) { 7544 if (is_bare_re) 7545 *is_bare_re = TRUE; 7546 SvREFCNT_inc(re); 7547 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 7548 "Precompiled pattern%s\n", 7549 orig_rx_flags & RXf_SPLIT ? " for split" : "")); 7550 7551 return (REGEXP*)re; 7552 } 7553 } 7554 7555 exp = SvPV_nomg(pat, plen); 7556 7557 if (!eng->op_comp) { 7558 if ((SvUTF8(pat) && IN_BYTES) 7559 || SvGMAGICAL(pat) || SvAMAGIC(pat)) 7560 { 7561 /* make a temporary copy; either to convert to bytes, 7562 * or to avoid repeating get-magic / overloaded stringify */ 7563 pat = newSVpvn_flags(exp, plen, SVs_TEMP | 7564 (IN_BYTES ? 0 : SvUTF8(pat))); 7565 } 7566 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); 7567 } 7568 7569 /* ignore the utf8ness if the pattern is 0 length */ 7570 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); 7571 RExC_uni_semantics = 0; 7572 RExC_contains_locale = 0; 7573 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT); 7574 RExC_in_script_run = 0; 7575 RExC_study_started = 0; 7576 pRExC_state->runtime_code_qr = NULL; 7577 RExC_frame_head= NULL; 7578 RExC_frame_last= NULL; 7579 RExC_frame_count= 0; 7580 RExC_latest_warn_offset = 0; 7581 RExC_use_BRANCHJ = 0; 7582 RExC_total_parens = 0; 7583 RExC_open_parens = NULL; 7584 RExC_close_parens = NULL; 7585 RExC_paren_names = NULL; 7586 RExC_size = 0; 7587 RExC_seen_d_op = FALSE; 7588 #ifdef DEBUGGING 7589 RExC_paren_name_list = NULL; 7590 #endif 7591 7592 DEBUG_r({ 7593 RExC_mysv1= sv_newmortal(); 7594 RExC_mysv2= sv_newmortal(); 7595 }); 7596 7597 DEBUG_COMPILE_r({ 7598 SV *dsv= sv_newmortal(); 7599 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len); 7600 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n", 7601 PL_colors[4], PL_colors[5], s); 7602 }); 7603 7604 /* we jump here if we have to recompile, e.g., from upgrading the pattern 7605 * to utf8 */ 7606 7607 if ((pm_flags & PMf_USE_RE_EVAL) 7608 /* this second condition covers the non-regex literal case, 7609 * i.e. $foo =~ '(?{})'. */ 7610 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL)) 7611 ) 7612 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); 7613 7614 redo_parse: 7615 /* return old regex if pattern hasn't changed */ 7616 /* XXX: note in the below we have to check the flags as well as the 7617 * pattern. 7618 * 7619 * Things get a touch tricky as we have to compare the utf8 flag 7620 * independently from the compile flags. */ 7621 7622 if ( old_re 7623 && !recompile 7624 && !!RX_UTF8(old_re) == !!RExC_utf8 7625 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) ) 7626 && RX_PRECOMP(old_re) 7627 && RX_PRELEN(old_re) == plen 7628 && memEQ(RX_PRECOMP(old_re), exp, plen) 7629 && !runtime_code /* with runtime code, always recompile */ ) 7630 { 7631 return old_re; 7632 } 7633 7634 /* Allocate the pattern's SV */ 7635 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP); 7636 RExC_rx = ReANY(Rx); 7637 if ( RExC_rx == NULL ) 7638 FAIL("Regexp out of space"); 7639 7640 rx_flags = orig_rx_flags; 7641 7642 if ( (UTF || RExC_uni_semantics) 7643 && initial_charset == REGEX_DEPENDS_CHARSET) 7644 { 7645 7646 /* Set to use unicode semantics if the pattern is in utf8 and has the 7647 * 'depends' charset specified, as it means unicode when utf8 */ 7648 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); 7649 RExC_uni_semantics = 1; 7650 } 7651 7652 RExC_pm_flags = pm_flags; 7653 7654 if (runtime_code) { 7655 assert(TAINTING_get || !TAINT_get); 7656 if (TAINT_get) 7657 Perl_croak(aTHX_ "Eval-group in insecure regular expression"); 7658 7659 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { 7660 /* whoops, we have a non-utf8 pattern, whilst run-time code 7661 * got compiled as utf8. Try again with a utf8 pattern */ 7662 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, 7663 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0); 7664 goto redo_parse; 7665 } 7666 } 7667 assert(!pRExC_state->runtime_code_qr); 7668 7669 RExC_sawback = 0; 7670 7671 RExC_seen = 0; 7672 RExC_maxlen = 0; 7673 RExC_in_lookbehind = 0; 7674 RExC_seen_zerolen = *exp == '^' ? -1 : 0; 7675 #ifdef EBCDIC 7676 RExC_recode_x_to_native = 0; 7677 #endif 7678 RExC_in_multi_char_class = 0; 7679 7680 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp; 7681 RExC_precomp_end = RExC_end = exp + plen; 7682 RExC_nestroot = 0; 7683 RExC_whilem_seen = 0; 7684 RExC_end_op = NULL; 7685 RExC_recurse = NULL; 7686 RExC_study_chunk_recursed = NULL; 7687 RExC_study_chunk_recursed_bytes= 0; 7688 RExC_recurse_count = 0; 7689 pRExC_state->code_index = 0; 7690 7691 /* Initialize the string in the compiled pattern. This is so that there is 7692 * something to output if necessary */ 7693 set_regex_pv(pRExC_state, Rx); 7694 7695 DEBUG_PARSE_r({ 7696 Perl_re_printf( aTHX_ 7697 "Starting parse and generation\n"); 7698 RExC_lastnum=0; 7699 RExC_lastparse=NULL; 7700 }); 7701 7702 /* Allocate space and zero-initialize. Note, the two step process 7703 of zeroing when in debug mode, thus anything assigned has to 7704 happen after that */ 7705 if (! RExC_size) { 7706 7707 /* On the first pass of the parse, we guess how big this will be. Then 7708 * we grow in one operation to that amount and then give it back. As 7709 * we go along, we re-allocate what we need. 7710 * 7711 * XXX Currently the guess is essentially that the pattern will be an 7712 * EXACT node with one byte input, one byte output. This is crude, and 7713 * better heuristics are welcome. 7714 * 7715 * On any subsequent passes, we guess what we actually computed in the 7716 * latest earlier pass. Such a pass probably didn't complete so is 7717 * missing stuff. We could improve those guesses by knowing where the 7718 * parse stopped, and use the length so far plus apply the above 7719 * assumption to what's left. */ 7720 RExC_size = STR_SZ(RExC_end - RExC_start); 7721 } 7722 7723 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal); 7724 if ( RExC_rxi == NULL ) 7725 FAIL("Regexp out of space"); 7726 7727 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char); 7728 RXi_SET( RExC_rx, RExC_rxi ); 7729 7730 /* We start from 0 (over from 0 in the case this is a reparse. The first 7731 * node parsed will give back any excess memory we have allocated so far). 7732 * */ 7733 RExC_size = 0; 7734 7735 /* non-zero initialization begins here */ 7736 RExC_rx->engine= eng; 7737 RExC_rx->extflags = rx_flags; 7738 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK; 7739 7740 if (pm_flags & PMf_IS_QR) { 7741 RExC_rxi->code_blocks = pRExC_state->code_blocks; 7742 if (RExC_rxi->code_blocks) { 7743 RExC_rxi->code_blocks->refcnt++; 7744 } 7745 } 7746 7747 RExC_rx->intflags = 0; 7748 7749 RExC_flags = rx_flags; /* don't let top level (?i) bleed */ 7750 RExC_parse = exp; 7751 7752 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv 7753 * code makes sure the final byte is an uncounted NUL. But should this 7754 * ever not be the case, lots of things could read beyond the end of the 7755 * buffer: loops like 7756 * while(isFOO(*RExC_parse)) RExC_parse++; 7757 * strchr(RExC_parse, "foo"); 7758 * etc. So it is worth noting. */ 7759 assert(*RExC_end == '\0'); 7760 7761 RExC_naughty = 0; 7762 RExC_npar = 1; 7763 RExC_parens_buf_size = 0; 7764 RExC_emit_start = RExC_rxi->program; 7765 pRExC_state->code_index = 0; 7766 7767 *((char*) RExC_emit_start) = (char) REG_MAGIC; 7768 RExC_emit = 1; 7769 7770 /* Do the parse */ 7771 if (reg(pRExC_state, 0, &flags, 1)) { 7772 7773 /* Success!, But we may need to redo the parse knowing how many parens 7774 * there actually are */ 7775 if (IN_PARENS_PASS) { 7776 flags |= RESTART_PARSE; 7777 } 7778 7779 /* We have that number in RExC_npar */ 7780 RExC_total_parens = RExC_npar; 7781 7782 /* XXX For backporting, use long jumps if there is any possibility of 7783 * overflow */ 7784 if (RExC_size > U16_MAX && ! RExC_use_BRANCHJ) { 7785 RExC_use_BRANCHJ = TRUE; 7786 flags |= RESTART_PARSE; 7787 } 7788 } 7789 else if (! MUST_RESTART(flags)) { 7790 ReREFCNT_dec(Rx); 7791 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags); 7792 } 7793 7794 /* Here, we either have success, or we have to redo the parse for some reason */ 7795 if (MUST_RESTART(flags)) { 7796 7797 /* It's possible to write a regexp in ascii that represents Unicode 7798 codepoints outside of the byte range, such as via \x{100}. If we 7799 detect such a sequence we have to convert the entire pattern to utf8 7800 and then recompile, as our sizing calculation will have been based 7801 on 1 byte == 1 character, but we will need to use utf8 to encode 7802 at least some part of the pattern, and therefore must convert the whole 7803 thing. 7804 -- dmq */ 7805 if (flags & NEED_UTF8) { 7806 7807 /* We have stored the offset of the final warning output so far. 7808 * That must be adjusted. Any variant characters between the start 7809 * of the pattern and this warning count for 2 bytes in the final, 7810 * so just add them again */ 7811 if (UNLIKELY(RExC_latest_warn_offset > 0)) { 7812 RExC_latest_warn_offset += 7813 variant_under_utf8_count((U8 *) exp, (U8 *) exp 7814 + RExC_latest_warn_offset); 7815 } 7816 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, 7817 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0); 7818 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n")); 7819 } 7820 else { 7821 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n")); 7822 } 7823 7824 if (ALL_PARENS_COUNTED) { 7825 /* Make enough room for all the known parens, and zero it */ 7826 Renew(RExC_open_parens, RExC_total_parens, regnode_offset); 7827 Zero(RExC_open_parens, RExC_total_parens, regnode_offset); 7828 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */ 7829 7830 Renew(RExC_close_parens, RExC_total_parens, regnode_offset); 7831 Zero(RExC_close_parens, RExC_total_parens, regnode_offset); 7832 } 7833 else { /* Parse did not complete. Reinitialize the parentheses 7834 structures */ 7835 RExC_total_parens = 0; 7836 if (RExC_open_parens) { 7837 Safefree(RExC_open_parens); 7838 RExC_open_parens = NULL; 7839 } 7840 if (RExC_close_parens) { 7841 Safefree(RExC_close_parens); 7842 RExC_close_parens = NULL; 7843 } 7844 } 7845 7846 /* Clean up what we did in this parse */ 7847 SvREFCNT_dec_NN(RExC_rx_sv); 7848 7849 goto redo_parse; 7850 } 7851 7852 /* Here, we have successfully parsed and generated the pattern's program 7853 * for the regex engine. We are ready to finish things up and look for 7854 * optimizations. */ 7855 7856 /* Update the string to compile, with correct modifiers, etc */ 7857 set_regex_pv(pRExC_state, Rx); 7858 7859 RExC_rx->nparens = RExC_total_parens - 1; 7860 7861 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */ 7862 if (RExC_whilem_seen > 15) 7863 RExC_whilem_seen = 15; 7864 7865 DEBUG_PARSE_r({ 7866 Perl_re_printf( aTHX_ 7867 "Required size %" IVdf " nodes\n", (IV)RExC_size); 7868 RExC_lastnum=0; 7869 RExC_lastparse=NULL; 7870 }); 7871 7872 #ifdef RE_TRACK_PATTERN_OFFSETS 7873 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_ 7874 "%s %" UVuf " bytes for offset annotations.\n", 7875 RExC_offsets ? "Got" : "Couldn't get", 7876 (UV)((RExC_offsets[0] * 2 + 1)))); 7877 DEBUG_OFFSETS_r(if (RExC_offsets) { 7878 const STRLEN len = RExC_offsets[0]; 7879 STRLEN i; 7880 GET_RE_DEBUG_FLAGS_DECL; 7881 Perl_re_printf( aTHX_ 7882 "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]); 7883 for (i = 1; i <= len; i++) { 7884 if (RExC_offsets[i*2-1] || RExC_offsets[i*2]) 7885 Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ", 7886 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]); 7887 } 7888 Perl_re_printf( aTHX_ "\n"); 7889 }); 7890 7891 #else 7892 SetProgLen(RExC_rxi,RExC_size); 7893 #endif 7894 7895 DEBUG_OPTIMISE_r( 7896 Perl_re_printf( aTHX_ "Starting post parse optimization\n"); 7897 ); 7898 7899 /* XXXX To minimize changes to RE engine we always allocate 7900 3-units-long substrs field. */ 7901 Newx(RExC_rx->substrs, 1, struct reg_substr_data); 7902 if (RExC_recurse_count) { 7903 Newx(RExC_recurse, RExC_recurse_count, regnode *); 7904 SAVEFREEPV(RExC_recurse); 7905 } 7906 7907 if (RExC_seen & REG_RECURSE_SEEN) { 7908 /* Note, RExC_total_parens is 1 + the number of parens in a pattern. 7909 * So its 1 if there are no parens. */ 7910 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) + 7911 ((RExC_total_parens & 0x07) != 0); 7912 Newx(RExC_study_chunk_recursed, 7913 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8); 7914 SAVEFREEPV(RExC_study_chunk_recursed); 7915 } 7916 7917 reStudy: 7918 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; 7919 DEBUG_r( 7920 RExC_study_chunk_recursed_count= 0; 7921 ); 7922 Zero(RExC_rx->substrs, 1, struct reg_substr_data); 7923 if (RExC_study_chunk_recursed) { 7924 Zero(RExC_study_chunk_recursed, 7925 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8); 7926 } 7927 7928 7929 #ifdef TRIE_STUDY_OPT 7930 if (!restudied) { 7931 StructCopy(&zero_scan_data, &data, scan_data_t); 7932 copyRExC_state = RExC_state; 7933 } else { 7934 U32 seen=RExC_seen; 7935 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n")); 7936 7937 RExC_state = copyRExC_state; 7938 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) 7939 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; 7940 else 7941 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; 7942 StructCopy(&zero_scan_data, &data, scan_data_t); 7943 } 7944 #else 7945 StructCopy(&zero_scan_data, &data, scan_data_t); 7946 #endif 7947 7948 /* Dig out information for optimizations. */ 7949 RExC_rx->extflags = RExC_flags; /* was pm_op */ 7950 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ 7951 7952 if (UTF) 7953 SvUTF8_on(Rx); /* Unicode in it? */ 7954 RExC_rxi->regstclass = NULL; 7955 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */ 7956 RExC_rx->intflags |= PREGf_NAUGHTY; 7957 scan = RExC_rxi->program + 1; /* First BRANCH. */ 7958 7959 /* testing for BRANCH here tells us whether there is "must appear" 7960 data in the pattern. If there is then we can use it for optimisations */ 7961 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. 7962 */ 7963 SSize_t fake; 7964 STRLEN longest_length[2]; 7965 regnode_ssc ch_class; /* pointed to by data */ 7966 int stclass_flag; 7967 SSize_t last_close = 0; /* pointed to by data */ 7968 regnode *first= scan; 7969 regnode *first_next= regnext(first); 7970 int i; 7971 7972 /* 7973 * Skip introductions and multiplicators >= 1 7974 * so that we can extract the 'meat' of the pattern that must 7975 * match in the large if() sequence following. 7976 * NOTE that EXACT is NOT covered here, as it is normally 7977 * picked up by the optimiser separately. 7978 * 7979 * This is unfortunate as the optimiser isnt handling lookahead 7980 * properly currently. 7981 * 7982 */ 7983 while ((OP(first) == OPEN && (sawopen = 1)) || 7984 /* An OR of *one* alternative - should not happen now. */ 7985 (OP(first) == BRANCH && OP(first_next) != BRANCH) || 7986 /* for now we can't handle lookbehind IFMATCH*/ 7987 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) || 7988 (OP(first) == PLUS) || 7989 (OP(first) == MINMOD) || 7990 /* An {n,m} with n>0 */ 7991 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || 7992 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) 7993 { 7994 /* 7995 * the only op that could be a regnode is PLUS, all the rest 7996 * will be regnode_1 or regnode_2. 7997 * 7998 * (yves doesn't think this is true) 7999 */ 8000 if (OP(first) == PLUS) 8001 sawplus = 1; 8002 else { 8003 if (OP(first) == MINMOD) 8004 sawminmod = 1; 8005 first += regarglen[OP(first)]; 8006 } 8007 first = NEXTOPER(first); 8008 first_next= regnext(first); 8009 } 8010 8011 /* Starting-point info. */ 8012 again: 8013 DEBUG_PEEP("first:", first, 0, 0); 8014 /* Ignore EXACT as we deal with it later. */ 8015 if (PL_regkind[OP(first)] == EXACT) { 8016 if ( OP(first) == EXACT 8017 || OP(first) == EXACT_ONLY8 8018 || OP(first) == EXACTL) 8019 { 8020 NOOP; /* Empty, get anchored substr later. */ 8021 } 8022 else 8023 RExC_rxi->regstclass = first; 8024 } 8025 #ifdef TRIE_STCLASS 8026 else if (PL_regkind[OP(first)] == TRIE && 8027 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0) 8028 { 8029 /* this can happen only on restudy */ 8030 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); 8031 } 8032 #endif 8033 else if (REGNODE_SIMPLE(OP(first))) 8034 RExC_rxi->regstclass = first; 8035 else if (PL_regkind[OP(first)] == BOUND || 8036 PL_regkind[OP(first)] == NBOUND) 8037 RExC_rxi->regstclass = first; 8038 else if (PL_regkind[OP(first)] == BOL) { 8039 RExC_rx->intflags |= (OP(first) == MBOL 8040 ? PREGf_ANCH_MBOL 8041 : PREGf_ANCH_SBOL); 8042 first = NEXTOPER(first); 8043 goto again; 8044 } 8045 else if (OP(first) == GPOS) { 8046 RExC_rx->intflags |= PREGf_ANCH_GPOS; 8047 first = NEXTOPER(first); 8048 goto again; 8049 } 8050 else if ((!sawopen || !RExC_sawback) && 8051 !sawlookahead && 8052 (OP(first) == STAR && 8053 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && 8054 !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks) 8055 { 8056 /* turn .* into ^.* with an implied $*=1 */ 8057 const int type = 8058 (OP(NEXTOPER(first)) == REG_ANY) 8059 ? PREGf_ANCH_MBOL 8060 : PREGf_ANCH_SBOL; 8061 RExC_rx->intflags |= (type | PREGf_IMPLICIT); 8062 first = NEXTOPER(first); 8063 goto again; 8064 } 8065 if (sawplus && !sawminmod && !sawlookahead 8066 && (!sawopen || !RExC_sawback) 8067 && !pRExC_state->code_blocks) /* May examine pos and $& */ 8068 /* x+ must match at the 1st pos of run of x's */ 8069 RExC_rx->intflags |= PREGf_SKIP; 8070 8071 /* Scan is after the zeroth branch, first is atomic matcher. */ 8072 #ifdef TRIE_STUDY_OPT 8073 DEBUG_PARSE_r( 8074 if (!restudied) 8075 Perl_re_printf( aTHX_ "first at %" IVdf "\n", 8076 (IV)(first - scan + 1)) 8077 ); 8078 #else 8079 DEBUG_PARSE_r( 8080 Perl_re_printf( aTHX_ "first at %" IVdf "\n", 8081 (IV)(first - scan + 1)) 8082 ); 8083 #endif 8084 8085 8086 /* 8087 * If there's something expensive in the r.e., find the 8088 * longest literal string that must appear and make it the 8089 * regmust. Resolve ties in favor of later strings, since 8090 * the regstart check works with the beginning of the r.e. 8091 * and avoiding duplication strengthens checking. Not a 8092 * strong reason, but sufficient in the absence of others. 8093 * [Now we resolve ties in favor of the earlier string if 8094 * it happens that c_offset_min has been invalidated, since the 8095 * earlier string may buy us something the later one won't.] 8096 */ 8097 8098 data.substrs[0].str = newSVpvs(""); 8099 data.substrs[1].str = newSVpvs(""); 8100 data.last_found = newSVpvs(""); 8101 data.cur_is_floating = 0; /* initially any found substring is fixed */ 8102 ENTER_with_name("study_chunk"); 8103 SAVEFREESV(data.substrs[0].str); 8104 SAVEFREESV(data.substrs[1].str); 8105 SAVEFREESV(data.last_found); 8106 first = scan; 8107 if (!RExC_rxi->regstclass) { 8108 ssc_init(pRExC_state, &ch_class); 8109 data.start_class = &ch_class; 8110 stclass_flag = SCF_DO_STCLASS_AND; 8111 } else /* XXXX Check for BOUND? */ 8112 stclass_flag = 0; 8113 data.last_closep = &last_close; 8114 8115 DEBUG_RExC_seen(); 8116 /* 8117 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/ 8118 * (NO top level branches) 8119 */ 8120 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, 8121 scan + RExC_size, /* Up to end */ 8122 &data, -1, 0, NULL, 8123 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag 8124 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), 8125 0, TRUE); 8126 8127 8128 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); 8129 8130 8131 if ( RExC_total_parens == 1 && !data.cur_is_floating 8132 && data.last_start_min == 0 && data.last_end > 0 8133 && !RExC_seen_zerolen 8134 && !(RExC_seen & REG_VERBARG_SEEN) 8135 && !(RExC_seen & REG_GPOS_SEEN) 8136 ){ 8137 RExC_rx->extflags |= RXf_CHECK_ALL; 8138 } 8139 scan_commit(pRExC_state, &data,&minlen, 0); 8140 8141 8142 /* XXX this is done in reverse order because that's the way the 8143 * code was before it was parameterised. Don't know whether it 8144 * actually needs doing in reverse order. DAPM */ 8145 for (i = 1; i >= 0; i--) { 8146 longest_length[i] = CHR_SVLEN(data.substrs[i].str); 8147 8148 if ( !( i 8149 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */ 8150 && data.substrs[0].min_offset 8151 == data.substrs[1].min_offset 8152 && SvCUR(data.substrs[0].str) 8153 == SvCUR(data.substrs[1].str) 8154 ) 8155 && S_setup_longest (aTHX_ pRExC_state, 8156 &(RExC_rx->substrs->data[i]), 8157 &(data.substrs[i]), 8158 longest_length[i])) 8159 { 8160 RExC_rx->substrs->data[i].min_offset = 8161 data.substrs[i].min_offset - data.substrs[i].lookbehind; 8162 8163 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset; 8164 /* Don't offset infinity */ 8165 if (data.substrs[i].max_offset < SSize_t_MAX) 8166 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind; 8167 SvREFCNT_inc_simple_void_NN(data.substrs[i].str); 8168 } 8169 else { 8170 RExC_rx->substrs->data[i].substr = NULL; 8171 RExC_rx->substrs->data[i].utf8_substr = NULL; 8172 longest_length[i] = 0; 8173 } 8174 } 8175 8176 LEAVE_with_name("study_chunk"); 8177 8178 if (RExC_rxi->regstclass 8179 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY)) 8180 RExC_rxi->regstclass = NULL; 8181 8182 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr) 8183 || RExC_rx->substrs->data[0].min_offset) 8184 && stclass_flag 8185 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) 8186 && is_ssc_worth_it(pRExC_state, data.start_class)) 8187 { 8188 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); 8189 8190 ssc_finalize(pRExC_state, data.start_class); 8191 8192 Newx(RExC_rxi->data->data[n], 1, regnode_ssc); 8193 StructCopy(data.start_class, 8194 (regnode_ssc*)RExC_rxi->data->data[n], 8195 regnode_ssc); 8196 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n]; 8197 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 8198 DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); 8199 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state); 8200 Perl_re_printf( aTHX_ 8201 "synthetic stclass \"%s\".\n", 8202 SvPVX_const(sv));}); 8203 data.start_class = NULL; 8204 } 8205 8206 /* A temporary algorithm prefers floated substr to fixed one of 8207 * same length to dig more info. */ 8208 i = (longest_length[0] <= longest_length[1]); 8209 RExC_rx->substrs->check_ix = i; 8210 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift; 8211 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr; 8212 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr; 8213 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset; 8214 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset; 8215 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))) 8216 RExC_rx->intflags |= PREGf_NOSCAN; 8217 8218 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) { 8219 RExC_rx->extflags |= RXf_USE_INTUIT; 8220 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8)) 8221 RExC_rx->extflags |= RXf_INTUIT_TAIL; 8222 } 8223 8224 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) 8225 if ( (STRLEN)minlen < longest_length[1] ) 8226 minlen= longest_length[1]; 8227 if ( (STRLEN)minlen < longest_length[0] ) 8228 minlen= longest_length[0]; 8229 */ 8230 } 8231 else { 8232 /* Several toplevels. Best we can is to set minlen. */ 8233 SSize_t fake; 8234 regnode_ssc ch_class; 8235 SSize_t last_close = 0; 8236 8237 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n")); 8238 8239 scan = RExC_rxi->program + 1; 8240 ssc_init(pRExC_state, &ch_class); 8241 data.start_class = &ch_class; 8242 data.last_closep = &last_close; 8243 8244 DEBUG_RExC_seen(); 8245 /* 8246 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../ 8247 * (patterns WITH top level branches) 8248 */ 8249 minlen = study_chunk(pRExC_state, 8250 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, 8251 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied 8252 ? SCF_TRIE_DOING_RESTUDY 8253 : 0), 8254 0, TRUE); 8255 8256 CHECK_RESTUDY_GOTO_butfirst(NOOP); 8257 8258 RExC_rx->check_substr = NULL; 8259 RExC_rx->check_utf8 = NULL; 8260 RExC_rx->substrs->data[0].substr = NULL; 8261 RExC_rx->substrs->data[0].utf8_substr = NULL; 8262 RExC_rx->substrs->data[1].substr = NULL; 8263 RExC_rx->substrs->data[1].utf8_substr = NULL; 8264 8265 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) 8266 && is_ssc_worth_it(pRExC_state, data.start_class)) 8267 { 8268 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); 8269 8270 ssc_finalize(pRExC_state, data.start_class); 8271 8272 Newx(RExC_rxi->data->data[n], 1, regnode_ssc); 8273 StructCopy(data.start_class, 8274 (regnode_ssc*)RExC_rxi->data->data[n], 8275 regnode_ssc); 8276 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n]; 8277 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 8278 DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); 8279 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state); 8280 Perl_re_printf( aTHX_ 8281 "synthetic stclass \"%s\".\n", 8282 SvPVX_const(sv));}); 8283 data.start_class = NULL; 8284 } 8285 } 8286 8287 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { 8288 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; 8289 RExC_rx->maxlen = REG_INFTY; 8290 } 8291 else { 8292 RExC_rx->maxlen = RExC_maxlen; 8293 } 8294 8295 /* Guard against an embedded (?=) or (?<=) with a longer minlen than 8296 the "real" pattern. */ 8297 DEBUG_OPTIMISE_r({ 8298 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n", 8299 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen); 8300 }); 8301 RExC_rx->minlenret = minlen; 8302 if (RExC_rx->minlen < minlen) 8303 RExC_rx->minlen = minlen; 8304 8305 if (RExC_seen & REG_RECURSE_SEEN ) { 8306 RExC_rx->intflags |= PREGf_RECURSE_SEEN; 8307 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *); 8308 } 8309 if (RExC_seen & REG_GPOS_SEEN) 8310 RExC_rx->intflags |= PREGf_GPOS_SEEN; 8311 if (RExC_seen & REG_LOOKBEHIND_SEEN) 8312 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the 8313 lookbehind */ 8314 if (pRExC_state->code_blocks) 8315 RExC_rx->extflags |= RXf_EVAL_SEEN; 8316 if (RExC_seen & REG_VERBARG_SEEN) 8317 { 8318 RExC_rx->intflags |= PREGf_VERBARG_SEEN; 8319 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ 8320 } 8321 if (RExC_seen & REG_CUTGROUP_SEEN) 8322 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN; 8323 if (pm_flags & PMf_USE_RE_EVAL) 8324 RExC_rx->intflags |= PREGf_USE_RE_EVAL; 8325 if (RExC_paren_names) 8326 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); 8327 else 8328 RXp_PAREN_NAMES(RExC_rx) = NULL; 8329 8330 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED 8331 * so it can be used in pp.c */ 8332 if (RExC_rx->intflags & PREGf_ANCH) 8333 RExC_rx->extflags |= RXf_IS_ANCHORED; 8334 8335 8336 { 8337 /* this is used to identify "special" patterns that might result 8338 * in Perl NOT calling the regex engine and instead doing the match "itself", 8339 * particularly special cases in split//. By having the regex compiler 8340 * do this pattern matching at a regop level (instead of by inspecting the pattern) 8341 * we avoid weird issues with equivalent patterns resulting in different behavior, 8342 * AND we allow non Perl engines to get the same optimizations by the setting the 8343 * flags appropriately - Yves */ 8344 regnode *first = RExC_rxi->program + 1; 8345 U8 fop = OP(first); 8346 regnode *next = regnext(first); 8347 U8 nop = OP(next); 8348 8349 if (PL_regkind[fop] == NOTHING && nop == END) 8350 RExC_rx->extflags |= RXf_NULL; 8351 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END) 8352 /* when fop is SBOL first->flags will be true only when it was 8353 * produced by parsing /\A/, and not when parsing /^/. This is 8354 * very important for the split code as there we want to 8355 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m. 8356 * See rt #122761 for more details. -- Yves */ 8357 RExC_rx->extflags |= RXf_START_ONLY; 8358 else if (fop == PLUS 8359 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE 8360 && nop == END) 8361 RExC_rx->extflags |= RXf_WHITE; 8362 else if ( RExC_rx->extflags & RXf_SPLIT 8363 && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL) 8364 && STR_LEN(first) == 1 8365 && *(STRING(first)) == ' ' 8366 && nop == END ) 8367 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 8368 8369 } 8370 8371 if (RExC_contains_locale) { 8372 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED; 8373 } 8374 8375 #ifdef DEBUGGING 8376 if (RExC_paren_names) { 8377 RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); 8378 RExC_rxi->data->data[RExC_rxi->name_list_idx] 8379 = (void*)SvREFCNT_inc(RExC_paren_name_list); 8380 } else 8381 #endif 8382 RExC_rxi->name_list_idx = 0; 8383 8384 while ( RExC_recurse_count > 0 ) { 8385 const regnode *scan = RExC_recurse[ --RExC_recurse_count ]; 8386 /* 8387 * This data structure is set up in study_chunk() and is used 8388 * to calculate the distance between a GOSUB regopcode and 8389 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's) 8390 * it refers to. 8391 * 8392 * If for some reason someone writes code that optimises 8393 * away a GOSUB opcode then the assert should be changed to 8394 * an if(scan) to guard the ARG2L_SET() - Yves 8395 * 8396 */ 8397 assert(scan && OP(scan) == GOSUB); 8398 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan)); 8399 } 8400 8401 Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair); 8402 /* assume we don't need to swap parens around before we match */ 8403 DEBUG_TEST_r({ 8404 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n", 8405 (unsigned long)RExC_study_chunk_recursed_count); 8406 }); 8407 DEBUG_DUMP_r({ 8408 DEBUG_RExC_seen(); 8409 Perl_re_printf( aTHX_ "Final program:\n"); 8410 regdump(RExC_rx); 8411 }); 8412 8413 if (RExC_open_parens) { 8414 Safefree(RExC_open_parens); 8415 RExC_open_parens = NULL; 8416 } 8417 if (RExC_close_parens) { 8418 Safefree(RExC_close_parens); 8419 RExC_close_parens = NULL; 8420 } 8421 8422 #ifdef USE_ITHREADS 8423 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated 8424 * by setting the regexp SV to readonly-only instead. If the 8425 * pattern's been recompiled, the USEDness should remain. */ 8426 if (old_re && SvREADONLY(old_re)) 8427 SvREADONLY_on(Rx); 8428 #endif 8429 return Rx; 8430 } 8431 8432 8433 SV* 8434 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, 8435 const U32 flags) 8436 { 8437 PERL_ARGS_ASSERT_REG_NAMED_BUFF; 8438 8439 PERL_UNUSED_ARG(value); 8440 8441 if (flags & RXapif_FETCH) { 8442 return reg_named_buff_fetch(rx, key, flags); 8443 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { 8444 Perl_croak_no_modify(); 8445 return NULL; 8446 } else if (flags & RXapif_EXISTS) { 8447 return reg_named_buff_exists(rx, key, flags) 8448 ? &PL_sv_yes 8449 : &PL_sv_no; 8450 } else if (flags & RXapif_REGNAMES) { 8451 return reg_named_buff_all(rx, flags); 8452 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { 8453 return reg_named_buff_scalar(rx, flags); 8454 } else { 8455 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); 8456 return NULL; 8457 } 8458 } 8459 8460 SV* 8461 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, 8462 const U32 flags) 8463 { 8464 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; 8465 PERL_UNUSED_ARG(lastkey); 8466 8467 if (flags & RXapif_FIRSTKEY) 8468 return reg_named_buff_firstkey(rx, flags); 8469 else if (flags & RXapif_NEXTKEY) 8470 return reg_named_buff_nextkey(rx, flags); 8471 else { 8472 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", 8473 (int)flags); 8474 return NULL; 8475 } 8476 } 8477 8478 SV* 8479 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, 8480 const U32 flags) 8481 { 8482 SV *ret; 8483 struct regexp *const rx = ReANY(r); 8484 8485 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; 8486 8487 if (rx && RXp_PAREN_NAMES(rx)) { 8488 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); 8489 if (he_str) { 8490 IV i; 8491 SV* sv_dat=HeVAL(he_str); 8492 I32 *nums=(I32*)SvPVX(sv_dat); 8493 AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL; 8494 for ( i=0; i<SvIVX(sv_dat); i++ ) { 8495 if ((I32)(rx->nparens) >= nums[i] 8496 && rx->offs[nums[i]].start != -1 8497 && rx->offs[nums[i]].end != -1) 8498 { 8499 ret = newSVpvs(""); 8500 CALLREG_NUMBUF_FETCH(r, nums[i], ret); 8501 if (!retarray) 8502 return ret; 8503 } else { 8504 if (retarray) 8505 ret = newSVsv(&PL_sv_undef); 8506 } 8507 if (retarray) 8508 av_push(retarray, ret); 8509 } 8510 if (retarray) 8511 return newRV_noinc(MUTABLE_SV(retarray)); 8512 } 8513 } 8514 return NULL; 8515 } 8516 8517 bool 8518 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, 8519 const U32 flags) 8520 { 8521 struct regexp *const rx = ReANY(r); 8522 8523 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; 8524 8525 if (rx && RXp_PAREN_NAMES(rx)) { 8526 if (flags & RXapif_ALL) { 8527 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); 8528 } else { 8529 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); 8530 if (sv) { 8531 SvREFCNT_dec_NN(sv); 8532 return TRUE; 8533 } else { 8534 return FALSE; 8535 } 8536 } 8537 } else { 8538 return FALSE; 8539 } 8540 } 8541 8542 SV* 8543 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) 8544 { 8545 struct regexp *const rx = ReANY(r); 8546 8547 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; 8548 8549 if ( rx && RXp_PAREN_NAMES(rx) ) { 8550 (void)hv_iterinit(RXp_PAREN_NAMES(rx)); 8551 8552 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); 8553 } else { 8554 return FALSE; 8555 } 8556 } 8557 8558 SV* 8559 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) 8560 { 8561 struct regexp *const rx = ReANY(r); 8562 GET_RE_DEBUG_FLAGS_DECL; 8563 8564 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; 8565 8566 if (rx && RXp_PAREN_NAMES(rx)) { 8567 HV *hv = RXp_PAREN_NAMES(rx); 8568 HE *temphe; 8569 while ( (temphe = hv_iternext_flags(hv, 0)) ) { 8570 IV i; 8571 IV parno = 0; 8572 SV* sv_dat = HeVAL(temphe); 8573 I32 *nums = (I32*)SvPVX(sv_dat); 8574 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 8575 if ((I32)(rx->lastparen) >= nums[i] && 8576 rx->offs[nums[i]].start != -1 && 8577 rx->offs[nums[i]].end != -1) 8578 { 8579 parno = nums[i]; 8580 break; 8581 } 8582 } 8583 if (parno || flags & RXapif_ALL) { 8584 return newSVhek(HeKEY_hek(temphe)); 8585 } 8586 } 8587 } 8588 return NULL; 8589 } 8590 8591 SV* 8592 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) 8593 { 8594 SV *ret; 8595 AV *av; 8596 SSize_t length; 8597 struct regexp *const rx = ReANY(r); 8598 8599 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; 8600 8601 if (rx && RXp_PAREN_NAMES(rx)) { 8602 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { 8603 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); 8604 } else if (flags & RXapif_ONE) { 8605 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); 8606 av = MUTABLE_AV(SvRV(ret)); 8607 length = av_tindex(av); 8608 SvREFCNT_dec_NN(ret); 8609 return newSViv(length + 1); 8610 } else { 8611 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", 8612 (int)flags); 8613 return NULL; 8614 } 8615 } 8616 return &PL_sv_undef; 8617 } 8618 8619 SV* 8620 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) 8621 { 8622 struct regexp *const rx = ReANY(r); 8623 AV *av = newAV(); 8624 8625 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; 8626 8627 if (rx && RXp_PAREN_NAMES(rx)) { 8628 HV *hv= RXp_PAREN_NAMES(rx); 8629 HE *temphe; 8630 (void)hv_iterinit(hv); 8631 while ( (temphe = hv_iternext_flags(hv, 0)) ) { 8632 IV i; 8633 IV parno = 0; 8634 SV* sv_dat = HeVAL(temphe); 8635 I32 *nums = (I32*)SvPVX(sv_dat); 8636 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 8637 if ((I32)(rx->lastparen) >= nums[i] && 8638 rx->offs[nums[i]].start != -1 && 8639 rx->offs[nums[i]].end != -1) 8640 { 8641 parno = nums[i]; 8642 break; 8643 } 8644 } 8645 if (parno || flags & RXapif_ALL) { 8646 av_push(av, newSVhek(HeKEY_hek(temphe))); 8647 } 8648 } 8649 } 8650 8651 return newRV_noinc(MUTABLE_SV(av)); 8652 } 8653 8654 void 8655 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, 8656 SV * const sv) 8657 { 8658 struct regexp *const rx = ReANY(r); 8659 char *s = NULL; 8660 SSize_t i = 0; 8661 SSize_t s1, t1; 8662 I32 n = paren; 8663 8664 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; 8665 8666 if ( n == RX_BUFF_IDX_CARET_PREMATCH 8667 || n == RX_BUFF_IDX_CARET_FULLMATCH 8668 || n == RX_BUFF_IDX_CARET_POSTMATCH 8669 ) 8670 { 8671 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); 8672 if (!keepcopy) { 8673 /* on something like 8674 * $r = qr/.../; 8675 * /$qr/p; 8676 * the KEEPCOPY is set on the PMOP rather than the regex */ 8677 if (PL_curpm && r == PM_GETRE(PL_curpm)) 8678 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); 8679 } 8680 if (!keepcopy) 8681 goto ret_undef; 8682 } 8683 8684 if (!rx->subbeg) 8685 goto ret_undef; 8686 8687 if (n == RX_BUFF_IDX_CARET_FULLMATCH) 8688 /* no need to distinguish between them any more */ 8689 n = RX_BUFF_IDX_FULLMATCH; 8690 8691 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) 8692 && rx->offs[0].start != -1) 8693 { 8694 /* $`, ${^PREMATCH} */ 8695 i = rx->offs[0].start; 8696 s = rx->subbeg; 8697 } 8698 else 8699 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) 8700 && rx->offs[0].end != -1) 8701 { 8702 /* $', ${^POSTMATCH} */ 8703 s = rx->subbeg - rx->suboffset + rx->offs[0].end; 8704 i = rx->sublen + rx->suboffset - rx->offs[0].end; 8705 } 8706 else 8707 if ( 0 <= n && n <= (I32)rx->nparens && 8708 (s1 = rx->offs[n].start) != -1 && 8709 (t1 = rx->offs[n].end) != -1) 8710 { 8711 /* $&, ${^MATCH}, $1 ... */ 8712 i = t1 - s1; 8713 s = rx->subbeg + s1 - rx->suboffset; 8714 } else { 8715 goto ret_undef; 8716 } 8717 8718 assert(s >= rx->subbeg); 8719 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); 8720 if (i >= 0) { 8721 #ifdef NO_TAINT_SUPPORT 8722 sv_setpvn(sv, s, i); 8723 #else 8724 const int oldtainted = TAINT_get; 8725 TAINT_NOT; 8726 sv_setpvn(sv, s, i); 8727 TAINT_set(oldtainted); 8728 #endif 8729 if (RXp_MATCH_UTF8(rx)) 8730 SvUTF8_on(sv); 8731 else 8732 SvUTF8_off(sv); 8733 if (TAINTING_get) { 8734 if (RXp_MATCH_TAINTED(rx)) { 8735 if (SvTYPE(sv) >= SVt_PVMG) { 8736 MAGIC* const mg = SvMAGIC(sv); 8737 MAGIC* mgt; 8738 TAINT; 8739 SvMAGIC_set(sv, mg->mg_moremagic); 8740 SvTAINT(sv); 8741 if ((mgt = SvMAGIC(sv))) { 8742 mg->mg_moremagic = mgt; 8743 SvMAGIC_set(sv, mg); 8744 } 8745 } else { 8746 TAINT; 8747 SvTAINT(sv); 8748 } 8749 } else 8750 SvTAINTED_off(sv); 8751 } 8752 } else { 8753 ret_undef: 8754 sv_set_undef(sv); 8755 return; 8756 } 8757 } 8758 8759 void 8760 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, 8761 SV const * const value) 8762 { 8763 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; 8764 8765 PERL_UNUSED_ARG(rx); 8766 PERL_UNUSED_ARG(paren); 8767 PERL_UNUSED_ARG(value); 8768 8769 if (!PL_localizing) 8770 Perl_croak_no_modify(); 8771 } 8772 8773 I32 8774 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, 8775 const I32 paren) 8776 { 8777 struct regexp *const rx = ReANY(r); 8778 I32 i; 8779 I32 s1, t1; 8780 8781 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; 8782 8783 if ( paren == RX_BUFF_IDX_CARET_PREMATCH 8784 || paren == RX_BUFF_IDX_CARET_FULLMATCH 8785 || paren == RX_BUFF_IDX_CARET_POSTMATCH 8786 ) 8787 { 8788 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); 8789 if (!keepcopy) { 8790 /* on something like 8791 * $r = qr/.../; 8792 * /$qr/p; 8793 * the KEEPCOPY is set on the PMOP rather than the regex */ 8794 if (PL_curpm && r == PM_GETRE(PL_curpm)) 8795 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); 8796 } 8797 if (!keepcopy) 8798 goto warn_undef; 8799 } 8800 8801 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */ 8802 switch (paren) { 8803 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ 8804 case RX_BUFF_IDX_PREMATCH: /* $` */ 8805 if (rx->offs[0].start != -1) { 8806 i = rx->offs[0].start; 8807 if (i > 0) { 8808 s1 = 0; 8809 t1 = i; 8810 goto getlen; 8811 } 8812 } 8813 return 0; 8814 8815 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ 8816 case RX_BUFF_IDX_POSTMATCH: /* $' */ 8817 if (rx->offs[0].end != -1) { 8818 i = rx->sublen - rx->offs[0].end; 8819 if (i > 0) { 8820 s1 = rx->offs[0].end; 8821 t1 = rx->sublen; 8822 goto getlen; 8823 } 8824 } 8825 return 0; 8826 8827 default: /* $& / ${^MATCH}, $1, $2, ... */ 8828 if (paren <= (I32)rx->nparens && 8829 (s1 = rx->offs[paren].start) != -1 && 8830 (t1 = rx->offs[paren].end) != -1) 8831 { 8832 i = t1 - s1; 8833 goto getlen; 8834 } else { 8835 warn_undef: 8836 if (ckWARN(WARN_UNINITIALIZED)) 8837 report_uninit((const SV *)sv); 8838 return 0; 8839 } 8840 } 8841 getlen: 8842 if (i > 0 && RXp_MATCH_UTF8(rx)) { 8843 const char * const s = rx->subbeg - rx->suboffset + s1; 8844 const U8 *ep; 8845 STRLEN el; 8846 8847 i = t1 - s1; 8848 if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) 8849 i = el; 8850 } 8851 return i; 8852 } 8853 8854 SV* 8855 Perl_reg_qr_package(pTHX_ REGEXP * const rx) 8856 { 8857 PERL_ARGS_ASSERT_REG_QR_PACKAGE; 8858 PERL_UNUSED_ARG(rx); 8859 if (0) 8860 return NULL; 8861 else 8862 return newSVpvs("Regexp"); 8863 } 8864 8865 /* Scans the name of a named buffer from the pattern. 8866 * If flags is REG_RSN_RETURN_NULL returns null. 8867 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name 8868 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding 8869 * to the parsed name as looked up in the RExC_paren_names hash. 8870 * If there is an error throws a vFAIL().. type exception. 8871 */ 8872 8873 #define REG_RSN_RETURN_NULL 0 8874 #define REG_RSN_RETURN_NAME 1 8875 #define REG_RSN_RETURN_DATA 2 8876 8877 STATIC SV* 8878 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) 8879 { 8880 char *name_start = RExC_parse; 8881 SV* sv_name; 8882 8883 PERL_ARGS_ASSERT_REG_SCAN_NAME; 8884 8885 assert (RExC_parse <= RExC_end); 8886 if (RExC_parse == RExC_end) NOOP; 8887 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) { 8888 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by 8889 * using do...while */ 8890 if (UTF) 8891 do { 8892 RExC_parse += UTF8SKIP(RExC_parse); 8893 } while ( RExC_parse < RExC_end 8894 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end)); 8895 else 8896 do { 8897 RExC_parse++; 8898 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse)); 8899 } else { 8900 RExC_parse++; /* so the <- from the vFAIL is after the offending 8901 character */ 8902 vFAIL("Group name must start with a non-digit word character"); 8903 } 8904 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), 8905 SVs_TEMP | (UTF ? SVf_UTF8 : 0)); 8906 if ( flags == REG_RSN_RETURN_NAME) 8907 return sv_name; 8908 else if (flags==REG_RSN_RETURN_DATA) { 8909 HE *he_str = NULL; 8910 SV *sv_dat = NULL; 8911 if ( ! sv_name ) /* should not happen*/ 8912 Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); 8913 if (RExC_paren_names) 8914 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); 8915 if ( he_str ) 8916 sv_dat = HeVAL(he_str); 8917 if ( ! sv_dat ) { /* Didn't find group */ 8918 8919 /* It might be a forward reference; we can't fail until we 8920 * know, by completing the parse to get all the groups, and 8921 * then reparsing */ 8922 if (ALL_PARENS_COUNTED) { 8923 vFAIL("Reference to nonexistent named group"); 8924 } 8925 else { 8926 REQUIRE_PARENS_PASS; 8927 } 8928 } 8929 return sv_dat; 8930 } 8931 8932 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", 8933 (unsigned long) flags); 8934 } 8935 8936 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ 8937 if (RExC_lastparse!=RExC_parse) { \ 8938 Perl_re_printf( aTHX_ "%s", \ 8939 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \ 8940 RExC_end - RExC_parse, 16, \ 8941 "", "", \ 8942 PERL_PV_ESCAPE_UNI_DETECT | \ 8943 PERL_PV_PRETTY_ELLIPSES | \ 8944 PERL_PV_PRETTY_LTGT | \ 8945 PERL_PV_ESCAPE_RE | \ 8946 PERL_PV_PRETTY_EXACTSIZE \ 8947 ) \ 8948 ); \ 8949 } else \ 8950 Perl_re_printf( aTHX_ "%16s",""); \ 8951 \ 8952 if (RExC_lastnum!=RExC_emit) \ 8953 Perl_re_printf( aTHX_ "|%4d", RExC_emit); \ 8954 else \ 8955 Perl_re_printf( aTHX_ "|%4s",""); \ 8956 Perl_re_printf( aTHX_ "|%*s%-4s", \ 8957 (int)((depth*2)), "", \ 8958 (funcname) \ 8959 ); \ 8960 RExC_lastnum=RExC_emit; \ 8961 RExC_lastparse=RExC_parse; \ 8962 }) 8963 8964 8965 8966 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ 8967 DEBUG_PARSE_MSG((funcname)); \ 8968 Perl_re_printf( aTHX_ "%4s","\n"); \ 8969 }) 8970 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\ 8971 DEBUG_PARSE_MSG((funcname)); \ 8972 Perl_re_printf( aTHX_ fmt "\n",args); \ 8973 }) 8974 8975 /* This section of code defines the inversion list object and its methods. The 8976 * interfaces are highly subject to change, so as much as possible is static to 8977 * this file. An inversion list is here implemented as a malloc'd C UV array 8978 * as an SVt_INVLIST scalar. 8979 * 8980 * An inversion list for Unicode is an array of code points, sorted by ordinal 8981 * number. Each element gives the code point that begins a range that extends 8982 * up-to but not including the code point given by the next element. The final 8983 * element gives the first code point of a range that extends to the platform's 8984 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4], 8985 * ...) give ranges whose code points are all in the inversion list. We say 8986 * that those ranges are in the set. The odd-numbered elements give ranges 8987 * whose code points are not in the inversion list, and hence not in the set. 8988 * Thus, element [0] is the first code point in the list. Element [1] 8989 * is the first code point beyond that not in the list; and element [2] is the 8990 * first code point beyond that that is in the list. In other words, the first 8991 * range is invlist[0]..(invlist[1]-1), and all code points in that range are 8992 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and 8993 * all code points in that range are not in the inversion list. The third 8994 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion 8995 * list, and so forth. Thus every element whose index is divisible by two 8996 * gives the beginning of a range that is in the list, and every element whose 8997 * index is not divisible by two gives the beginning of a range not in the 8998 * list. If the final element's index is divisible by two, the inversion list 8999 * extends to the platform's infinity; otherwise the highest code point in the 9000 * inversion list is the contents of that element minus 1. 9001 * 9002 * A range that contains just a single code point N will look like 9003 * invlist[i] == N 9004 * invlist[i+1] == N+1 9005 * 9006 * If N is UV_MAX (the highest representable code point on the machine), N+1 is 9007 * impossible to represent, so element [i+1] is omitted. The single element 9008 * inversion list 9009 * invlist[0] == UV_MAX 9010 * contains just UV_MAX, but is interpreted as matching to infinity. 9011 * 9012 * Taking the complement (inverting) an inversion list is quite simple, if the 9013 * first element is 0, remove it; otherwise add a 0 element at the beginning. 9014 * This implementation reserves an element at the beginning of each inversion 9015 * list to always contain 0; there is an additional flag in the header which 9016 * indicates if the list begins at the 0, or is offset to begin at the next 9017 * element. This means that the inversion list can be inverted without any 9018 * copying; just flip the flag. 9019 * 9020 * More about inversion lists can be found in "Unicode Demystified" 9021 * Chapter 13 by Richard Gillam, published by Addison-Wesley. 9022 * 9023 * The inversion list data structure is currently implemented as an SV pointing 9024 * to an array of UVs that the SV thinks are bytes. This allows us to have an 9025 * array of UV whose memory management is automatically handled by the existing 9026 * facilities for SV's. 9027 * 9028 * Some of the methods should always be private to the implementation, and some 9029 * should eventually be made public */ 9030 9031 /* The header definitions are in F<invlist_inline.h> */ 9032 9033 #ifndef PERL_IN_XSUB_RE 9034 9035 PERL_STATIC_INLINE UV* 9036 S__invlist_array_init(SV* const invlist, const bool will_have_0) 9037 { 9038 /* Returns a pointer to the first element in the inversion list's array. 9039 * This is called upon initialization of an inversion list. Where the 9040 * array begins depends on whether the list has the code point U+0000 in it 9041 * or not. The other parameter tells it whether the code that follows this 9042 * call is about to put a 0 in the inversion list or not. The first 9043 * element is either the element reserved for 0, if TRUE, or the element 9044 * after it, if FALSE */ 9045 9046 bool* offset = get_invlist_offset_addr(invlist); 9047 UV* zero_addr = (UV *) SvPVX(invlist); 9048 9049 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; 9050 9051 /* Must be empty */ 9052 assert(! _invlist_len(invlist)); 9053 9054 *zero_addr = 0; 9055 9056 /* 1^1 = 0; 1^0 = 1 */ 9057 *offset = 1 ^ will_have_0; 9058 return zero_addr + *offset; 9059 } 9060 9061 PERL_STATIC_INLINE void 9062 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) 9063 { 9064 /* Sets the current number of elements stored in the inversion list. 9065 * Updates SvCUR correspondingly */ 9066 PERL_UNUSED_CONTEXT; 9067 PERL_ARGS_ASSERT_INVLIST_SET_LEN; 9068 9069 assert(is_invlist(invlist)); 9070 9071 SvCUR_set(invlist, 9072 (len == 0) 9073 ? 0 9074 : TO_INTERNAL_SIZE(len + offset)); 9075 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); 9076 } 9077 9078 STATIC void 9079 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src) 9080 { 9081 /* Replaces the inversion list in 'dest' with the one from 'src'. It 9082 * steals the list from 'src', so 'src' is made to have a NULL list. This 9083 * is similar to what SvSetMagicSV() would do, if it were implemented on 9084 * inversion lists, though this routine avoids a copy */ 9085 9086 const UV src_len = _invlist_len(src); 9087 const bool src_offset = *get_invlist_offset_addr(src); 9088 const STRLEN src_byte_len = SvLEN(src); 9089 char * array = SvPVX(src); 9090 9091 const int oldtainted = TAINT_get; 9092 9093 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC; 9094 9095 assert(is_invlist(src)); 9096 assert(is_invlist(dest)); 9097 assert(! invlist_is_iterating(src)); 9098 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src)); 9099 9100 /* Make sure it ends in the right place with a NUL, as our inversion list 9101 * manipulations aren't careful to keep this true, but sv_usepvn_flags() 9102 * asserts it */ 9103 array[src_byte_len - 1] = '\0'; 9104 9105 TAINT_NOT; /* Otherwise it breaks */ 9106 sv_usepvn_flags(dest, 9107 (char *) array, 9108 src_byte_len - 1, 9109 9110 /* This flag is documented to cause a copy to be avoided */ 9111 SV_HAS_TRAILING_NUL); 9112 TAINT_set(oldtainted); 9113 SvPV_set(src, 0); 9114 SvLEN_set(src, 0); 9115 SvCUR_set(src, 0); 9116 9117 /* Finish up copying over the other fields in an inversion list */ 9118 *get_invlist_offset_addr(dest) = src_offset; 9119 invlist_set_len(dest, src_len, src_offset); 9120 *get_invlist_previous_index_addr(dest) = 0; 9121 invlist_iterfinish(dest); 9122 } 9123 9124 PERL_STATIC_INLINE IV* 9125 S_get_invlist_previous_index_addr(SV* invlist) 9126 { 9127 /* Return the address of the IV that is reserved to hold the cached index 9128 * */ 9129 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; 9130 9131 assert(is_invlist(invlist)); 9132 9133 return &(((XINVLIST*) SvANY(invlist))->prev_index); 9134 } 9135 9136 PERL_STATIC_INLINE IV 9137 S_invlist_previous_index(SV* const invlist) 9138 { 9139 /* Returns cached index of previous search */ 9140 9141 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; 9142 9143 return *get_invlist_previous_index_addr(invlist); 9144 } 9145 9146 PERL_STATIC_INLINE void 9147 S_invlist_set_previous_index(SV* const invlist, const IV index) 9148 { 9149 /* Caches <index> for later retrieval */ 9150 9151 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; 9152 9153 assert(index == 0 || index < (int) _invlist_len(invlist)); 9154 9155 *get_invlist_previous_index_addr(invlist) = index; 9156 } 9157 9158 PERL_STATIC_INLINE void 9159 S_invlist_trim(SV* invlist) 9160 { 9161 /* Free the not currently-being-used space in an inversion list */ 9162 9163 /* But don't free up the space needed for the 0 UV that is always at the 9164 * beginning of the list, nor the trailing NUL */ 9165 const UV min_size = TO_INTERNAL_SIZE(1) + 1; 9166 9167 PERL_ARGS_ASSERT_INVLIST_TRIM; 9168 9169 assert(is_invlist(invlist)); 9170 9171 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1)); 9172 } 9173 9174 PERL_STATIC_INLINE void 9175 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */ 9176 { 9177 PERL_ARGS_ASSERT_INVLIST_CLEAR; 9178 9179 assert(is_invlist(invlist)); 9180 9181 invlist_set_len(invlist, 0, 0); 9182 invlist_trim(invlist); 9183 } 9184 9185 #endif /* ifndef PERL_IN_XSUB_RE */ 9186 9187 PERL_STATIC_INLINE bool 9188 S_invlist_is_iterating(SV* const invlist) 9189 { 9190 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; 9191 9192 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; 9193 } 9194 9195 #ifndef PERL_IN_XSUB_RE 9196 9197 PERL_STATIC_INLINE UV 9198 S_invlist_max(SV* const invlist) 9199 { 9200 /* Returns the maximum number of elements storable in the inversion list's 9201 * array, without having to realloc() */ 9202 9203 PERL_ARGS_ASSERT_INVLIST_MAX; 9204 9205 assert(is_invlist(invlist)); 9206 9207 /* Assumes worst case, in which the 0 element is not counted in the 9208 * inversion list, so subtracts 1 for that */ 9209 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ 9210 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 9211 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; 9212 } 9213 9214 STATIC void 9215 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size) 9216 { 9217 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS; 9218 9219 /* First 1 is in case the zero element isn't in the list; second 1 is for 9220 * trailing NUL */ 9221 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1); 9222 invlist_set_len(invlist, 0, 0); 9223 9224 /* Force iterinit() to be used to get iteration to work */ 9225 invlist_iterfinish(invlist); 9226 9227 *get_invlist_previous_index_addr(invlist) = 0; 9228 } 9229 9230 SV* 9231 Perl__new_invlist(pTHX_ IV initial_size) 9232 { 9233 9234 /* Return a pointer to a newly constructed inversion list, with enough 9235 * space to store 'initial_size' elements. If that number is negative, a 9236 * system default is used instead */ 9237 9238 SV* new_list; 9239 9240 if (initial_size < 0) { 9241 initial_size = 10; 9242 } 9243 9244 new_list = newSV_type(SVt_INVLIST); 9245 initialize_invlist_guts(new_list, initial_size); 9246 9247 return new_list; 9248 } 9249 9250 SV* 9251 Perl__new_invlist_C_array(pTHX_ const UV* const list) 9252 { 9253 /* Return a pointer to a newly constructed inversion list, initialized to 9254 * point to <list>, which has to be in the exact correct inversion list 9255 * form, including internal fields. Thus this is a dangerous routine that 9256 * should not be used in the wrong hands. The passed in 'list' contains 9257 * several header fields at the beginning that are not part of the 9258 * inversion list body proper */ 9259 9260 const STRLEN length = (STRLEN) list[0]; 9261 const UV version_id = list[1]; 9262 const bool offset = cBOOL(list[2]); 9263 #define HEADER_LENGTH 3 9264 /* If any of the above changes in any way, you must change HEADER_LENGTH 9265 * (if appropriate) and regenerate INVLIST_VERSION_ID by running 9266 * perl -E 'say int(rand 2**31-1)' 9267 */ 9268 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and 9269 data structure type, so that one being 9270 passed in can be validated to be an 9271 inversion list of the correct vintage. 9272 */ 9273 9274 SV* invlist = newSV_type(SVt_INVLIST); 9275 9276 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; 9277 9278 if (version_id != INVLIST_VERSION_ID) { 9279 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); 9280 } 9281 9282 /* The generated array passed in includes header elements that aren't part 9283 * of the list proper, so start it just after them */ 9284 SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); 9285 9286 SvLEN_set(invlist, 0); /* Means we own the contents, and the system 9287 shouldn't touch it */ 9288 9289 *(get_invlist_offset_addr(invlist)) = offset; 9290 9291 /* The 'length' passed to us is the physical number of elements in the 9292 * inversion list. But if there is an offset the logical number is one 9293 * less than that */ 9294 invlist_set_len(invlist, length - offset, offset); 9295 9296 invlist_set_previous_index(invlist, 0); 9297 9298 /* Initialize the iteration pointer. */ 9299 invlist_iterfinish(invlist); 9300 9301 SvREADONLY_on(invlist); 9302 9303 return invlist; 9304 } 9305 9306 STATIC void 9307 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) 9308 { 9309 /* Grow the maximum size of an inversion list */ 9310 9311 PERL_ARGS_ASSERT_INVLIST_EXTEND; 9312 9313 assert(is_invlist(invlist)); 9314 9315 /* Add one to account for the zero element at the beginning which may not 9316 * be counted by the calling parameters */ 9317 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); 9318 } 9319 9320 STATIC void 9321 S__append_range_to_invlist(pTHX_ SV* const invlist, 9322 const UV start, const UV end) 9323 { 9324 /* Subject to change or removal. Append the range from 'start' to 'end' at 9325 * the end of the inversion list. The range must be above any existing 9326 * ones. */ 9327 9328 UV* array; 9329 UV max = invlist_max(invlist); 9330 UV len = _invlist_len(invlist); 9331 bool offset; 9332 9333 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; 9334 9335 if (len == 0) { /* Empty lists must be initialized */ 9336 offset = start != 0; 9337 array = _invlist_array_init(invlist, ! offset); 9338 } 9339 else { 9340 /* Here, the existing list is non-empty. The current max entry in the 9341 * list is generally the first value not in the set, except when the 9342 * set extends to the end of permissible values, in which case it is 9343 * the first entry in that final set, and so this call is an attempt to 9344 * append out-of-order */ 9345 9346 UV final_element = len - 1; 9347 array = invlist_array(invlist); 9348 if ( array[final_element] > start 9349 || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) 9350 { 9351 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c", 9352 array[final_element], start, 9353 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); 9354 } 9355 9356 /* Here, it is a legal append. If the new range begins 1 above the end 9357 * of the range below it, it is extending the range below it, so the 9358 * new first value not in the set is one greater than the newly 9359 * extended range. */ 9360 offset = *get_invlist_offset_addr(invlist); 9361 if (array[final_element] == start) { 9362 if (end != UV_MAX) { 9363 array[final_element] = end + 1; 9364 } 9365 else { 9366 /* But if the end is the maximum representable on the machine, 9367 * assume that infinity was actually what was meant. Just let 9368 * the range that this would extend to have no end */ 9369 invlist_set_len(invlist, len - 1, offset); 9370 } 9371 return; 9372 } 9373 } 9374 9375 /* Here the new range doesn't extend any existing set. Add it */ 9376 9377 len += 2; /* Includes an element each for the start and end of range */ 9378 9379 /* If wll overflow the existing space, extend, which may cause the array to 9380 * be moved */ 9381 if (max < len) { 9382 invlist_extend(invlist, len); 9383 9384 /* Have to set len here to avoid assert failure in invlist_array() */ 9385 invlist_set_len(invlist, len, offset); 9386 9387 array = invlist_array(invlist); 9388 } 9389 else { 9390 invlist_set_len(invlist, len, offset); 9391 } 9392 9393 /* The next item on the list starts the range, the one after that is 9394 * one past the new range. */ 9395 array[len - 2] = start; 9396 if (end != UV_MAX) { 9397 array[len - 1] = end + 1; 9398 } 9399 else { 9400 /* But if the end is the maximum representable on the machine, just let 9401 * the range have no end */ 9402 invlist_set_len(invlist, len - 1, offset); 9403 } 9404 } 9405 9406 SSize_t 9407 Perl__invlist_search(SV* const invlist, const UV cp) 9408 { 9409 /* Searches the inversion list for the entry that contains the input code 9410 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the 9411 * return value is the index into the list's array of the range that 9412 * contains <cp>, that is, 'i' such that 9413 * array[i] <= cp < array[i+1] 9414 */ 9415 9416 IV low = 0; 9417 IV mid; 9418 IV high = _invlist_len(invlist); 9419 const IV highest_element = high - 1; 9420 const UV* array; 9421 9422 PERL_ARGS_ASSERT__INVLIST_SEARCH; 9423 9424 /* If list is empty, return failure. */ 9425 if (high == 0) { 9426 return -1; 9427 } 9428 9429 /* (We can't get the array unless we know the list is non-empty) */ 9430 array = invlist_array(invlist); 9431 9432 mid = invlist_previous_index(invlist); 9433 assert(mid >=0); 9434 if (mid > highest_element) { 9435 mid = highest_element; 9436 } 9437 9438 /* <mid> contains the cache of the result of the previous call to this 9439 * function (0 the first time). See if this call is for the same result, 9440 * or if it is for mid-1. This is under the theory that calls to this 9441 * function will often be for related code points that are near each other. 9442 * And benchmarks show that caching gives better results. We also test 9443 * here if the code point is within the bounds of the list. These tests 9444 * replace others that would have had to be made anyway to make sure that 9445 * the array bounds were not exceeded, and these give us extra information 9446 * at the same time */ 9447 if (cp >= array[mid]) { 9448 if (cp >= array[highest_element]) { 9449 return highest_element; 9450 } 9451 9452 /* Here, array[mid] <= cp < array[highest_element]. This means that 9453 * the final element is not the answer, so can exclude it; it also 9454 * means that <mid> is not the final element, so can refer to 'mid + 1' 9455 * safely */ 9456 if (cp < array[mid + 1]) { 9457 return mid; 9458 } 9459 high--; 9460 low = mid + 1; 9461 } 9462 else { /* cp < aray[mid] */ 9463 if (cp < array[0]) { /* Fail if outside the array */ 9464 return -1; 9465 } 9466 high = mid; 9467 if (cp >= array[mid - 1]) { 9468 goto found_entry; 9469 } 9470 } 9471 9472 /* Binary search. What we are looking for is <i> such that 9473 * array[i] <= cp < array[i+1] 9474 * The loop below converges on the i+1. Note that there may not be an 9475 * (i+1)th element in the array, and things work nonetheless */ 9476 while (low < high) { 9477 mid = (low + high) / 2; 9478 assert(mid <= highest_element); 9479 if (array[mid] <= cp) { /* cp >= array[mid] */ 9480 low = mid + 1; 9481 9482 /* We could do this extra test to exit the loop early. 9483 if (cp < array[low]) { 9484 return mid; 9485 } 9486 */ 9487 } 9488 else { /* cp < array[mid] */ 9489 high = mid; 9490 } 9491 } 9492 9493 found_entry: 9494 high--; 9495 invlist_set_previous_index(invlist, high); 9496 return high; 9497 } 9498 9499 void 9500 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, 9501 const bool complement_b, SV** output) 9502 { 9503 /* Take the union of two inversion lists and point '*output' to it. On 9504 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly 9505 * even 'a' or 'b'). If to an inversion list, the contents of the original 9506 * list will be replaced by the union. The first list, 'a', may be 9507 * NULL, in which case a copy of the second list is placed in '*output'. 9508 * If 'complement_b' is TRUE, the union is taken of the complement 9509 * (inversion) of 'b' instead of b itself. 9510 * 9511 * The basis for this comes from "Unicode Demystified" Chapter 13 by 9512 * Richard Gillam, published by Addison-Wesley, and explained at some 9513 * length there. The preface says to incorporate its examples into your 9514 * code at your own risk. 9515 * 9516 * The algorithm is like a merge sort. */ 9517 9518 const UV* array_a; /* a's array */ 9519 const UV* array_b; 9520 UV len_a; /* length of a's array */ 9521 UV len_b; 9522 9523 SV* u; /* the resulting union */ 9524 UV* array_u; 9525 UV len_u = 0; 9526 9527 UV i_a = 0; /* current index into a's array */ 9528 UV i_b = 0; 9529 UV i_u = 0; 9530 9531 /* running count, as explained in the algorithm source book; items are 9532 * stopped accumulating and are output when the count changes to/from 0. 9533 * The count is incremented when we start a range that's in an input's set, 9534 * and decremented when we start a range that's not in a set. So this 9535 * variable can be 0, 1, or 2. When it is 0 neither input is in their set, 9536 * and hence nothing goes into the union; 1, just one of the inputs is in 9537 * its set (and its current range gets added to the union); and 2 when both 9538 * inputs are in their sets. */ 9539 UV count = 0; 9540 9541 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; 9542 assert(a != b); 9543 assert(*output == NULL || is_invlist(*output)); 9544 9545 len_b = _invlist_len(b); 9546 if (len_b == 0) { 9547 9548 /* Here, 'b' is empty, hence it's complement is all possible code 9549 * points. So if the union includes the complement of 'b', it includes 9550 * everything, and we need not even look at 'a'. It's easiest to 9551 * create a new inversion list that matches everything. */ 9552 if (complement_b) { 9553 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX); 9554 9555 if (*output == NULL) { /* If the output didn't exist, just point it 9556 at the new list */ 9557 *output = everything; 9558 } 9559 else { /* Otherwise, replace its contents with the new list */ 9560 invlist_replace_list_destroys_src(*output, everything); 9561 SvREFCNT_dec_NN(everything); 9562 } 9563 9564 return; 9565 } 9566 9567 /* Here, we don't want the complement of 'b', and since 'b' is empty, 9568 * the union will come entirely from 'a'. If 'a' is NULL or empty, the 9569 * output will be empty */ 9570 9571 if (a == NULL || _invlist_len(a) == 0) { 9572 if (*output == NULL) { 9573 *output = _new_invlist(0); 9574 } 9575 else { 9576 invlist_clear(*output); 9577 } 9578 return; 9579 } 9580 9581 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the 9582 * union. We can just return a copy of 'a' if '*output' doesn't point 9583 * to an existing list */ 9584 if (*output == NULL) { 9585 *output = invlist_clone(a, NULL); 9586 return; 9587 } 9588 9589 /* If the output is to overwrite 'a', we have a no-op, as it's 9590 * already in 'a' */ 9591 if (*output == a) { 9592 return; 9593 } 9594 9595 /* Here, '*output' is to be overwritten by 'a' */ 9596 u = invlist_clone(a, NULL); 9597 invlist_replace_list_destroys_src(*output, u); 9598 SvREFCNT_dec_NN(u); 9599 9600 return; 9601 } 9602 9603 /* Here 'b' is not empty. See about 'a' */ 9604 9605 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { 9606 9607 /* Here, 'a' is empty (and b is not). That means the union will come 9608 * entirely from 'b'. If '*output' is NULL, we can directly return a 9609 * clone of 'b'. Otherwise, we replace the contents of '*output' with 9610 * the clone */ 9611 9612 SV ** dest = (*output == NULL) ? output : &u; 9613 *dest = invlist_clone(b, NULL); 9614 if (complement_b) { 9615 _invlist_invert(*dest); 9616 } 9617 9618 if (dest == &u) { 9619 invlist_replace_list_destroys_src(*output, u); 9620 SvREFCNT_dec_NN(u); 9621 } 9622 9623 return; 9624 } 9625 9626 /* Here both lists exist and are non-empty */ 9627 array_a = invlist_array(a); 9628 array_b = invlist_array(b); 9629 9630 /* If are to take the union of 'a' with the complement of b, set it 9631 * up so are looking at b's complement. */ 9632 if (complement_b) { 9633 9634 /* To complement, we invert: if the first element is 0, remove it. To 9635 * do this, we just pretend the array starts one later */ 9636 if (array_b[0] == 0) { 9637 array_b++; 9638 len_b--; 9639 } 9640 else { 9641 9642 /* But if the first element is not zero, we pretend the list starts 9643 * at the 0 that is always stored immediately before the array. */ 9644 array_b--; 9645 len_b++; 9646 } 9647 } 9648 9649 /* Size the union for the worst case: that the sets are completely 9650 * disjoint */ 9651 u = _new_invlist(len_a + len_b); 9652 9653 /* Will contain U+0000 if either component does */ 9654 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0) 9655 || (len_b > 0 && array_b[0] == 0)); 9656 9657 /* Go through each input list item by item, stopping when have exhausted 9658 * one of them */ 9659 while (i_a < len_a && i_b < len_b) { 9660 UV cp; /* The element to potentially add to the union's array */ 9661 bool cp_in_set; /* is it in the the input list's set or not */ 9662 9663 /* We need to take one or the other of the two inputs for the union. 9664 * Since we are merging two sorted lists, we take the smaller of the 9665 * next items. In case of a tie, we take first the one that is in its 9666 * set. If we first took the one not in its set, it would decrement 9667 * the count, possibly to 0 which would cause it to be output as ending 9668 * the range, and the next time through we would take the same number, 9669 * and output it again as beginning the next range. By doing it the 9670 * opposite way, there is no possibility that the count will be 9671 * momentarily decremented to 0, and thus the two adjoining ranges will 9672 * be seamlessly merged. (In a tie and both are in the set or both not 9673 * in the set, it doesn't matter which we take first.) */ 9674 if ( array_a[i_a] < array_b[i_b] 9675 || ( array_a[i_a] == array_b[i_b] 9676 && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) 9677 { 9678 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); 9679 cp = array_a[i_a++]; 9680 } 9681 else { 9682 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); 9683 cp = array_b[i_b++]; 9684 } 9685 9686 /* Here, have chosen which of the two inputs to look at. Only output 9687 * if the running count changes to/from 0, which marks the 9688 * beginning/end of a range that's in the set */ 9689 if (cp_in_set) { 9690 if (count == 0) { 9691 array_u[i_u++] = cp; 9692 } 9693 count++; 9694 } 9695 else { 9696 count--; 9697 if (count == 0) { 9698 array_u[i_u++] = cp; 9699 } 9700 } 9701 } 9702 9703 9704 /* The loop above increments the index into exactly one of the input lists 9705 * each iteration, and ends when either index gets to its list end. That 9706 * means the other index is lower than its end, and so something is 9707 * remaining in that one. We decrement 'count', as explained below, if 9708 * that list is in its set. (i_a and i_b each currently index the element 9709 * beyond the one we care about.) */ 9710 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) 9711 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) 9712 { 9713 count--; 9714 } 9715 9716 /* Above we decremented 'count' if the list that had unexamined elements in 9717 * it was in its set. This has made it so that 'count' being non-zero 9718 * means there isn't anything left to output; and 'count' equal to 0 means 9719 * that what is left to output is precisely that which is left in the 9720 * non-exhausted input list. 9721 * 9722 * To see why, note first that the exhausted input obviously has nothing 9723 * left to add to the union. If it was in its set at its end, that means 9724 * the set extends from here to the platform's infinity, and hence so does 9725 * the union and the non-exhausted set is irrelevant. The exhausted set 9726 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to 9727 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so 9728 * 'count' remains at 1. This is consistent with the decremented 'count' 9729 * != 0 meaning there's nothing left to add to the union. 9730 * 9731 * But if the exhausted input wasn't in its set, it contributed 0 to 9732 * 'count', and the rest of the union will be whatever the other input is. 9733 * If 'count' was 0, neither list was in its set, and 'count' remains 0; 9734 * otherwise it gets decremented to 0. This is consistent with 'count' 9735 * == 0 meaning the remainder of the union is whatever is left in the 9736 * non-exhausted list. */ 9737 if (count != 0) { 9738 len_u = i_u; 9739 } 9740 else { 9741 IV copy_count = len_a - i_a; 9742 if (copy_count > 0) { /* The non-exhausted input is 'a' */ 9743 Copy(array_a + i_a, array_u + i_u, copy_count, UV); 9744 } 9745 else { /* The non-exhausted input is b */ 9746 copy_count = len_b - i_b; 9747 Copy(array_b + i_b, array_u + i_u, copy_count, UV); 9748 } 9749 len_u = i_u + copy_count; 9750 } 9751 9752 /* Set the result to the final length, which can change the pointer to 9753 * array_u, so re-find it. (Note that it is unlikely that this will 9754 * change, as we are shrinking the space, not enlarging it) */ 9755 if (len_u != _invlist_len(u)) { 9756 invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); 9757 invlist_trim(u); 9758 array_u = invlist_array(u); 9759 } 9760 9761 if (*output == NULL) { /* Simply return the new inversion list */ 9762 *output = u; 9763 } 9764 else { 9765 /* Otherwise, overwrite the inversion list that was in '*output'. We 9766 * could instead free '*output', and then set it to 'u', but experience 9767 * has shown [perl #127392] that if the input is a mortal, we can get a 9768 * huge build-up of these during regex compilation before they get 9769 * freed. */ 9770 invlist_replace_list_destroys_src(*output, u); 9771 SvREFCNT_dec_NN(u); 9772 } 9773 9774 return; 9775 } 9776 9777 void 9778 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, 9779 const bool complement_b, SV** i) 9780 { 9781 /* Take the intersection of two inversion lists and point '*i' to it. On 9782 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly 9783 * even 'a' or 'b'). If to an inversion list, the contents of the original 9784 * list will be replaced by the intersection. The first list, 'a', may be 9785 * NULL, in which case '*i' will be an empty list. If 'complement_b' is 9786 * TRUE, the result will be the intersection of 'a' and the complement (or 9787 * inversion) of 'b' instead of 'b' directly. 9788 * 9789 * The basis for this comes from "Unicode Demystified" Chapter 13 by 9790 * Richard Gillam, published by Addison-Wesley, and explained at some 9791 * length there. The preface says to incorporate its examples into your 9792 * code at your own risk. In fact, it had bugs 9793 * 9794 * The algorithm is like a merge sort, and is essentially the same as the 9795 * union above 9796 */ 9797 9798 const UV* array_a; /* a's array */ 9799 const UV* array_b; 9800 UV len_a; /* length of a's array */ 9801 UV len_b; 9802 9803 SV* r; /* the resulting intersection */ 9804 UV* array_r; 9805 UV len_r = 0; 9806 9807 UV i_a = 0; /* current index into a's array */ 9808 UV i_b = 0; 9809 UV i_r = 0; 9810 9811 /* running count of how many of the two inputs are postitioned at ranges 9812 * that are in their sets. As explained in the algorithm source book, 9813 * items are stopped accumulating and are output when the count changes 9814 * to/from 2. The count is incremented when we start a range that's in an 9815 * input's set, and decremented when we start a range that's not in a set. 9816 * Only when it is 2 are we in the intersection. */ 9817 UV count = 0; 9818 9819 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; 9820 assert(a != b); 9821 assert(*i == NULL || is_invlist(*i)); 9822 9823 /* Special case if either one is empty */ 9824 len_a = (a == NULL) ? 0 : _invlist_len(a); 9825 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { 9826 if (len_a != 0 && complement_b) { 9827 9828 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b' 9829 * must be empty. Here, also we are using 'b's complement, which 9830 * hence must be every possible code point. Thus the intersection 9831 * is simply 'a'. */ 9832 9833 if (*i == a) { /* No-op */ 9834 return; 9835 } 9836 9837 if (*i == NULL) { 9838 *i = invlist_clone(a, NULL); 9839 return; 9840 } 9841 9842 r = invlist_clone(a, NULL); 9843 invlist_replace_list_destroys_src(*i, r); 9844 SvREFCNT_dec_NN(r); 9845 return; 9846 } 9847 9848 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The 9849 * intersection must be empty */ 9850 if (*i == NULL) { 9851 *i = _new_invlist(0); 9852 return; 9853 } 9854 9855 invlist_clear(*i); 9856 return; 9857 } 9858 9859 /* Here both lists exist and are non-empty */ 9860 array_a = invlist_array(a); 9861 array_b = invlist_array(b); 9862 9863 /* If are to take the intersection of 'a' with the complement of b, set it 9864 * up so are looking at b's complement. */ 9865 if (complement_b) { 9866 9867 /* To complement, we invert: if the first element is 0, remove it. To 9868 * do this, we just pretend the array starts one later */ 9869 if (array_b[0] == 0) { 9870 array_b++; 9871 len_b--; 9872 } 9873 else { 9874 9875 /* But if the first element is not zero, we pretend the list starts 9876 * at the 0 that is always stored immediately before the array. */ 9877 array_b--; 9878 len_b++; 9879 } 9880 } 9881 9882 /* Size the intersection for the worst case: that the intersection ends up 9883 * fragmenting everything to be completely disjoint */ 9884 r= _new_invlist(len_a + len_b); 9885 9886 /* Will contain U+0000 iff both components do */ 9887 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 9888 && len_b > 0 && array_b[0] == 0); 9889 9890 /* Go through each list item by item, stopping when have exhausted one of 9891 * them */ 9892 while (i_a < len_a && i_b < len_b) { 9893 UV cp; /* The element to potentially add to the intersection's 9894 array */ 9895 bool cp_in_set; /* Is it in the input list's set or not */ 9896 9897 /* We need to take one or the other of the two inputs for the 9898 * intersection. Since we are merging two sorted lists, we take the 9899 * smaller of the next items. In case of a tie, we take first the one 9900 * that is not in its set (a difference from the union algorithm). If 9901 * we first took the one in its set, it would increment the count, 9902 * possibly to 2 which would cause it to be output as starting a range 9903 * in the intersection, and the next time through we would take that 9904 * same number, and output it again as ending the set. By doing the 9905 * opposite of this, there is no possibility that the count will be 9906 * momentarily incremented to 2. (In a tie and both are in the set or 9907 * both not in the set, it doesn't matter which we take first.) */ 9908 if ( array_a[i_a] < array_b[i_b] 9909 || ( array_a[i_a] == array_b[i_b] 9910 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) 9911 { 9912 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); 9913 cp = array_a[i_a++]; 9914 } 9915 else { 9916 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); 9917 cp= array_b[i_b++]; 9918 } 9919 9920 /* Here, have chosen which of the two inputs to look at. Only output 9921 * if the running count changes to/from 2, which marks the 9922 * beginning/end of a range that's in the intersection */ 9923 if (cp_in_set) { 9924 count++; 9925 if (count == 2) { 9926 array_r[i_r++] = cp; 9927 } 9928 } 9929 else { 9930 if (count == 2) { 9931 array_r[i_r++] = cp; 9932 } 9933 count--; 9934 } 9935 9936 } 9937 9938 /* The loop above increments the index into exactly one of the input lists 9939 * each iteration, and ends when either index gets to its list end. That 9940 * means the other index is lower than its end, and so something is 9941 * remaining in that one. We increment 'count', as explained below, if the 9942 * exhausted list was in its set. (i_a and i_b each currently index the 9943 * element beyond the one we care about.) */ 9944 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) 9945 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) 9946 { 9947 count++; 9948 } 9949 9950 /* Above we incremented 'count' if the exhausted list was in its set. This 9951 * has made it so that 'count' being below 2 means there is nothing left to 9952 * output; otheriwse what's left to add to the intersection is precisely 9953 * that which is left in the non-exhausted input list. 9954 * 9955 * To see why, note first that the exhausted input obviously has nothing 9956 * left to affect the intersection. If it was in its set at its end, that 9957 * means the set extends from here to the platform's infinity, and hence 9958 * anything in the non-exhausted's list will be in the intersection, and 9959 * anything not in it won't be. Hence, the rest of the intersection is 9960 * precisely what's in the non-exhausted list The exhausted set also 9961 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing 9962 * it means 'count' is now at least 2. This is consistent with the 9963 * incremented 'count' being >= 2 means to add the non-exhausted list to 9964 * the intersection. 9965 * 9966 * But if the exhausted input wasn't in its set, it contributed 0 to 9967 * 'count', and the intersection can't include anything further; the 9968 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get 9969 * incremented. This is consistent with 'count' being < 2 meaning nothing 9970 * further to add to the intersection. */ 9971 if (count < 2) { /* Nothing left to put in the intersection. */ 9972 len_r = i_r; 9973 } 9974 else { /* copy the non-exhausted list, unchanged. */ 9975 IV copy_count = len_a - i_a; 9976 if (copy_count > 0) { /* a is the one with stuff left */ 9977 Copy(array_a + i_a, array_r + i_r, copy_count, UV); 9978 } 9979 else { /* b is the one with stuff left */ 9980 copy_count = len_b - i_b; 9981 Copy(array_b + i_b, array_r + i_r, copy_count, UV); 9982 } 9983 len_r = i_r + copy_count; 9984 } 9985 9986 /* Set the result to the final length, which can change the pointer to 9987 * array_r, so re-find it. (Note that it is unlikely that this will 9988 * change, as we are shrinking the space, not enlarging it) */ 9989 if (len_r != _invlist_len(r)) { 9990 invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); 9991 invlist_trim(r); 9992 array_r = invlist_array(r); 9993 } 9994 9995 if (*i == NULL) { /* Simply return the calculated intersection */ 9996 *i = r; 9997 } 9998 else { /* Otherwise, replace the existing inversion list in '*i'. We could 9999 instead free '*i', and then set it to 'r', but experience has 10000 shown [perl #127392] that if the input is a mortal, we can get a 10001 huge build-up of these during regex compilation before they get 10002 freed. */ 10003 if (len_r) { 10004 invlist_replace_list_destroys_src(*i, r); 10005 } 10006 else { 10007 invlist_clear(*i); 10008 } 10009 SvREFCNT_dec_NN(r); 10010 } 10011 10012 return; 10013 } 10014 10015 SV* 10016 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end) 10017 { 10018 /* Add the range from 'start' to 'end' inclusive to the inversion list's 10019 * set. A pointer to the inversion list is returned. This may actually be 10020 * a new list, in which case the passed in one has been destroyed. The 10021 * passed-in inversion list can be NULL, in which case a new one is created 10022 * with just the one range in it. The new list is not necessarily 10023 * NUL-terminated. Space is not freed if the inversion list shrinks as a 10024 * result of this function. The gain would not be large, and in many 10025 * cases, this is called multiple times on a single inversion list, so 10026 * anything freed may almost immediately be needed again. 10027 * 10028 * This used to mostly call the 'union' routine, but that is much more 10029 * heavyweight than really needed for a single range addition */ 10030 10031 UV* array; /* The array implementing the inversion list */ 10032 UV len; /* How many elements in 'array' */ 10033 SSize_t i_s; /* index into the invlist array where 'start' 10034 should go */ 10035 SSize_t i_e = 0; /* And the index where 'end' should go */ 10036 UV cur_highest; /* The highest code point in the inversion list 10037 upon entry to this function */ 10038 10039 /* This range becomes the whole inversion list if none already existed */ 10040 if (invlist == NULL) { 10041 invlist = _new_invlist(2); 10042 _append_range_to_invlist(invlist, start, end); 10043 return invlist; 10044 } 10045 10046 /* Likewise, if the inversion list is currently empty */ 10047 len = _invlist_len(invlist); 10048 if (len == 0) { 10049 _append_range_to_invlist(invlist, start, end); 10050 return invlist; 10051 } 10052 10053 /* Starting here, we have to know the internals of the list */ 10054 array = invlist_array(invlist); 10055 10056 /* If the new range ends higher than the current highest ... */ 10057 cur_highest = invlist_highest(invlist); 10058 if (end > cur_highest) { 10059 10060 /* If the whole range is higher, we can just append it */ 10061 if (start > cur_highest) { 10062 _append_range_to_invlist(invlist, start, end); 10063 return invlist; 10064 } 10065 10066 /* Otherwise, add the portion that is higher ... */ 10067 _append_range_to_invlist(invlist, cur_highest + 1, end); 10068 10069 /* ... and continue on below to handle the rest. As a result of the 10070 * above append, we know that the index of the end of the range is the 10071 * final even numbered one of the array. Recall that the final element 10072 * always starts a range that extends to infinity. If that range is in 10073 * the set (meaning the set goes from here to infinity), it will be an 10074 * even index, but if it isn't in the set, it's odd, and the final 10075 * range in the set is one less, which is even. */ 10076 if (end == UV_MAX) { 10077 i_e = len; 10078 } 10079 else { 10080 i_e = len - 2; 10081 } 10082 } 10083 10084 /* We have dealt with appending, now see about prepending. If the new 10085 * range starts lower than the current lowest ... */ 10086 if (start < array[0]) { 10087 10088 /* Adding something which has 0 in it is somewhat tricky, and uncommon. 10089 * Let the union code handle it, rather than having to know the 10090 * trickiness in two code places. */ 10091 if (UNLIKELY(start == 0)) { 10092 SV* range_invlist; 10093 10094 range_invlist = _new_invlist(2); 10095 _append_range_to_invlist(range_invlist, start, end); 10096 10097 _invlist_union(invlist, range_invlist, &invlist); 10098 10099 SvREFCNT_dec_NN(range_invlist); 10100 10101 return invlist; 10102 } 10103 10104 /* If the whole new range comes before the first entry, and doesn't 10105 * extend it, we have to insert it as an additional range */ 10106 if (end < array[0] - 1) { 10107 i_s = i_e = -1; 10108 goto splice_in_new_range; 10109 } 10110 10111 /* Here the new range adjoins the existing first range, extending it 10112 * downwards. */ 10113 array[0] = start; 10114 10115 /* And continue on below to handle the rest. We know that the index of 10116 * the beginning of the range is the first one of the array */ 10117 i_s = 0; 10118 } 10119 else { /* Not prepending any part of the new range to the existing list. 10120 * Find where in the list it should go. This finds i_s, such that: 10121 * invlist[i_s] <= start < array[i_s+1] 10122 */ 10123 i_s = _invlist_search(invlist, start); 10124 } 10125 10126 /* At this point, any extending before the beginning of the inversion list 10127 * and/or after the end has been done. This has made it so that, in the 10128 * code below, each endpoint of the new range is either in a range that is 10129 * in the set, or is in a gap between two ranges that are. This means we 10130 * don't have to worry about exceeding the array bounds. 10131 * 10132 * Find where in the list the new range ends (but we can skip this if we 10133 * have already determined what it is, or if it will be the same as i_s, 10134 * which we already have computed) */ 10135 if (i_e == 0) { 10136 i_e = (start == end) 10137 ? i_s 10138 : _invlist_search(invlist, end); 10139 } 10140 10141 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e] 10142 * is a range that goes to infinity there is no element at invlist[i_e+1], 10143 * so only the first relation holds. */ 10144 10145 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) { 10146 10147 /* Here, the ranges on either side of the beginning of the new range 10148 * are in the set, and this range starts in the gap between them. 10149 * 10150 * The new range extends the range above it downwards if the new range 10151 * ends at or above that range's start */ 10152 const bool extends_the_range_above = ( end == UV_MAX 10153 || end + 1 >= array[i_s+1]); 10154 10155 /* The new range extends the range below it upwards if it begins just 10156 * after where that range ends */ 10157 if (start == array[i_s]) { 10158 10159 /* If the new range fills the entire gap between the other ranges, 10160 * they will get merged together. Other ranges may also get 10161 * merged, depending on how many of them the new range spans. In 10162 * the general case, we do the merge later, just once, after we 10163 * figure out how many to merge. But in the case where the new 10164 * range exactly spans just this one gap (possibly extending into 10165 * the one above), we do the merge here, and an early exit. This 10166 * is done here to avoid having to special case later. */ 10167 if (i_e - i_s <= 1) { 10168 10169 /* If i_e - i_s == 1, it means that the new range terminates 10170 * within the range above, and hence 'extends_the_range_above' 10171 * must be true. (If the range above it extends to infinity, 10172 * 'i_s+2' will be above the array's limit, but 'len-i_s-2' 10173 * will be 0, so no harm done.) */ 10174 if (extends_the_range_above) { 10175 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV); 10176 invlist_set_len(invlist, 10177 len - 2, 10178 *(get_invlist_offset_addr(invlist))); 10179 return invlist; 10180 } 10181 10182 /* Here, i_e must == i_s. We keep them in sync, as they apply 10183 * to the same range, and below we are about to decrement i_s 10184 * */ 10185 i_e--; 10186 } 10187 10188 /* Here, the new range is adjacent to the one below. (It may also 10189 * span beyond the range above, but that will get resolved later.) 10190 * Extend the range below to include this one. */ 10191 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1; 10192 i_s--; 10193 start = array[i_s]; 10194 } 10195 else if (extends_the_range_above) { 10196 10197 /* Here the new range only extends the range above it, but not the 10198 * one below. It merges with the one above. Again, we keep i_e 10199 * and i_s in sync if they point to the same range */ 10200 if (i_e == i_s) { 10201 i_e++; 10202 } 10203 i_s++; 10204 array[i_s] = start; 10205 } 10206 } 10207 10208 /* Here, we've dealt with the new range start extending any adjoining 10209 * existing ranges. 10210 * 10211 * If the new range extends to infinity, it is now the final one, 10212 * regardless of what was there before */ 10213 if (UNLIKELY(end == UV_MAX)) { 10214 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist))); 10215 return invlist; 10216 } 10217 10218 /* If i_e started as == i_s, it has also been dealt with, 10219 * and been updated to the new i_s, which will fail the following if */ 10220 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) { 10221 10222 /* Here, the ranges on either side of the end of the new range are in 10223 * the set, and this range ends in the gap between them. 10224 * 10225 * If this range is adjacent to (hence extends) the range above it, it 10226 * becomes part of that range; likewise if it extends the range below, 10227 * it becomes part of that range */ 10228 if (end + 1 == array[i_e+1]) { 10229 i_e++; 10230 array[i_e] = start; 10231 } 10232 else if (start <= array[i_e]) { 10233 array[i_e] = end + 1; 10234 i_e--; 10235 } 10236 } 10237 10238 if (i_s == i_e) { 10239 10240 /* If the range fits entirely in an existing range (as possibly already 10241 * extended above), it doesn't add anything new */ 10242 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) { 10243 return invlist; 10244 } 10245 10246 /* Here, no part of the range is in the list. Must add it. It will 10247 * occupy 2 more slots */ 10248 splice_in_new_range: 10249 10250 invlist_extend(invlist, len + 2); 10251 array = invlist_array(invlist); 10252 /* Move the rest of the array down two slots. Don't include any 10253 * trailing NUL */ 10254 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV); 10255 10256 /* Do the actual splice */ 10257 array[i_e+1] = start; 10258 array[i_e+2] = end + 1; 10259 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist))); 10260 return invlist; 10261 } 10262 10263 /* Here the new range crossed the boundaries of a pre-existing range. The 10264 * code above has adjusted things so that both ends are in ranges that are 10265 * in the set. This means everything in between must also be in the set. 10266 * Just squash things together */ 10267 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV); 10268 invlist_set_len(invlist, 10269 len - i_e + i_s, 10270 *(get_invlist_offset_addr(invlist))); 10271 10272 return invlist; 10273 } 10274 10275 SV* 10276 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, 10277 UV** other_elements_ptr) 10278 { 10279 /* Create and return an inversion list whose contents are to be populated 10280 * by the caller. The caller gives the number of elements (in 'size') and 10281 * the very first element ('element0'). This function will set 10282 * '*other_elements_ptr' to an array of UVs, where the remaining elements 10283 * are to be placed. 10284 * 10285 * Obviously there is some trust involved that the caller will properly 10286 * fill in the other elements of the array. 10287 * 10288 * (The first element needs to be passed in, as the underlying code does 10289 * things differently depending on whether it is zero or non-zero) */ 10290 10291 SV* invlist = _new_invlist(size); 10292 bool offset; 10293 10294 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; 10295 10296 invlist = add_cp_to_invlist(invlist, element0); 10297 offset = *get_invlist_offset_addr(invlist); 10298 10299 invlist_set_len(invlist, size, offset); 10300 *other_elements_ptr = invlist_array(invlist) + 1; 10301 return invlist; 10302 } 10303 10304 #endif 10305 10306 PERL_STATIC_INLINE SV* 10307 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { 10308 return _add_range_to_invlist(invlist, cp, cp); 10309 } 10310 10311 #ifndef PERL_IN_XSUB_RE 10312 void 10313 Perl__invlist_invert(pTHX_ SV* const invlist) 10314 { 10315 /* Complement the input inversion list. This adds a 0 if the list didn't 10316 * have a zero; removes it otherwise. As described above, the data 10317 * structure is set up so that this is very efficient */ 10318 10319 PERL_ARGS_ASSERT__INVLIST_INVERT; 10320 10321 assert(! invlist_is_iterating(invlist)); 10322 10323 /* The inverse of matching nothing is matching everything */ 10324 if (_invlist_len(invlist) == 0) { 10325 _append_range_to_invlist(invlist, 0, UV_MAX); 10326 return; 10327 } 10328 10329 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); 10330 } 10331 10332 SV* 10333 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist) 10334 { 10335 /* Return a new inversion list that is a copy of the input one, which is 10336 * unchanged. The new list will not be mortal even if the old one was. */ 10337 10338 const STRLEN nominal_length = _invlist_len(invlist); 10339 const STRLEN physical_length = SvCUR(invlist); 10340 const bool offset = *(get_invlist_offset_addr(invlist)); 10341 10342 PERL_ARGS_ASSERT_INVLIST_CLONE; 10343 10344 if (new_invlist == NULL) { 10345 new_invlist = _new_invlist(nominal_length); 10346 } 10347 else { 10348 sv_upgrade(new_invlist, SVt_INVLIST); 10349 initialize_invlist_guts(new_invlist, nominal_length); 10350 } 10351 10352 *(get_invlist_offset_addr(new_invlist)) = offset; 10353 invlist_set_len(new_invlist, nominal_length, offset); 10354 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); 10355 10356 return new_invlist; 10357 } 10358 10359 #endif 10360 10361 PERL_STATIC_INLINE STRLEN* 10362 S_get_invlist_iter_addr(SV* invlist) 10363 { 10364 /* Return the address of the UV that contains the current iteration 10365 * position */ 10366 10367 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; 10368 10369 assert(is_invlist(invlist)); 10370 10371 return &(((XINVLIST*) SvANY(invlist))->iterator); 10372 } 10373 10374 PERL_STATIC_INLINE void 10375 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */ 10376 { 10377 PERL_ARGS_ASSERT_INVLIST_ITERINIT; 10378 10379 *get_invlist_iter_addr(invlist) = 0; 10380 } 10381 10382 PERL_STATIC_INLINE void 10383 S_invlist_iterfinish(SV* invlist) 10384 { 10385 /* Terminate iterator for invlist. This is to catch development errors. 10386 * Any iteration that is interrupted before completed should call this 10387 * function. Functions that add code points anywhere else but to the end 10388 * of an inversion list assert that they are not in the middle of an 10389 * iteration. If they were, the addition would make the iteration 10390 * problematical: if the iteration hadn't reached the place where things 10391 * were being added, it would be ok */ 10392 10393 PERL_ARGS_ASSERT_INVLIST_ITERFINISH; 10394 10395 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; 10396 } 10397 10398 STATIC bool 10399 S_invlist_iternext(SV* invlist, UV* start, UV* end) 10400 { 10401 /* An C<invlist_iterinit> call on <invlist> must be used to set this up. 10402 * This call sets in <*start> and <*end>, the next range in <invlist>. 10403 * Returns <TRUE> if successful and the next call will return the next 10404 * range; <FALSE> if was already at the end of the list. If the latter, 10405 * <*start> and <*end> are unchanged, and the next call to this function 10406 * will start over at the beginning of the list */ 10407 10408 STRLEN* pos = get_invlist_iter_addr(invlist); 10409 UV len = _invlist_len(invlist); 10410 UV *array; 10411 10412 PERL_ARGS_ASSERT_INVLIST_ITERNEXT; 10413 10414 if (*pos >= len) { 10415 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ 10416 return FALSE; 10417 } 10418 10419 array = invlist_array(invlist); 10420 10421 *start = array[(*pos)++]; 10422 10423 if (*pos >= len) { 10424 *end = UV_MAX; 10425 } 10426 else { 10427 *end = array[(*pos)++] - 1; 10428 } 10429 10430 return TRUE; 10431 } 10432 10433 PERL_STATIC_INLINE UV 10434 S_invlist_highest(SV* const invlist) 10435 { 10436 /* Returns the highest code point that matches an inversion list. This API 10437 * has an ambiguity, as it returns 0 under either the highest is actually 10438 * 0, or if the list is empty. If this distinction matters to you, check 10439 * for emptiness before calling this function */ 10440 10441 UV len = _invlist_len(invlist); 10442 UV *array; 10443 10444 PERL_ARGS_ASSERT_INVLIST_HIGHEST; 10445 10446 if (len == 0) { 10447 return 0; 10448 } 10449 10450 array = invlist_array(invlist); 10451 10452 /* The last element in the array in the inversion list always starts a 10453 * range that goes to infinity. That range may be for code points that are 10454 * matched in the inversion list, or it may be for ones that aren't 10455 * matched. In the latter case, the highest code point in the set is one 10456 * less than the beginning of this range; otherwise it is the final element 10457 * of this range: infinity */ 10458 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) 10459 ? UV_MAX 10460 : array[len - 1] - 1; 10461 } 10462 10463 STATIC SV * 10464 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style) 10465 { 10466 /* Get the contents of an inversion list into a string SV so that they can 10467 * be printed out. If 'traditional_style' is TRUE, it uses the format 10468 * traditionally done for debug tracing; otherwise it uses a format 10469 * suitable for just copying to the output, with blanks between ranges and 10470 * a dash between range components */ 10471 10472 UV start, end; 10473 SV* output; 10474 const char intra_range_delimiter = (traditional_style ? '\t' : '-'); 10475 const char inter_range_delimiter = (traditional_style ? '\n' : ' '); 10476 10477 if (traditional_style) { 10478 output = newSVpvs("\n"); 10479 } 10480 else { 10481 output = newSVpvs(""); 10482 } 10483 10484 PERL_ARGS_ASSERT_INVLIST_CONTENTS; 10485 10486 assert(! invlist_is_iterating(invlist)); 10487 10488 invlist_iterinit(invlist); 10489 while (invlist_iternext(invlist, &start, &end)) { 10490 if (end == UV_MAX) { 10491 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c", 10492 start, intra_range_delimiter, 10493 inter_range_delimiter); 10494 } 10495 else if (end != start) { 10496 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c", 10497 start, 10498 intra_range_delimiter, 10499 end, inter_range_delimiter); 10500 } 10501 else { 10502 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c", 10503 start, inter_range_delimiter); 10504 } 10505 } 10506 10507 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */ 10508 SvCUR_set(output, SvCUR(output) - 1); 10509 } 10510 10511 return output; 10512 } 10513 10514 #ifndef PERL_IN_XSUB_RE 10515 void 10516 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, 10517 const char * const indent, SV* const invlist) 10518 { 10519 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the 10520 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by 10521 * the string 'indent'. The output looks like this: 10522 [0] 0x000A .. 0x000D 10523 [2] 0x0085 10524 [4] 0x2028 .. 0x2029 10525 [6] 0x3104 .. INFTY 10526 * This means that the first range of code points matched by the list are 10527 * 0xA through 0xD; the second range contains only the single code point 10528 * 0x85, etc. An inversion list is an array of UVs. Two array elements 10529 * are used to define each range (except if the final range extends to 10530 * infinity, only a single element is needed). The array index of the 10531 * first element for the corresponding range is given in brackets. */ 10532 10533 UV start, end; 10534 STRLEN count = 0; 10535 10536 PERL_ARGS_ASSERT__INVLIST_DUMP; 10537 10538 if (invlist_is_iterating(invlist)) { 10539 Perl_dump_indent(aTHX_ level, file, 10540 "%sCan't dump inversion list because is in middle of iterating\n", 10541 indent); 10542 return; 10543 } 10544 10545 invlist_iterinit(invlist); 10546 while (invlist_iternext(invlist, &start, &end)) { 10547 if (end == UV_MAX) { 10548 Perl_dump_indent(aTHX_ level, file, 10549 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n", 10550 indent, (UV)count, start); 10551 } 10552 else if (end != start) { 10553 Perl_dump_indent(aTHX_ level, file, 10554 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n", 10555 indent, (UV)count, start, end); 10556 } 10557 else { 10558 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n", 10559 indent, (UV)count, start); 10560 } 10561 count += 2; 10562 } 10563 } 10564 10565 #endif 10566 10567 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE) 10568 bool 10569 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) 10570 { 10571 /* Return a boolean as to if the two passed in inversion lists are 10572 * identical. The final argument, if TRUE, says to take the complement of 10573 * the second inversion list before doing the comparison */ 10574 10575 const UV len_a = _invlist_len(a); 10576 UV len_b = _invlist_len(b); 10577 10578 const UV* array_a = NULL; 10579 const UV* array_b = NULL; 10580 10581 PERL_ARGS_ASSERT__INVLISTEQ; 10582 10583 /* This code avoids accessing the arrays unless it knows the length is 10584 * non-zero */ 10585 10586 if (len_a == 0) { 10587 if (len_b == 0) { 10588 return ! complement_b; 10589 } 10590 } 10591 else { 10592 array_a = invlist_array(a); 10593 } 10594 10595 if (len_b != 0) { 10596 array_b = invlist_array(b); 10597 } 10598 10599 /* If are to compare 'a' with the complement of b, set it 10600 * up so are looking at b's complement. */ 10601 if (complement_b) { 10602 10603 /* The complement of nothing is everything, so <a> would have to have 10604 * just one element, starting at zero (ending at infinity) */ 10605 if (len_b == 0) { 10606 return (len_a == 1 && array_a[0] == 0); 10607 } 10608 if (array_b[0] == 0) { 10609 10610 /* Otherwise, to complement, we invert. Here, the first element is 10611 * 0, just remove it. To do this, we just pretend the array starts 10612 * one later */ 10613 10614 array_b++; 10615 len_b--; 10616 } 10617 else { 10618 10619 /* But if the first element is not zero, we pretend the list starts 10620 * at the 0 that is always stored immediately before the array. */ 10621 array_b--; 10622 len_b++; 10623 } 10624 } 10625 10626 return len_a == len_b 10627 && memEQ(array_a, array_b, len_a * sizeof(array_a[0])); 10628 10629 } 10630 #endif 10631 10632 /* 10633 * As best we can, determine the characters that can match the start of 10634 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there 10635 * can be false positive matches 10636 * 10637 * Returns the invlist as a new SV*; it is the caller's responsibility to 10638 * call SvREFCNT_dec() when done with it. 10639 */ 10640 STATIC SV* 10641 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) 10642 { 10643 dVAR; 10644 const U8 * s = (U8*)STRING(node); 10645 SSize_t bytelen = STR_LEN(node); 10646 UV uc; 10647 /* Start out big enough for 2 separate code points */ 10648 SV* invlist = _new_invlist(4); 10649 10650 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST; 10651 10652 if (! UTF) { 10653 uc = *s; 10654 10655 /* We punt and assume can match anything if the node begins 10656 * with a multi-character fold. Things are complicated. For 10657 * example, /ffi/i could match any of: 10658 * "\N{LATIN SMALL LIGATURE FFI}" 10659 * "\N{LATIN SMALL LIGATURE FF}I" 10660 * "F\N{LATIN SMALL LIGATURE FI}" 10661 * plus several other things; and making sure we have all the 10662 * possibilities is hard. */ 10663 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) { 10664 invlist = _add_range_to_invlist(invlist, 0, UV_MAX); 10665 } 10666 else { 10667 /* Any Latin1 range character can potentially match any 10668 * other depending on the locale, and in Turkic locales, U+130 and 10669 * U+131 */ 10670 if (OP(node) == EXACTFL) { 10671 _invlist_union(invlist, PL_Latin1, &invlist); 10672 invlist = add_cp_to_invlist(invlist, 10673 LATIN_SMALL_LETTER_DOTLESS_I); 10674 invlist = add_cp_to_invlist(invlist, 10675 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 10676 } 10677 else { 10678 /* But otherwise, it matches at least itself. We can 10679 * quickly tell if it has a distinct fold, and if so, 10680 * it matches that as well */ 10681 invlist = add_cp_to_invlist(invlist, uc); 10682 if (IS_IN_SOME_FOLD_L1(uc)) 10683 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]); 10684 } 10685 10686 /* Some characters match above-Latin1 ones under /i. This 10687 * is true of EXACTFL ones when the locale is UTF-8 */ 10688 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) 10689 && (! isASCII(uc) || (OP(node) != EXACTFAA 10690 && OP(node) != EXACTFAA_NO_TRIE))) 10691 { 10692 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist); 10693 } 10694 } 10695 } 10696 else { /* Pattern is UTF-8 */ 10697 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; 10698 const U8* e = s + bytelen; 10699 IV fc; 10700 10701 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); 10702 10703 /* The only code points that aren't folded in a UTF EXACTFish 10704 * node are are the problematic ones in EXACTFL nodes */ 10705 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) { 10706 /* We need to check for the possibility that this EXACTFL 10707 * node begins with a multi-char fold. Therefore we fold 10708 * the first few characters of it so that we can make that 10709 * check */ 10710 U8 *d = folded; 10711 int i; 10712 10713 fc = -1; 10714 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { 10715 if (isASCII(*s)) { 10716 *(d++) = (U8) toFOLD(*s); 10717 if (fc < 0) { /* Save the first fold */ 10718 fc = *(d-1); 10719 } 10720 s++; 10721 } 10722 else { 10723 STRLEN len; 10724 UV fold = toFOLD_utf8_safe(s, e, d, &len); 10725 if (fc < 0) { /* Save the first fold */ 10726 fc = fold; 10727 } 10728 d += len; 10729 s += UTF8SKIP(s); 10730 } 10731 } 10732 10733 /* And set up so the code below that looks in this folded 10734 * buffer instead of the node's string */ 10735 e = d; 10736 s = folded; 10737 } 10738 10739 /* When we reach here 's' points to the fold of the first 10740 * character(s) of the node; and 'e' points to far enough along 10741 * the folded string to be just past any possible multi-char 10742 * fold. 10743 * 10744 * Unlike the non-UTF-8 case, the macro for determining if a 10745 * string is a multi-char fold requires all the characters to 10746 * already be folded. This is because of all the complications 10747 * if not. Note that they are folded anyway, except in EXACTFL 10748 * nodes. Like the non-UTF case above, we punt if the node 10749 * begins with a multi-char fold */ 10750 10751 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { 10752 invlist = _add_range_to_invlist(invlist, 0, UV_MAX); 10753 } 10754 else { /* Single char fold */ 10755 unsigned int k; 10756 unsigned int first_fold; 10757 const unsigned int * remaining_folds; 10758 Size_t folds_count; 10759 10760 /* It matches itself */ 10761 invlist = add_cp_to_invlist(invlist, fc); 10762 10763 /* ... plus all the things that fold to it, which are found in 10764 * PL_utf8_foldclosures */ 10765 folds_count = _inverse_folds(fc, &first_fold, 10766 &remaining_folds); 10767 for (k = 0; k < folds_count; k++) { 10768 UV c = (k == 0) ? first_fold : remaining_folds[k-1]; 10769 10770 /* /aa doesn't allow folds between ASCII and non- */ 10771 if ( (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE) 10772 && isASCII(c) != isASCII(fc)) 10773 { 10774 continue; 10775 } 10776 10777 invlist = add_cp_to_invlist(invlist, c); 10778 } 10779 10780 if (OP(node) == EXACTFL) { 10781 10782 /* If either [iI] are present in an EXACTFL node the above code 10783 * should have added its normal case pair, but under a Turkish 10784 * locale they could match instead the case pairs from it. Add 10785 * those as potential matches as well */ 10786 if (isALPHA_FOLD_EQ(fc, 'I')) { 10787 invlist = add_cp_to_invlist(invlist, 10788 LATIN_SMALL_LETTER_DOTLESS_I); 10789 invlist = add_cp_to_invlist(invlist, 10790 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 10791 } 10792 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) { 10793 invlist = add_cp_to_invlist(invlist, 'I'); 10794 } 10795 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { 10796 invlist = add_cp_to_invlist(invlist, 'i'); 10797 } 10798 } 10799 } 10800 } 10801 10802 return invlist; 10803 } 10804 10805 #undef HEADER_LENGTH 10806 #undef TO_INTERNAL_SIZE 10807 #undef FROM_INTERNAL_SIZE 10808 #undef INVLIST_VERSION_ID 10809 10810 /* End of inversion list object */ 10811 10812 STATIC void 10813 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) 10814 { 10815 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' 10816 * constructs, and updates RExC_flags with them. On input, RExC_parse 10817 * should point to the first flag; it is updated on output to point to the 10818 * final ')' or ':'. There needs to be at least one flag, or this will 10819 * abort */ 10820 10821 /* for (?g), (?gc), and (?o) warnings; warning 10822 about (?c) will warn about (?g) -- japhy */ 10823 10824 #define WASTED_O 0x01 10825 #define WASTED_G 0x02 10826 #define WASTED_C 0x04 10827 #define WASTED_GC (WASTED_G|WASTED_C) 10828 I32 wastedflags = 0x00; 10829 U32 posflags = 0, negflags = 0; 10830 U32 *flagsp = &posflags; 10831 char has_charset_modifier = '\0'; 10832 regex_charset cs; 10833 bool has_use_defaults = FALSE; 10834 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */ 10835 int x_mod_count = 0; 10836 10837 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS; 10838 10839 /* '^' as an initial flag sets certain defaults */ 10840 if (UCHARAT(RExC_parse) == '^') { 10841 RExC_parse++; 10842 has_use_defaults = TRUE; 10843 STD_PMMOD_FLAGS_CLEAR(&RExC_flags); 10844 cs = (RExC_uni_semantics) 10845 ? REGEX_UNICODE_CHARSET 10846 : REGEX_DEPENDS_CHARSET; 10847 set_regex_charset(&RExC_flags, cs); 10848 } 10849 else { 10850 cs = get_regex_charset(RExC_flags); 10851 if ( cs == REGEX_DEPENDS_CHARSET 10852 && RExC_uni_semantics) 10853 { 10854 cs = REGEX_UNICODE_CHARSET; 10855 } 10856 } 10857 10858 while (RExC_parse < RExC_end) { 10859 /* && strchr("iogcmsx", *RExC_parse) */ 10860 /* (?g), (?gc) and (?o) are useless here 10861 and must be globally applied -- japhy */ 10862 switch (*RExC_parse) { 10863 10864 /* Code for the imsxn flags */ 10865 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count); 10866 10867 case LOCALE_PAT_MOD: 10868 if (has_charset_modifier) { 10869 goto excess_modifier; 10870 } 10871 else if (flagsp == &negflags) { 10872 goto neg_modifier; 10873 } 10874 cs = REGEX_LOCALE_CHARSET; 10875 has_charset_modifier = LOCALE_PAT_MOD; 10876 break; 10877 case UNICODE_PAT_MOD: 10878 if (has_charset_modifier) { 10879 goto excess_modifier; 10880 } 10881 else if (flagsp == &negflags) { 10882 goto neg_modifier; 10883 } 10884 cs = REGEX_UNICODE_CHARSET; 10885 has_charset_modifier = UNICODE_PAT_MOD; 10886 break; 10887 case ASCII_RESTRICT_PAT_MOD: 10888 if (flagsp == &negflags) { 10889 goto neg_modifier; 10890 } 10891 if (has_charset_modifier) { 10892 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) { 10893 goto excess_modifier; 10894 } 10895 /* Doubled modifier implies more restricted */ 10896 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; 10897 } 10898 else { 10899 cs = REGEX_ASCII_RESTRICTED_CHARSET; 10900 } 10901 has_charset_modifier = ASCII_RESTRICT_PAT_MOD; 10902 break; 10903 case DEPENDS_PAT_MOD: 10904 if (has_use_defaults) { 10905 goto fail_modifiers; 10906 } 10907 else if (flagsp == &negflags) { 10908 goto neg_modifier; 10909 } 10910 else if (has_charset_modifier) { 10911 goto excess_modifier; 10912 } 10913 10914 /* The dual charset means unicode semantics if the 10915 * pattern (or target, not known until runtime) are 10916 * utf8, or something in the pattern indicates unicode 10917 * semantics */ 10918 cs = (RExC_uni_semantics) 10919 ? REGEX_UNICODE_CHARSET 10920 : REGEX_DEPENDS_CHARSET; 10921 has_charset_modifier = DEPENDS_PAT_MOD; 10922 break; 10923 excess_modifier: 10924 RExC_parse++; 10925 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { 10926 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); 10927 } 10928 else if (has_charset_modifier == *(RExC_parse - 1)) { 10929 vFAIL2("Regexp modifier \"%c\" may not appear twice", 10930 *(RExC_parse - 1)); 10931 } 10932 else { 10933 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); 10934 } 10935 NOT_REACHED; /*NOTREACHED*/ 10936 neg_modifier: 10937 RExC_parse++; 10938 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", 10939 *(RExC_parse - 1)); 10940 NOT_REACHED; /*NOTREACHED*/ 10941 case ONCE_PAT_MOD: /* 'o' */ 10942 case GLOBAL_PAT_MOD: /* 'g' */ 10943 if (ckWARN(WARN_REGEXP)) { 10944 const I32 wflagbit = *RExC_parse == 'o' 10945 ? WASTED_O 10946 : WASTED_G; 10947 if (! (wastedflags & wflagbit) ) { 10948 wastedflags |= wflagbit; 10949 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ 10950 vWARN5( 10951 RExC_parse + 1, 10952 "Useless (%s%c) - %suse /%c modifier", 10953 flagsp == &negflags ? "?-" : "?", 10954 *RExC_parse, 10955 flagsp == &negflags ? "don't " : "", 10956 *RExC_parse 10957 ); 10958 } 10959 } 10960 break; 10961 10962 case CONTINUE_PAT_MOD: /* 'c' */ 10963 if (ckWARN(WARN_REGEXP)) { 10964 if (! (wastedflags & WASTED_C) ) { 10965 wastedflags |= WASTED_GC; 10966 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ 10967 vWARN3( 10968 RExC_parse + 1, 10969 "Useless (%sc) - %suse /gc modifier", 10970 flagsp == &negflags ? "?-" : "?", 10971 flagsp == &negflags ? "don't " : "" 10972 ); 10973 } 10974 } 10975 break; 10976 case KEEPCOPY_PAT_MOD: /* 'p' */ 10977 if (flagsp == &negflags) { 10978 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); 10979 } else { 10980 *flagsp |= RXf_PMf_KEEPCOPY; 10981 } 10982 break; 10983 case '-': 10984 /* A flag is a default iff it is following a minus, so 10985 * if there is a minus, it means will be trying to 10986 * re-specify a default which is an error */ 10987 if (has_use_defaults || flagsp == &negflags) { 10988 goto fail_modifiers; 10989 } 10990 flagsp = &negflags; 10991 wastedflags = 0; /* reset so (?g-c) warns twice */ 10992 x_mod_count = 0; 10993 break; 10994 case ':': 10995 case ')': 10996 10997 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) { 10998 negflags |= RXf_PMf_EXTENDED_MORE; 10999 } 11000 RExC_flags |= posflags; 11001 11002 if (negflags & RXf_PMf_EXTENDED) { 11003 negflags |= RXf_PMf_EXTENDED_MORE; 11004 } 11005 RExC_flags &= ~negflags; 11006 set_regex_charset(&RExC_flags, cs); 11007 11008 return; 11009 default: 11010 fail_modifiers: 11011 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end); 11012 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 11013 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized", 11014 UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); 11015 NOT_REACHED; /*NOTREACHED*/ 11016 } 11017 11018 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 11019 } 11020 11021 vFAIL("Sequence (?... not terminated"); 11022 } 11023 11024 /* 11025 - reg - regular expression, i.e. main body or parenthesized thing 11026 * 11027 * Caller must absorb opening parenthesis. 11028 * 11029 * Combining parenthesis handling with the base level of regular expression 11030 * is a trifle forced, but the need to tie the tails of the branches to what 11031 * follows makes it hard to avoid. 11032 */ 11033 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) 11034 #ifdef DEBUGGING 11035 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) 11036 #else 11037 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) 11038 #endif 11039 11040 PERL_STATIC_INLINE regnode_offset 11041 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, 11042 I32 *flagp, 11043 char * parse_start, 11044 char ch 11045 ) 11046 { 11047 regnode_offset ret; 11048 char* name_start = RExC_parse; 11049 U32 num = 0; 11050 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA); 11051 GET_RE_DEBUG_FLAGS_DECL; 11052 11053 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF; 11054 11055 if (RExC_parse == name_start || *RExC_parse != ch) { 11056 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ 11057 vFAIL2("Sequence %.3s... not terminated", parse_start); 11058 } 11059 11060 if (sv_dat) { 11061 num = add_data( pRExC_state, STR_WITH_LEN("S")); 11062 RExC_rxi->data->data[num]=(void*)sv_dat; 11063 SvREFCNT_inc_simple_void_NN(sv_dat); 11064 } 11065 RExC_sawback = 1; 11066 ret = reganode(pRExC_state, 11067 ((! FOLD) 11068 ? NREF 11069 : (ASCII_FOLD_RESTRICTED) 11070 ? NREFFA 11071 : (AT_LEAST_UNI_SEMANTICS) 11072 ? NREFFU 11073 : (LOC) 11074 ? NREFFL 11075 : NREFF), 11076 num); 11077 *flagp |= HASWIDTH; 11078 11079 Set_Node_Offset(REGNODE_p(ret), parse_start+1); 11080 Set_Node_Cur_Length(REGNODE_p(ret), parse_start); 11081 11082 nextchar(pRExC_state); 11083 return ret; 11084 } 11085 11086 /* On success, returns the offset at which any next node should be placed into 11087 * the regex engine program being compiled. 11088 * 11089 * Returns 0 otherwise, with *flagp set to indicate why: 11090 * TRYAGAIN at the end of (?) that only sets flags. 11091 * RESTART_PARSE if the parse needs to be restarted, or'd with 11092 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8. 11093 * Otherwise would only return 0 if regbranch() returns 0, which cannot 11094 * happen. */ 11095 STATIC regnode_offset 11096 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) 11097 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. 11098 * 2 is like 1, but indicates that nextchar() has been called to advance 11099 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and 11100 * this flag alerts us to the need to check for that */ 11101 { 11102 regnode_offset ret = 0; /* Will be the head of the group. */ 11103 regnode_offset br; 11104 regnode_offset lastbr; 11105 regnode_offset ender = 0; 11106 I32 parno = 0; 11107 I32 flags; 11108 U32 oregflags = RExC_flags; 11109 bool have_branch = 0; 11110 bool is_open = 0; 11111 I32 freeze_paren = 0; 11112 I32 after_freeze = 0; 11113 I32 num; /* numeric backreferences */ 11114 SV * max_open; /* Max number of unclosed parens */ 11115 11116 char * parse_start = RExC_parse; /* MJD */ 11117 char * const oregcomp_parse = RExC_parse; 11118 11119 GET_RE_DEBUG_FLAGS_DECL; 11120 11121 PERL_ARGS_ASSERT_REG; 11122 DEBUG_PARSE("reg "); 11123 11124 11125 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD); 11126 assert(max_open); 11127 if (!SvIOK(max_open)) { 11128 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT); 11129 } 11130 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each 11131 open paren */ 11132 vFAIL("Too many nested open parens"); 11133 } 11134 11135 *flagp = 0; /* Tentatively. */ 11136 11137 /* Having this true makes it feasible to have a lot fewer tests for the 11138 * parse pointer being in scope. For example, we can write 11139 * while(isFOO(*RExC_parse)) RExC_parse++; 11140 * instead of 11141 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++; 11142 */ 11143 assert(*RExC_end == '\0'); 11144 11145 /* Make an OPEN node, if parenthesized. */ 11146 if (paren) { 11147 11148 /* Under /x, space and comments can be gobbled up between the '(' and 11149 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such 11150 * intervening space, as the sequence is a token, and a token should be 11151 * indivisible */ 11152 bool has_intervening_patws = (paren == 2) 11153 && *(RExC_parse - 1) != '('; 11154 11155 if (RExC_parse >= RExC_end) { 11156 vFAIL("Unmatched ("); 11157 } 11158 11159 if (paren == 'r') { /* Atomic script run */ 11160 paren = '>'; 11161 goto parse_rest; 11162 } 11163 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */ 11164 char *start_verb = RExC_parse + 1; 11165 STRLEN verb_len; 11166 char *start_arg = NULL; 11167 unsigned char op = 0; 11168 int arg_required = 0; 11169 int internal_argval = -1; /* if >-1 we are not allowed an argument*/ 11170 bool has_upper = FALSE; 11171 11172 if (has_intervening_patws) { 11173 RExC_parse++; /* past the '*' */ 11174 11175 /* For strict backwards compatibility, don't change the message 11176 * now that we also have lowercase operands */ 11177 if (isUPPER(*RExC_parse)) { 11178 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); 11179 } 11180 else { 11181 vFAIL("In '(*...)', the '(' and '*' must be adjacent"); 11182 } 11183 } 11184 while (RExC_parse < RExC_end && *RExC_parse != ')' ) { 11185 if ( *RExC_parse == ':' ) { 11186 start_arg = RExC_parse + 1; 11187 break; 11188 } 11189 else if (! UTF) { 11190 if (isUPPER(*RExC_parse)) { 11191 has_upper = TRUE; 11192 } 11193 RExC_parse++; 11194 } 11195 else { 11196 RExC_parse += UTF8SKIP(RExC_parse); 11197 } 11198 } 11199 verb_len = RExC_parse - start_verb; 11200 if ( start_arg ) { 11201 if (RExC_parse >= RExC_end) { 11202 goto unterminated_verb_pattern; 11203 } 11204 11205 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 11206 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) { 11207 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 11208 } 11209 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) { 11210 unterminated_verb_pattern: 11211 if (has_upper) { 11212 vFAIL("Unterminated verb pattern argument"); 11213 } 11214 else { 11215 vFAIL("Unterminated '(*...' argument"); 11216 } 11217 } 11218 } else { 11219 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) { 11220 if (has_upper) { 11221 vFAIL("Unterminated verb pattern"); 11222 } 11223 else { 11224 vFAIL("Unterminated '(*...' construct"); 11225 } 11226 } 11227 } 11228 11229 /* Here, we know that RExC_parse < RExC_end */ 11230 11231 switch ( *start_verb ) { 11232 case 'A': /* (*ACCEPT) */ 11233 if ( memEQs(start_verb, verb_len,"ACCEPT") ) { 11234 op = ACCEPT; 11235 internal_argval = RExC_nestroot; 11236 } 11237 break; 11238 case 'C': /* (*COMMIT) */ 11239 if ( memEQs(start_verb, verb_len,"COMMIT") ) 11240 op = COMMIT; 11241 break; 11242 case 'F': /* (*FAIL) */ 11243 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) { 11244 op = OPFAIL; 11245 } 11246 break; 11247 case ':': /* (*:NAME) */ 11248 case 'M': /* (*MARK:NAME) */ 11249 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) { 11250 op = MARKPOINT; 11251 arg_required = 1; 11252 } 11253 break; 11254 case 'P': /* (*PRUNE) */ 11255 if ( memEQs(start_verb, verb_len,"PRUNE") ) 11256 op = PRUNE; 11257 break; 11258 case 'S': /* (*SKIP) */ 11259 if ( memEQs(start_verb, verb_len,"SKIP") ) 11260 op = SKIP; 11261 break; 11262 case 'T': /* (*THEN) */ 11263 /* [19:06] <TimToady> :: is then */ 11264 if ( memEQs(start_verb, verb_len,"THEN") ) { 11265 op = CUTGROUP; 11266 RExC_seen |= REG_CUTGROUP_SEEN; 11267 } 11268 break; 11269 case 'a': 11270 if ( memEQs(start_verb, verb_len, "asr") 11271 || memEQs(start_verb, verb_len, "atomic_script_run")) 11272 { 11273 paren = 'r'; /* Mnemonic: recursed run */ 11274 goto script_run; 11275 } 11276 else if (memEQs(start_verb, verb_len, "atomic")) { 11277 paren = 't'; /* AtOMIC */ 11278 goto alpha_assertions; 11279 } 11280 break; 11281 case 'p': 11282 if ( memEQs(start_verb, verb_len, "plb") 11283 || memEQs(start_verb, verb_len, "positive_lookbehind")) 11284 { 11285 paren = 'b'; 11286 goto lookbehind_alpha_assertions; 11287 } 11288 else if ( memEQs(start_verb, verb_len, "pla") 11289 || memEQs(start_verb, verb_len, "positive_lookahead")) 11290 { 11291 paren = 'a'; 11292 goto alpha_assertions; 11293 } 11294 break; 11295 case 'n': 11296 if ( memEQs(start_verb, verb_len, "nlb") 11297 || memEQs(start_verb, verb_len, "negative_lookbehind")) 11298 { 11299 paren = 'B'; 11300 goto lookbehind_alpha_assertions; 11301 } 11302 else if ( memEQs(start_verb, verb_len, "nla") 11303 || memEQs(start_verb, verb_len, "negative_lookahead")) 11304 { 11305 paren = 'A'; 11306 goto alpha_assertions; 11307 } 11308 break; 11309 case 's': 11310 if ( memEQs(start_verb, verb_len, "sr") 11311 || memEQs(start_verb, verb_len, "script_run")) 11312 { 11313 regnode_offset atomic; 11314 11315 paren = 's'; 11316 11317 script_run: 11318 11319 /* This indicates Unicode rules. */ 11320 REQUIRE_UNI_RULES(flagp, 0); 11321 11322 if (! start_arg) { 11323 goto no_colon; 11324 } 11325 11326 RExC_parse = start_arg; 11327 11328 if (RExC_in_script_run) { 11329 11330 /* Nested script runs are treated as no-ops, because 11331 * if the nested one fails, the outer one must as 11332 * well. It could fail sooner, and avoid (??{} with 11333 * side effects, but that is explicitly documented as 11334 * undefined behavior. */ 11335 11336 ret = 0; 11337 11338 if (paren == 's') { 11339 paren = ':'; 11340 goto parse_rest; 11341 } 11342 11343 /* But, the atomic part of a nested atomic script run 11344 * isn't a no-op, but can be treated just like a '(?>' 11345 * */ 11346 paren = '>'; 11347 goto parse_rest; 11348 } 11349 11350 /* By doing this here, we avoid extra warnings for nested 11351 * script runs */ 11352 ckWARNexperimental(RExC_parse, 11353 WARN_EXPERIMENTAL__SCRIPT_RUN, 11354 "The script_run feature is experimental"); 11355 11356 if (paren == 's') { 11357 /* Here, we're starting a new regular script run */ 11358 ret = reg_node(pRExC_state, SROPEN); 11359 RExC_in_script_run = 1; 11360 is_open = 1; 11361 goto parse_rest; 11362 } 11363 11364 /* Here, we are starting an atomic script run. This is 11365 * handled by recursing to deal with the atomic portion 11366 * separately, enclosed in SROPEN ... SRCLOSE nodes */ 11367 11368 ret = reg_node(pRExC_state, SROPEN); 11369 11370 RExC_in_script_run = 1; 11371 11372 atomic = reg(pRExC_state, 'r', &flags, depth); 11373 if (flags & (RESTART_PARSE|NEED_UTF8)) { 11374 *flagp = flags & (RESTART_PARSE|NEED_UTF8); 11375 return 0; 11376 } 11377 11378 if (! REGTAIL(pRExC_state, ret, atomic)) { 11379 REQUIRE_BRANCHJ(flagp, 0); 11380 } 11381 11382 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state, 11383 SRCLOSE))) 11384 { 11385 REQUIRE_BRANCHJ(flagp, 0); 11386 } 11387 11388 RExC_in_script_run = 0; 11389 return ret; 11390 } 11391 11392 break; 11393 11394 lookbehind_alpha_assertions: 11395 RExC_seen |= REG_LOOKBEHIND_SEEN; 11396 RExC_in_lookbehind++; 11397 /*FALLTHROUGH*/ 11398 11399 alpha_assertions: 11400 ckWARNexperimental(RExC_parse, 11401 WARN_EXPERIMENTAL__ALPHA_ASSERTIONS, 11402 "The alpha_assertions feature is experimental"); 11403 11404 RExC_seen_zerolen++; 11405 11406 if (! start_arg) { 11407 goto no_colon; 11408 } 11409 11410 /* An empty negative lookahead assertion simply is failure */ 11411 if (paren == 'A' && RExC_parse == start_arg) { 11412 ret=reganode(pRExC_state, OPFAIL, 0); 11413 nextchar(pRExC_state); 11414 return ret; 11415 } 11416 11417 RExC_parse = start_arg; 11418 goto parse_rest; 11419 11420 no_colon: 11421 vFAIL2utf8f( 11422 "'(*%" UTF8f "' requires a terminating ':'", 11423 UTF8fARG(UTF, verb_len, start_verb)); 11424 NOT_REACHED; /*NOTREACHED*/ 11425 11426 } /* End of switch */ 11427 if ( ! op ) { 11428 RExC_parse += UTF 11429 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) 11430 : 1; 11431 if (has_upper || verb_len == 0) { 11432 vFAIL2utf8f( 11433 "Unknown verb pattern '%" UTF8f "'", 11434 UTF8fARG(UTF, verb_len, start_verb)); 11435 } 11436 else { 11437 vFAIL2utf8f( 11438 "Unknown '(*...)' construct '%" UTF8f "'", 11439 UTF8fARG(UTF, verb_len, start_verb)); 11440 } 11441 } 11442 if ( RExC_parse == start_arg ) { 11443 start_arg = NULL; 11444 } 11445 if ( arg_required && !start_arg ) { 11446 vFAIL3("Verb pattern '%.*s' has a mandatory argument", 11447 verb_len, start_verb); 11448 } 11449 if (internal_argval == -1) { 11450 ret = reganode(pRExC_state, op, 0); 11451 } else { 11452 ret = reg2Lanode(pRExC_state, op, 0, internal_argval); 11453 } 11454 RExC_seen |= REG_VERBARG_SEEN; 11455 if (start_arg) { 11456 SV *sv = newSVpvn( start_arg, 11457 RExC_parse - start_arg); 11458 ARG(REGNODE_p(ret)) = add_data( pRExC_state, 11459 STR_WITH_LEN("S")); 11460 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv; 11461 FLAGS(REGNODE_p(ret)) = 1; 11462 } else { 11463 FLAGS(REGNODE_p(ret)) = 0; 11464 } 11465 if ( internal_argval != -1 ) 11466 ARG2L_SET(REGNODE_p(ret), internal_argval); 11467 nextchar(pRExC_state); 11468 return ret; 11469 } 11470 else if (*RExC_parse == '?') { /* (?...) */ 11471 bool is_logical = 0; 11472 const char * const seqstart = RExC_parse; 11473 const char * endptr; 11474 if (has_intervening_patws) { 11475 RExC_parse++; 11476 vFAIL("In '(?...)', the '(' and '?' must be adjacent"); 11477 } 11478 11479 RExC_parse++; /* past the '?' */ 11480 paren = *RExC_parse; /* might be a trailing NUL, if not 11481 well-formed */ 11482 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 11483 if (RExC_parse > RExC_end) { 11484 paren = '\0'; 11485 } 11486 ret = 0; /* For look-ahead/behind. */ 11487 switch (paren) { 11488 11489 case 'P': /* (?P...) variants for those used to PCRE/Python */ 11490 paren = *RExC_parse; 11491 if ( paren == '<') { /* (?P<...>) named capture */ 11492 RExC_parse++; 11493 if (RExC_parse >= RExC_end) { 11494 vFAIL("Sequence (?P<... not terminated"); 11495 } 11496 goto named_capture; 11497 } 11498 else if (paren == '>') { /* (?P>name) named recursion */ 11499 RExC_parse++; 11500 if (RExC_parse >= RExC_end) { 11501 vFAIL("Sequence (?P>... not terminated"); 11502 } 11503 goto named_recursion; 11504 } 11505 else if (paren == '=') { /* (?P=...) named backref */ 11506 RExC_parse++; 11507 return handle_named_backref(pRExC_state, flagp, 11508 parse_start, ')'); 11509 } 11510 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end); 11511 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 11512 vFAIL3("Sequence (%.*s...) not recognized", 11513 RExC_parse-seqstart, seqstart); 11514 NOT_REACHED; /*NOTREACHED*/ 11515 case '<': /* (?<...) */ 11516 if (*RExC_parse == '!') 11517 paren = ','; 11518 else if (*RExC_parse != '=') 11519 named_capture: 11520 { /* (?<...>) */ 11521 char *name_start; 11522 SV *svname; 11523 paren= '>'; 11524 /* FALLTHROUGH */ 11525 case '\'': /* (?'...') */ 11526 name_start = RExC_parse; 11527 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME); 11528 if ( RExC_parse == name_start 11529 || RExC_parse >= RExC_end 11530 || *RExC_parse != paren) 11531 { 11532 vFAIL2("Sequence (?%c... not terminated", 11533 paren=='>' ? '<' : paren); 11534 } 11535 { 11536 HE *he_str; 11537 SV *sv_dat = NULL; 11538 if (!svname) /* shouldn't happen */ 11539 Perl_croak(aTHX_ 11540 "panic: reg_scan_name returned NULL"); 11541 if (!RExC_paren_names) { 11542 RExC_paren_names= newHV(); 11543 sv_2mortal(MUTABLE_SV(RExC_paren_names)); 11544 #ifdef DEBUGGING 11545 RExC_paren_name_list= newAV(); 11546 sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); 11547 #endif 11548 } 11549 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); 11550 if ( he_str ) 11551 sv_dat = HeVAL(he_str); 11552 if ( ! sv_dat ) { 11553 /* croak baby croak */ 11554 Perl_croak(aTHX_ 11555 "panic: paren_name hash element allocation failed"); 11556 } else if ( SvPOK(sv_dat) ) { 11557 /* (?|...) can mean we have dupes so scan to check 11558 its already been stored. Maybe a flag indicating 11559 we are inside such a construct would be useful, 11560 but the arrays are likely to be quite small, so 11561 for now we punt -- dmq */ 11562 IV count = SvIV(sv_dat); 11563 I32 *pv = (I32*)SvPVX(sv_dat); 11564 IV i; 11565 for ( i = 0 ; i < count ; i++ ) { 11566 if ( pv[i] == RExC_npar ) { 11567 count = 0; 11568 break; 11569 } 11570 } 11571 if ( count ) { 11572 pv = (I32*)SvGROW(sv_dat, 11573 SvCUR(sv_dat) + sizeof(I32)+1); 11574 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); 11575 pv[count] = RExC_npar; 11576 SvIV_set(sv_dat, SvIVX(sv_dat) + 1); 11577 } 11578 } else { 11579 (void)SvUPGRADE(sv_dat, SVt_PVNV); 11580 sv_setpvn(sv_dat, (char *)&(RExC_npar), 11581 sizeof(I32)); 11582 SvIOK_on(sv_dat); 11583 SvIV_set(sv_dat, 1); 11584 } 11585 #ifdef DEBUGGING 11586 /* Yes this does cause a memory leak in debugging Perls 11587 * */ 11588 if (!av_store(RExC_paren_name_list, 11589 RExC_npar, SvREFCNT_inc_NN(svname))) 11590 SvREFCNT_dec_NN(svname); 11591 #endif 11592 11593 /*sv_dump(sv_dat);*/ 11594 } 11595 nextchar(pRExC_state); 11596 paren = 1; 11597 goto capturing_parens; 11598 } 11599 11600 RExC_seen |= REG_LOOKBEHIND_SEEN; 11601 RExC_in_lookbehind++; 11602 RExC_parse++; 11603 if (RExC_parse >= RExC_end) { 11604 vFAIL("Sequence (?... not terminated"); 11605 } 11606 11607 /* FALLTHROUGH */ 11608 case '=': /* (?=...) */ 11609 RExC_seen_zerolen++; 11610 break; 11611 case '!': /* (?!...) */ 11612 RExC_seen_zerolen++; 11613 /* check if we're really just a "FAIL" assertion */ 11614 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 11615 FALSE /* Don't force to /x */ ); 11616 if (*RExC_parse == ')') { 11617 ret=reganode(pRExC_state, OPFAIL, 0); 11618 nextchar(pRExC_state); 11619 return ret; 11620 } 11621 break; 11622 case '|': /* (?|...) */ 11623 /* branch reset, behave like a (?:...) except that 11624 buffers in alternations share the same numbers */ 11625 paren = ':'; 11626 after_freeze = freeze_paren = RExC_npar; 11627 11628 /* XXX This construct currently requires an extra pass. 11629 * Investigation would be required to see if that could be 11630 * changed */ 11631 REQUIRE_PARENS_PASS; 11632 break; 11633 case ':': /* (?:...) */ 11634 case '>': /* (?>...) */ 11635 break; 11636 case '$': /* (?$...) */ 11637 case '@': /* (?@...) */ 11638 vFAIL2("Sequence (?%c...) not implemented", (int)paren); 11639 break; 11640 case '0' : /* (?0) */ 11641 case 'R' : /* (?R) */ 11642 if (RExC_parse == RExC_end || *RExC_parse != ')') 11643 FAIL("Sequence (?R) not terminated"); 11644 num = 0; 11645 RExC_seen |= REG_RECURSE_SEEN; 11646 11647 /* XXX These constructs currently require an extra pass. 11648 * It probably could be changed */ 11649 REQUIRE_PARENS_PASS; 11650 11651 *flagp |= POSTPONED; 11652 goto gen_recurse_regop; 11653 /*notreached*/ 11654 /* named and numeric backreferences */ 11655 case '&': /* (?&NAME) */ 11656 parse_start = RExC_parse - 1; 11657 named_recursion: 11658 { 11659 SV *sv_dat = reg_scan_name(pRExC_state, 11660 REG_RSN_RETURN_DATA); 11661 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; 11662 } 11663 if (RExC_parse >= RExC_end || *RExC_parse != ')') 11664 vFAIL("Sequence (?&... not terminated"); 11665 goto gen_recurse_regop; 11666 /* NOTREACHED */ 11667 case '+': 11668 if (! inRANGE(RExC_parse[0], '1', '9')) { 11669 RExC_parse++; 11670 vFAIL("Illegal pattern"); 11671 } 11672 goto parse_recursion; 11673 /* NOTREACHED*/ 11674 case '-': /* (?-1) */ 11675 if (! inRANGE(RExC_parse[0], '1', '9')) { 11676 RExC_parse--; /* rewind to let it be handled later */ 11677 goto parse_flags; 11678 } 11679 /* FALLTHROUGH */ 11680 case '1': case '2': case '3': case '4': /* (?1) */ 11681 case '5': case '6': case '7': case '8': case '9': 11682 RExC_parse = (char *) seqstart + 1; /* Point to the digit */ 11683 parse_recursion: 11684 { 11685 bool is_neg = FALSE; 11686 UV unum; 11687 parse_start = RExC_parse - 1; /* MJD */ 11688 if (*RExC_parse == '-') { 11689 RExC_parse++; 11690 is_neg = TRUE; 11691 } 11692 endptr = RExC_end; 11693 if (grok_atoUV(RExC_parse, &unum, &endptr) 11694 && unum <= I32_MAX 11695 ) { 11696 num = (I32)unum; 11697 RExC_parse = (char*)endptr; 11698 } else 11699 num = I32_MAX; 11700 if (is_neg) { 11701 /* Some limit for num? */ 11702 num = -num; 11703 } 11704 } 11705 if (*RExC_parse!=')') 11706 vFAIL("Expecting close bracket"); 11707 11708 gen_recurse_regop: 11709 if ( paren == '-' ) { 11710 /* 11711 Diagram of capture buffer numbering. 11712 Top line is the normal capture buffer numbers 11713 Bottom line is the negative indexing as from 11714 the X (the (?-2)) 11715 11716 + 1 2 3 4 5 X 6 7 11717 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ 11718 - 5 4 3 2 1 X x x 11719 11720 */ 11721 num = RExC_npar + num; 11722 if (num < 1) { 11723 11724 /* It might be a forward reference; we can't fail until 11725 * we know, by completing the parse to get all the 11726 * groups, and then reparsing */ 11727 if (ALL_PARENS_COUNTED) { 11728 RExC_parse++; 11729 vFAIL("Reference to nonexistent group"); 11730 } 11731 else { 11732 REQUIRE_PARENS_PASS; 11733 } 11734 } 11735 } else if ( paren == '+' ) { 11736 num = RExC_npar + num - 1; 11737 } 11738 /* We keep track how many GOSUB items we have produced. 11739 To start off the ARG2L() of the GOSUB holds its "id", 11740 which is used later in conjunction with RExC_recurse 11741 to calculate the offset we need to jump for the GOSUB, 11742 which it will store in the final representation. 11743 We have to defer the actual calculation until much later 11744 as the regop may move. 11745 */ 11746 11747 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count); 11748 if (num >= RExC_npar) { 11749 11750 /* It might be a forward reference; we can't fail until we 11751 * know, by completing the parse to get all the groups, and 11752 * then reparsing */ 11753 if (ALL_PARENS_COUNTED) { 11754 if (num >= RExC_total_parens) { 11755 RExC_parse++; 11756 vFAIL("Reference to nonexistent group"); 11757 } 11758 } 11759 else { 11760 REQUIRE_PARENS_PASS; 11761 } 11762 } 11763 RExC_recurse_count++; 11764 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 11765 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n", 11766 22, "| |", (int)(depth * 2 + 1), "", 11767 (UV)ARG(REGNODE_p(ret)), 11768 (IV)ARG2L(REGNODE_p(ret)))); 11769 RExC_seen |= REG_RECURSE_SEEN; 11770 11771 Set_Node_Length(REGNODE_p(ret), 11772 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */ 11773 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */ 11774 11775 *flagp |= POSTPONED; 11776 assert(*RExC_parse == ')'); 11777 nextchar(pRExC_state); 11778 return ret; 11779 11780 /* NOTREACHED */ 11781 11782 case '?': /* (??...) */ 11783 is_logical = 1; 11784 if (*RExC_parse != '{') { 11785 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end); 11786 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 11787 vFAIL2utf8f( 11788 "Sequence (%" UTF8f "...) not recognized", 11789 UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); 11790 NOT_REACHED; /*NOTREACHED*/ 11791 } 11792 *flagp |= POSTPONED; 11793 paren = '{'; 11794 RExC_parse++; 11795 /* FALLTHROUGH */ 11796 case '{': /* (?{...}) */ 11797 { 11798 U32 n = 0; 11799 struct reg_code_block *cb; 11800 OP * o; 11801 11802 RExC_seen_zerolen++; 11803 11804 if ( !pRExC_state->code_blocks 11805 || pRExC_state->code_index 11806 >= pRExC_state->code_blocks->count 11807 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start 11808 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) 11809 - RExC_start) 11810 ) { 11811 if (RExC_pm_flags & PMf_USE_RE_EVAL) 11812 FAIL("panic: Sequence (?{...}): no code block found\n"); 11813 FAIL("Eval-group not allowed at runtime, use re 'eval'"); 11814 } 11815 /* this is a pre-compiled code block (?{...}) */ 11816 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index]; 11817 RExC_parse = RExC_start + cb->end; 11818 o = cb->block; 11819 if (cb->src_regex) { 11820 n = add_data(pRExC_state, STR_WITH_LEN("rl")); 11821 RExC_rxi->data->data[n] = 11822 (void*)SvREFCNT_inc((SV*)cb->src_regex); 11823 RExC_rxi->data->data[n+1] = (void*)o; 11824 } 11825 else { 11826 n = add_data(pRExC_state, 11827 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); 11828 RExC_rxi->data->data[n] = (void*)o; 11829 } 11830 pRExC_state->code_index++; 11831 nextchar(pRExC_state); 11832 11833 if (is_logical) { 11834 regnode_offset eval; 11835 ret = reg_node(pRExC_state, LOGICAL); 11836 11837 eval = reg2Lanode(pRExC_state, EVAL, 11838 n, 11839 11840 /* for later propagation into (??{}) 11841 * return value */ 11842 RExC_flags & RXf_PMf_COMPILETIME 11843 ); 11844 FLAGS(REGNODE_p(ret)) = 2; 11845 if (! REGTAIL(pRExC_state, ret, eval)) { 11846 REQUIRE_BRANCHJ(flagp, 0); 11847 } 11848 /* deal with the length of this later - MJD */ 11849 return ret; 11850 } 11851 ret = reg2Lanode(pRExC_state, EVAL, n, 0); 11852 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); 11853 Set_Node_Offset(REGNODE_p(ret), parse_start); 11854 return ret; 11855 } 11856 case '(': /* (?(?{...})...) and (?(?=...)...) */ 11857 { 11858 int is_define= 0; 11859 const int DEFINE_len = sizeof("DEFINE") - 1; 11860 if ( RExC_parse < RExC_end - 1 11861 && ( ( RExC_parse[0] == '?' /* (?(?...)) */ 11862 && ( RExC_parse[1] == '=' 11863 || RExC_parse[1] == '!' 11864 || RExC_parse[1] == '<' 11865 || RExC_parse[1] == '{')) 11866 || ( RExC_parse[0] == '*' /* (?(*...)) */ 11867 && ( memBEGINs(RExC_parse + 1, 11868 (Size_t) (RExC_end - (RExC_parse + 1)), 11869 "pla:") 11870 || memBEGINs(RExC_parse + 1, 11871 (Size_t) (RExC_end - (RExC_parse + 1)), 11872 "plb:") 11873 || memBEGINs(RExC_parse + 1, 11874 (Size_t) (RExC_end - (RExC_parse + 1)), 11875 "nla:") 11876 || memBEGINs(RExC_parse + 1, 11877 (Size_t) (RExC_end - (RExC_parse + 1)), 11878 "nlb:") 11879 || memBEGINs(RExC_parse + 1, 11880 (Size_t) (RExC_end - (RExC_parse + 1)), 11881 "positive_lookahead:") 11882 || memBEGINs(RExC_parse + 1, 11883 (Size_t) (RExC_end - (RExC_parse + 1)), 11884 "positive_lookbehind:") 11885 || memBEGINs(RExC_parse + 1, 11886 (Size_t) (RExC_end - (RExC_parse + 1)), 11887 "negative_lookahead:") 11888 || memBEGINs(RExC_parse + 1, 11889 (Size_t) (RExC_end - (RExC_parse + 1)), 11890 "negative_lookbehind:")))) 11891 ) { /* Lookahead or eval. */ 11892 I32 flag; 11893 regnode_offset tail; 11894 11895 ret = reg_node(pRExC_state, LOGICAL); 11896 FLAGS(REGNODE_p(ret)) = 1; 11897 11898 tail = reg(pRExC_state, 1, &flag, depth+1); 11899 RETURN_FAIL_ON_RESTART(flag, flagp); 11900 if (! REGTAIL(pRExC_state, ret, tail)) { 11901 REQUIRE_BRANCHJ(flagp, 0); 11902 } 11903 goto insert_if; 11904 } 11905 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */ 11906 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ 11907 { 11908 char ch = RExC_parse[0] == '<' ? '>' : '\''; 11909 char *name_start= RExC_parse++; 11910 U32 num = 0; 11911 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA); 11912 if ( RExC_parse == name_start 11913 || RExC_parse >= RExC_end 11914 || *RExC_parse != ch) 11915 { 11916 vFAIL2("Sequence (?(%c... not terminated", 11917 (ch == '>' ? '<' : ch)); 11918 } 11919 RExC_parse++; 11920 if (sv_dat) { 11921 num = add_data( pRExC_state, STR_WITH_LEN("S")); 11922 RExC_rxi->data->data[num]=(void*)sv_dat; 11923 SvREFCNT_inc_simple_void_NN(sv_dat); 11924 } 11925 ret = reganode(pRExC_state, NGROUPP, num); 11926 goto insert_if_check_paren; 11927 } 11928 else if (memBEGINs(RExC_parse, 11929 (STRLEN) (RExC_end - RExC_parse), 11930 "DEFINE")) 11931 { 11932 ret = reganode(pRExC_state, DEFINEP, 0); 11933 RExC_parse += DEFINE_len; 11934 is_define = 1; 11935 goto insert_if_check_paren; 11936 } 11937 else if (RExC_parse[0] == 'R') { 11938 RExC_parse++; 11939 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval" 11940 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)" 11941 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)" 11942 */ 11943 parno = 0; 11944 if (RExC_parse[0] == '0') { 11945 parno = 1; 11946 RExC_parse++; 11947 } 11948 else if (inRANGE(RExC_parse[0], '1', '9')) { 11949 UV uv; 11950 endptr = RExC_end; 11951 if (grok_atoUV(RExC_parse, &uv, &endptr) 11952 && uv <= I32_MAX 11953 ) { 11954 parno = (I32)uv + 1; 11955 RExC_parse = (char*)endptr; 11956 } 11957 /* else "Switch condition not recognized" below */ 11958 } else if (RExC_parse[0] == '&') { 11959 SV *sv_dat; 11960 RExC_parse++; 11961 sv_dat = reg_scan_name(pRExC_state, 11962 REG_RSN_RETURN_DATA); 11963 if (sv_dat) 11964 parno = 1 + *((I32 *)SvPVX(sv_dat)); 11965 } 11966 ret = reganode(pRExC_state, INSUBP, parno); 11967 goto insert_if_check_paren; 11968 } 11969 else if (inRANGE(RExC_parse[0], '1', '9')) { 11970 /* (?(1)...) */ 11971 char c; 11972 UV uv; 11973 endptr = RExC_end; 11974 if (grok_atoUV(RExC_parse, &uv, &endptr) 11975 && uv <= I32_MAX 11976 ) { 11977 parno = (I32)uv; 11978 RExC_parse = (char*)endptr; 11979 } 11980 else { 11981 vFAIL("panic: grok_atoUV returned FALSE"); 11982 } 11983 ret = reganode(pRExC_state, GROUPP, parno); 11984 11985 insert_if_check_paren: 11986 if (UCHARAT(RExC_parse) != ')') { 11987 RExC_parse += UTF 11988 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) 11989 : 1; 11990 vFAIL("Switch condition not recognized"); 11991 } 11992 nextchar(pRExC_state); 11993 insert_if: 11994 if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state, 11995 IFTHEN, 0))) 11996 { 11997 REQUIRE_BRANCHJ(flagp, 0); 11998 } 11999 br = regbranch(pRExC_state, &flags, 1, depth+1); 12000 if (br == 0) { 12001 RETURN_FAIL_ON_RESTART(flags,flagp); 12002 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, 12003 (UV) flags); 12004 } else 12005 if (! REGTAIL(pRExC_state, br, reganode(pRExC_state, 12006 LONGJMP, 0))) 12007 { 12008 REQUIRE_BRANCHJ(flagp, 0); 12009 } 12010 c = UCHARAT(RExC_parse); 12011 nextchar(pRExC_state); 12012 if (flags&HASWIDTH) 12013 *flagp |= HASWIDTH; 12014 if (c == '|') { 12015 if (is_define) 12016 vFAIL("(?(DEFINE)....) does not allow branches"); 12017 12018 /* Fake one for optimizer. */ 12019 lastbr = reganode(pRExC_state, IFTHEN, 0); 12020 12021 if (!regbranch(pRExC_state, &flags, 1, depth+1)) { 12022 RETURN_FAIL_ON_RESTART(flags, flagp); 12023 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, 12024 (UV) flags); 12025 } 12026 if (! REGTAIL(pRExC_state, ret, lastbr)) { 12027 REQUIRE_BRANCHJ(flagp, 0); 12028 } 12029 if (flags&HASWIDTH) 12030 *flagp |= HASWIDTH; 12031 c = UCHARAT(RExC_parse); 12032 nextchar(pRExC_state); 12033 } 12034 else 12035 lastbr = 0; 12036 if (c != ')') { 12037 if (RExC_parse >= RExC_end) 12038 vFAIL("Switch (?(condition)... not terminated"); 12039 else 12040 vFAIL("Switch (?(condition)... contains too many branches"); 12041 } 12042 ender = reg_node(pRExC_state, TAIL); 12043 if (! REGTAIL(pRExC_state, br, ender)) { 12044 REQUIRE_BRANCHJ(flagp, 0); 12045 } 12046 if (lastbr) { 12047 if (! REGTAIL(pRExC_state, lastbr, ender)) { 12048 REQUIRE_BRANCHJ(flagp, 0); 12049 } 12050 if (! REGTAIL(pRExC_state, 12051 REGNODE_OFFSET( 12052 NEXTOPER( 12053 NEXTOPER(REGNODE_p(lastbr)))), 12054 ender)) 12055 { 12056 REQUIRE_BRANCHJ(flagp, 0); 12057 } 12058 } 12059 else 12060 if (! REGTAIL(pRExC_state, ret, ender)) { 12061 REQUIRE_BRANCHJ(flagp, 0); 12062 } 12063 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */ 12064 RExC_size++; /* XXX WHY do we need this?!! 12065 For large programs it seems to be required 12066 but I can't figure out why. -- dmq*/ 12067 #endif 12068 return ret; 12069 } 12070 RExC_parse += UTF 12071 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) 12072 : 1; 12073 vFAIL("Unknown switch condition (?(...))"); 12074 } 12075 case '[': /* (?[ ... ]) */ 12076 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1, 12077 oregcomp_parse); 12078 case 0: /* A NUL */ 12079 RExC_parse--; /* for vFAIL to print correctly */ 12080 vFAIL("Sequence (? incomplete"); 12081 break; 12082 12083 case ')': 12084 if (RExC_strict) { /* [perl #132851] */ 12085 ckWARNreg(RExC_parse, "Empty (?) without any modifiers"); 12086 } 12087 /* FALLTHROUGH */ 12088 default: /* e.g., (?i) */ 12089 RExC_parse = (char *) seqstart + 1; 12090 parse_flags: 12091 parse_lparen_question_flags(pRExC_state); 12092 if (UCHARAT(RExC_parse) != ':') { 12093 if (RExC_parse < RExC_end) 12094 nextchar(pRExC_state); 12095 *flagp = TRYAGAIN; 12096 return 0; 12097 } 12098 paren = ':'; 12099 nextchar(pRExC_state); 12100 ret = 0; 12101 goto parse_rest; 12102 } /* end switch */ 12103 } 12104 else { 12105 if (*RExC_parse == '{') { 12106 ckWARNregdep(RExC_parse + 1, 12107 "Unescaped left brace in regex is " 12108 "deprecated here (and will be fatal " 12109 "in Perl 5.32), passed through"); 12110 } 12111 /* Not bothering to indent here, as the above 'else' is temporary 12112 * */ 12113 if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */ 12114 capturing_parens: 12115 parno = RExC_npar; 12116 RExC_npar++; 12117 if (! ALL_PARENS_COUNTED) { 12118 /* If we are in our first pass through (and maybe only pass), 12119 * we need to allocate memory for the capturing parentheses 12120 * data structures. 12121 */ 12122 12123 if (!RExC_parens_buf_size) { 12124 /* first guess at number of parens we might encounter */ 12125 RExC_parens_buf_size = 10; 12126 12127 /* setup RExC_open_parens, which holds the address of each 12128 * OPEN tag, and to make things simpler for the 0 index the 12129 * start of the program - this is used later for offsets */ 12130 Newxz(RExC_open_parens, RExC_parens_buf_size, 12131 regnode_offset); 12132 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */ 12133 12134 /* setup RExC_close_parens, which holds the address of each 12135 * CLOSE tag, and to make things simpler for the 0 index 12136 * the end of the program - this is used later for offsets 12137 * */ 12138 Newxz(RExC_close_parens, RExC_parens_buf_size, 12139 regnode_offset); 12140 /* we dont know where end op starts yet, so we dont need to 12141 * set RExC_close_parens[0] like we do RExC_open_parens[0] 12142 * above */ 12143 } 12144 else if (RExC_npar > RExC_parens_buf_size) { 12145 I32 old_size = RExC_parens_buf_size; 12146 12147 RExC_parens_buf_size *= 2; 12148 12149 Renew(RExC_open_parens, RExC_parens_buf_size, 12150 regnode_offset); 12151 Zero(RExC_open_parens + old_size, 12152 RExC_parens_buf_size - old_size, regnode_offset); 12153 12154 Renew(RExC_close_parens, RExC_parens_buf_size, 12155 regnode_offset); 12156 Zero(RExC_close_parens + old_size, 12157 RExC_parens_buf_size - old_size, regnode_offset); 12158 } 12159 } 12160 12161 ret = reganode(pRExC_state, OPEN, parno); 12162 if (!RExC_nestroot) 12163 RExC_nestroot = parno; 12164 if (RExC_open_parens && !RExC_open_parens[parno]) 12165 { 12166 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 12167 "%*s%*s Setting open paren #%" IVdf " to %d\n", 12168 22, "| |", (int)(depth * 2 + 1), "", 12169 (IV)parno, ret)); 12170 RExC_open_parens[parno]= ret; 12171 } 12172 12173 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ 12174 Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */ 12175 is_open = 1; 12176 } else { 12177 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */ 12178 paren = ':'; 12179 ret = 0; 12180 } 12181 } 12182 } 12183 else /* ! paren */ 12184 ret = 0; 12185 12186 parse_rest: 12187 /* Pick up the branches, linking them together. */ 12188 parse_start = RExC_parse; /* MJD */ 12189 br = regbranch(pRExC_state, &flags, 1, depth+1); 12190 12191 /* branch_len = (paren != 0); */ 12192 12193 if (br == 0) { 12194 RETURN_FAIL_ON_RESTART(flags, flagp); 12195 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags); 12196 } 12197 if (*RExC_parse == '|') { 12198 if (RExC_use_BRANCHJ) { 12199 reginsert(pRExC_state, BRANCHJ, br, depth+1); 12200 } 12201 else { /* MJD */ 12202 reginsert(pRExC_state, BRANCH, br, depth+1); 12203 Set_Node_Length(REGNODE_p(br), paren != 0); 12204 Set_Node_Offset_To_R(br, parse_start-RExC_start); 12205 } 12206 have_branch = 1; 12207 } 12208 else if (paren == ':') { 12209 *flagp |= flags&SIMPLE; 12210 } 12211 if (is_open) { /* Starts with OPEN. */ 12212 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */ 12213 REQUIRE_BRANCHJ(flagp, 0); 12214 } 12215 } 12216 else if (paren != '?') /* Not Conditional */ 12217 ret = br; 12218 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); 12219 lastbr = br; 12220 while (*RExC_parse == '|') { 12221 if (RExC_use_BRANCHJ) { 12222 bool shut_gcc_up; 12223 12224 ender = reganode(pRExC_state, LONGJMP, 0); 12225 12226 /* Append to the previous. */ 12227 shut_gcc_up = REGTAIL(pRExC_state, 12228 REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))), 12229 ender); 12230 PERL_UNUSED_VAR(shut_gcc_up); 12231 } 12232 nextchar(pRExC_state); 12233 if (freeze_paren) { 12234 if (RExC_npar > after_freeze) 12235 after_freeze = RExC_npar; 12236 RExC_npar = freeze_paren; 12237 } 12238 br = regbranch(pRExC_state, &flags, 0, depth+1); 12239 12240 if (br == 0) { 12241 RETURN_FAIL_ON_RESTART(flags, flagp); 12242 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags); 12243 } 12244 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */ 12245 REQUIRE_BRANCHJ(flagp, 0); 12246 } 12247 lastbr = br; 12248 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); 12249 } 12250 12251 if (have_branch || paren != ':') { 12252 regnode * br; 12253 12254 /* Make a closing node, and hook it on the end. */ 12255 switch (paren) { 12256 case ':': 12257 ender = reg_node(pRExC_state, TAIL); 12258 break; 12259 case 1: case 2: 12260 ender = reganode(pRExC_state, CLOSE, parno); 12261 if ( RExC_close_parens ) { 12262 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 12263 "%*s%*s Setting close paren #%" IVdf " to %d\n", 12264 22, "| |", (int)(depth * 2 + 1), "", 12265 (IV)parno, ender)); 12266 RExC_close_parens[parno]= ender; 12267 if (RExC_nestroot == parno) 12268 RExC_nestroot = 0; 12269 } 12270 Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */ 12271 Set_Node_Length(REGNODE_p(ender), 1); /* MJD */ 12272 break; 12273 case 's': 12274 ender = reg_node(pRExC_state, SRCLOSE); 12275 RExC_in_script_run = 0; 12276 break; 12277 case '<': 12278 case 'a': 12279 case 'A': 12280 case 'b': 12281 case 'B': 12282 case ',': 12283 case '=': 12284 case '!': 12285 *flagp &= ~HASWIDTH; 12286 /* FALLTHROUGH */ 12287 case 't': /* aTomic */ 12288 case '>': 12289 ender = reg_node(pRExC_state, SUCCEED); 12290 break; 12291 case 0: 12292 ender = reg_node(pRExC_state, END); 12293 assert(!RExC_end_op); /* there can only be one! */ 12294 RExC_end_op = REGNODE_p(ender); 12295 if (RExC_close_parens) { 12296 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 12297 "%*s%*s Setting close paren #0 (END) to %d\n", 12298 22, "| |", (int)(depth * 2 + 1), "", 12299 ender)); 12300 12301 RExC_close_parens[0]= ender; 12302 } 12303 break; 12304 } 12305 DEBUG_PARSE_r({ 12306 DEBUG_PARSE_MSG("lsbr"); 12307 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state); 12308 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state); 12309 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n", 12310 SvPV_nolen_const(RExC_mysv1), 12311 (IV)lastbr, 12312 SvPV_nolen_const(RExC_mysv2), 12313 (IV)ender, 12314 (IV)(ender - lastbr) 12315 ); 12316 }); 12317 if (! REGTAIL(pRExC_state, lastbr, ender)) { 12318 REQUIRE_BRANCHJ(flagp, 0); 12319 } 12320 12321 if (have_branch) { 12322 char is_nothing= 1; 12323 if (depth==1) 12324 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; 12325 12326 /* Hook the tails of the branches to the closing node. */ 12327 for (br = REGNODE_p(ret); br; br = regnext(br)) { 12328 const U8 op = PL_regkind[OP(br)]; 12329 if (op == BRANCH) { 12330 if (! REGTAIL_STUDY(pRExC_state, 12331 REGNODE_OFFSET(NEXTOPER(br)), 12332 ender)) 12333 { 12334 REQUIRE_BRANCHJ(flagp, 0); 12335 } 12336 if ( OP(NEXTOPER(br)) != NOTHING 12337 || regnext(NEXTOPER(br)) != REGNODE_p(ender)) 12338 is_nothing= 0; 12339 } 12340 else if (op == BRANCHJ) { 12341 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state, 12342 REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))), 12343 ender); 12344 PERL_UNUSED_VAR(shut_gcc_up); 12345 /* for now we always disable this optimisation * / 12346 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING 12347 || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender)) 12348 */ 12349 is_nothing= 0; 12350 } 12351 } 12352 if (is_nothing) { 12353 regnode * ret_as_regnode = REGNODE_p(ret); 12354 br= PL_regkind[OP(ret_as_regnode)] != BRANCH 12355 ? regnext(ret_as_regnode) 12356 : ret_as_regnode; 12357 DEBUG_PARSE_r({ 12358 DEBUG_PARSE_MSG("NADA"); 12359 regprop(RExC_rx, RExC_mysv1, ret_as_regnode, 12360 NULL, pRExC_state); 12361 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), 12362 NULL, pRExC_state); 12363 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n", 12364 SvPV_nolen_const(RExC_mysv1), 12365 (IV)REG_NODE_NUM(ret_as_regnode), 12366 SvPV_nolen_const(RExC_mysv2), 12367 (IV)ender, 12368 (IV)(ender - ret) 12369 ); 12370 }); 12371 OP(br)= NOTHING; 12372 if (OP(REGNODE_p(ender)) == TAIL) { 12373 NEXT_OFF(br)= 0; 12374 RExC_emit= REGNODE_OFFSET(br) + 1; 12375 } else { 12376 regnode *opt; 12377 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ ) 12378 OP(opt)= OPTIMIZED; 12379 NEXT_OFF(br)= REGNODE_p(ender) - br; 12380 } 12381 } 12382 } 12383 } 12384 12385 { 12386 const char *p; 12387 /* Even/odd or x=don't care: 010101x10x */ 12388 static const char parens[] = "=!aA<,>Bbt"; 12389 /* flag below is set to 0 up through 'A'; 1 for larger */ 12390 12391 if (paren && (p = strchr(parens, paren))) { 12392 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; 12393 int flag = (p - parens) > 3; 12394 12395 if (paren == '>' || paren == 't') { 12396 node = SUSPEND, flag = 0; 12397 } 12398 12399 reginsert(pRExC_state, node, ret, depth+1); 12400 Set_Node_Cur_Length(REGNODE_p(ret), parse_start); 12401 Set_Node_Offset(REGNODE_p(ret), parse_start + 1); 12402 FLAGS(REGNODE_p(ret)) = flag; 12403 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL))) 12404 { 12405 REQUIRE_BRANCHJ(flagp, 0); 12406 } 12407 } 12408 } 12409 12410 /* Check for proper termination. */ 12411 if (paren) { 12412 /* restore original flags, but keep (?p) and, if we've encountered 12413 * something in the parse that changes /d rules into /u, keep the /u */ 12414 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); 12415 if (DEPENDS_SEMANTICS && RExC_uni_semantics) { 12416 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); 12417 } 12418 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') { 12419 RExC_parse = oregcomp_parse; 12420 vFAIL("Unmatched ("); 12421 } 12422 nextchar(pRExC_state); 12423 } 12424 else if (!paren && RExC_parse < RExC_end) { 12425 if (*RExC_parse == ')') { 12426 RExC_parse++; 12427 vFAIL("Unmatched )"); 12428 } 12429 else 12430 FAIL("Junk on end of regexp"); /* "Can't happen". */ 12431 NOT_REACHED; /* NOTREACHED */ 12432 } 12433 12434 if (RExC_in_lookbehind) { 12435 RExC_in_lookbehind--; 12436 } 12437 if (after_freeze > RExC_npar) 12438 RExC_npar = after_freeze; 12439 return(ret); 12440 } 12441 12442 /* 12443 - regbranch - one alternative of an | operator 12444 * 12445 * Implements the concatenation operator. 12446 * 12447 * On success, returns the offset at which any next node should be placed into 12448 * the regex engine program being compiled. 12449 * 12450 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs 12451 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to 12452 * UTF-8 12453 */ 12454 STATIC regnode_offset 12455 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) 12456 { 12457 regnode_offset ret; 12458 regnode_offset chain = 0; 12459 regnode_offset latest; 12460 I32 flags = 0, c = 0; 12461 GET_RE_DEBUG_FLAGS_DECL; 12462 12463 PERL_ARGS_ASSERT_REGBRANCH; 12464 12465 DEBUG_PARSE("brnc"); 12466 12467 if (first) 12468 ret = 0; 12469 else { 12470 if (RExC_use_BRANCHJ) 12471 ret = reganode(pRExC_state, BRANCHJ, 0); 12472 else { 12473 ret = reg_node(pRExC_state, BRANCH); 12474 Set_Node_Length(REGNODE_p(ret), 1); 12475 } 12476 } 12477 12478 *flagp = WORST; /* Tentatively. */ 12479 12480 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 12481 FALSE /* Don't force to /x */ ); 12482 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { 12483 flags &= ~TRYAGAIN; 12484 latest = regpiece(pRExC_state, &flags, depth+1); 12485 if (latest == 0) { 12486 if (flags & TRYAGAIN) 12487 continue; 12488 RETURN_FAIL_ON_RESTART(flags, flagp); 12489 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags); 12490 } 12491 else if (ret == 0) 12492 ret = latest; 12493 *flagp |= flags&(HASWIDTH|POSTPONED); 12494 if (chain == 0) /* First piece. */ 12495 *flagp |= flags&SPSTART; 12496 else { 12497 /* FIXME adding one for every branch after the first is probably 12498 * excessive now we have TRIE support. (hv) */ 12499 MARK_NAUGHTY(1); 12500 if (! REGTAIL(pRExC_state, chain, latest)) { 12501 /* XXX We could just redo this branch, but figuring out what 12502 * bookkeeping needs to be reset is a pain, and it's likely 12503 * that other branches that goto END will also be too large */ 12504 REQUIRE_BRANCHJ(flagp, 0); 12505 } 12506 } 12507 chain = latest; 12508 c++; 12509 } 12510 if (chain == 0) { /* Loop ran zero times. */ 12511 chain = reg_node(pRExC_state, NOTHING); 12512 if (ret == 0) 12513 ret = chain; 12514 } 12515 if (c == 1) { 12516 *flagp |= flags&SIMPLE; 12517 } 12518 12519 return ret; 12520 } 12521 12522 /* 12523 - regpiece - something followed by possible quantifier * + ? {n,m} 12524 * 12525 * Note that the branching code sequences used for ? and the general cases 12526 * of * and + are somewhat optimized: they use the same NOTHING node as 12527 * both the endmarker for their branch list and the body of the last branch. 12528 * It might seem that this node could be dispensed with entirely, but the 12529 * endmarker role is not redundant. 12530 * 12531 * On success, returns the offset at which any next node should be placed into 12532 * the regex engine program being compiled. 12533 * 12534 * Returns 0 otherwise, with *flagp set to indicate why: 12535 * TRYAGAIN if regatom() returns 0 with TRYAGAIN. 12536 * RESTART_PARSE if the parse needs to be restarted, or'd with 12537 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8. 12538 */ 12539 STATIC regnode_offset 12540 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 12541 { 12542 regnode_offset ret; 12543 char op; 12544 char *next; 12545 I32 flags; 12546 const char * const origparse = RExC_parse; 12547 I32 min; 12548 I32 max = REG_INFTY; 12549 #ifdef RE_TRACK_PATTERN_OFFSETS 12550 char *parse_start; 12551 #endif 12552 const char *maxpos = NULL; 12553 UV uv; 12554 12555 /* Save the original in case we change the emitted regop to a FAIL. */ 12556 const regnode_offset orig_emit = RExC_emit; 12557 12558 GET_RE_DEBUG_FLAGS_DECL; 12559 12560 PERL_ARGS_ASSERT_REGPIECE; 12561 12562 DEBUG_PARSE("piec"); 12563 12564 ret = regatom(pRExC_state, &flags, depth+1); 12565 if (ret == 0) { 12566 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN); 12567 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags); 12568 } 12569 12570 op = *RExC_parse; 12571 12572 if (op == '{' && regcurly(RExC_parse)) { 12573 maxpos = NULL; 12574 #ifdef RE_TRACK_PATTERN_OFFSETS 12575 parse_start = RExC_parse; /* MJD */ 12576 #endif 12577 next = RExC_parse + 1; 12578 while (isDIGIT(*next) || *next == ',') { 12579 if (*next == ',') { 12580 if (maxpos) 12581 break; 12582 else 12583 maxpos = next; 12584 } 12585 next++; 12586 } 12587 if (*next == '}') { /* got one */ 12588 const char* endptr; 12589 if (!maxpos) 12590 maxpos = next; 12591 RExC_parse++; 12592 if (isDIGIT(*RExC_parse)) { 12593 endptr = RExC_end; 12594 if (!grok_atoUV(RExC_parse, &uv, &endptr)) 12595 vFAIL("Invalid quantifier in {,}"); 12596 if (uv >= REG_INFTY) 12597 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 12598 min = (I32)uv; 12599 } else { 12600 min = 0; 12601 } 12602 if (*maxpos == ',') 12603 maxpos++; 12604 else 12605 maxpos = RExC_parse; 12606 if (isDIGIT(*maxpos)) { 12607 endptr = RExC_end; 12608 if (!grok_atoUV(maxpos, &uv, &endptr)) 12609 vFAIL("Invalid quantifier in {,}"); 12610 if (uv >= REG_INFTY) 12611 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 12612 max = (I32)uv; 12613 } else { 12614 max = REG_INFTY; /* meaning "infinity" */ 12615 } 12616 RExC_parse = next; 12617 nextchar(pRExC_state); 12618 if (max < min) { /* If can't match, warn and optimize to fail 12619 unconditionally */ 12620 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1); 12621 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); 12622 NEXT_OFF(REGNODE_p(orig_emit)) = 12623 regarglen[OPFAIL] + NODE_STEP_REGNODE; 12624 return ret; 12625 } 12626 else if (min == max && *RExC_parse == '?') 12627 { 12628 ckWARN2reg(RExC_parse + 1, 12629 "Useless use of greediness modifier '%c'", 12630 *RExC_parse); 12631 } 12632 12633 do_curly: 12634 if ((flags&SIMPLE)) { 12635 if (min == 0 && max == REG_INFTY) { 12636 reginsert(pRExC_state, STAR, ret, depth+1); 12637 MARK_NAUGHTY(4); 12638 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; 12639 goto nest_check; 12640 } 12641 if (min == 1 && max == REG_INFTY) { 12642 reginsert(pRExC_state, PLUS, ret, depth+1); 12643 MARK_NAUGHTY(3); 12644 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; 12645 goto nest_check; 12646 } 12647 MARK_NAUGHTY_EXP(2, 2); 12648 reginsert(pRExC_state, CURLY, ret, depth+1); 12649 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */ 12650 Set_Node_Cur_Length(REGNODE_p(ret), parse_start); 12651 } 12652 else { 12653 const regnode_offset w = reg_node(pRExC_state, WHILEM); 12654 12655 FLAGS(REGNODE_p(w)) = 0; 12656 if (! REGTAIL(pRExC_state, ret, w)) { 12657 REQUIRE_BRANCHJ(flagp, 0); 12658 } 12659 if (RExC_use_BRANCHJ) { 12660 reginsert(pRExC_state, LONGJMP, ret, depth+1); 12661 reginsert(pRExC_state, NOTHING, ret, depth+1); 12662 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */ 12663 } 12664 reginsert(pRExC_state, CURLYX, ret, depth+1); 12665 /* MJD hk */ 12666 Set_Node_Offset(REGNODE_p(ret), parse_start+1); 12667 Set_Node_Length(REGNODE_p(ret), 12668 op == '{' ? (RExC_parse - parse_start) : 1); 12669 12670 if (RExC_use_BRANCHJ) 12671 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to 12672 LONGJMP. */ 12673 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state, 12674 NOTHING))) 12675 { 12676 REQUIRE_BRANCHJ(flagp, 0); 12677 } 12678 RExC_whilem_seen++; 12679 MARK_NAUGHTY_EXP(1, 4); /* compound interest */ 12680 } 12681 FLAGS(REGNODE_p(ret)) = 0; 12682 12683 if (min > 0) 12684 *flagp = WORST; 12685 if (max > 0) 12686 *flagp |= HASWIDTH; 12687 ARG1_SET(REGNODE_p(ret), (U16)min); 12688 ARG2_SET(REGNODE_p(ret), (U16)max); 12689 if (max == REG_INFTY) 12690 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; 12691 12692 goto nest_check; 12693 } 12694 } 12695 12696 if (!ISMULT1(op)) { 12697 *flagp = flags; 12698 return(ret); 12699 } 12700 12701 #if 0 /* Now runtime fix should be reliable. */ 12702 12703 /* if this is reinstated, don't forget to put this back into perldiag: 12704 12705 =item Regexp *+ operand could be empty at {#} in regex m/%s/ 12706 12707 (F) The part of the regexp subject to either the * or + quantifier 12708 could match an empty string. The {#} shows in the regular 12709 expression about where the problem was discovered. 12710 12711 */ 12712 12713 if (!(flags&HASWIDTH) && op != '?') 12714 vFAIL("Regexp *+ operand could be empty"); 12715 #endif 12716 12717 #ifdef RE_TRACK_PATTERN_OFFSETS 12718 parse_start = RExC_parse; 12719 #endif 12720 nextchar(pRExC_state); 12721 12722 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); 12723 12724 if (op == '*') { 12725 min = 0; 12726 goto do_curly; 12727 } 12728 else if (op == '+') { 12729 min = 1; 12730 goto do_curly; 12731 } 12732 else if (op == '?') { 12733 min = 0; max = 1; 12734 goto do_curly; 12735 } 12736 nest_check: 12737 if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { 12738 ckWARN2reg(RExC_parse, 12739 "%" UTF8f " matches null string many times", 12740 UTF8fARG(UTF, (RExC_parse >= origparse 12741 ? RExC_parse - origparse 12742 : 0), 12743 origparse)); 12744 } 12745 12746 if (*RExC_parse == '?') { 12747 nextchar(pRExC_state); 12748 reginsert(pRExC_state, MINMOD, ret, depth+1); 12749 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) { 12750 REQUIRE_BRANCHJ(flagp, 0); 12751 } 12752 } 12753 else if (*RExC_parse == '+') { 12754 regnode_offset ender; 12755 nextchar(pRExC_state); 12756 ender = reg_node(pRExC_state, SUCCEED); 12757 if (! REGTAIL(pRExC_state, ret, ender)) { 12758 REQUIRE_BRANCHJ(flagp, 0); 12759 } 12760 reginsert(pRExC_state, SUSPEND, ret, depth+1); 12761 ender = reg_node(pRExC_state, TAIL); 12762 if (! REGTAIL(pRExC_state, ret, ender)) { 12763 REQUIRE_BRANCHJ(flagp, 0); 12764 } 12765 } 12766 12767 if (ISMULT2(RExC_parse)) { 12768 RExC_parse++; 12769 vFAIL("Nested quantifiers"); 12770 } 12771 12772 return(ret); 12773 } 12774 12775 STATIC bool 12776 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, 12777 regnode_offset * node_p, 12778 UV * code_point_p, 12779 int * cp_count, 12780 I32 * flagp, 12781 const bool strict, 12782 const U32 depth 12783 ) 12784 { 12785 /* This routine teases apart the various meanings of \N and returns 12786 * accordingly. The input parameters constrain which meaning(s) is/are valid 12787 * in the current context. 12788 * 12789 * Exactly one of <node_p> and <code_point_p> must be non-NULL. 12790 * 12791 * If <code_point_p> is not NULL, the context is expecting the result to be a 12792 * single code point. If this \N instance turns out to a single code point, 12793 * the function returns TRUE and sets *code_point_p to that code point. 12794 * 12795 * If <node_p> is not NULL, the context is expecting the result to be one of 12796 * the things representable by a regnode. If this \N instance turns out to be 12797 * one such, the function generates the regnode, returns TRUE and sets *node_p 12798 * to point to the offset of that regnode into the regex engine program being 12799 * compiled. 12800 * 12801 * If this instance of \N isn't legal in any context, this function will 12802 * generate a fatal error and not return. 12803 * 12804 * On input, RExC_parse should point to the first char following the \N at the 12805 * time of the call. On successful return, RExC_parse will have been updated 12806 * to point to just after the sequence identified by this routine. Also 12807 * *flagp has been updated as needed. 12808 * 12809 * When there is some problem with the current context and this \N instance, 12810 * the function returns FALSE, without advancing RExC_parse, nor setting 12811 * *node_p, nor *code_point_p, nor *flagp. 12812 * 12813 * If <cp_count> is not NULL, the caller wants to know the length (in code 12814 * points) that this \N sequence matches. This is set, and the input is 12815 * parsed for errors, even if the function returns FALSE, as detailed below. 12816 * 12817 * There are 6 possibilities here, as detailed in the next 6 paragraphs. 12818 * 12819 * Probably the most common case is for the \N to specify a single code point. 12820 * *cp_count will be set to 1, and *code_point_p will be set to that code 12821 * point. 12822 * 12823 * Another possibility is for the input to be an empty \N{}. This is no 12824 * longer accepted, and will generate a fatal error. 12825 * 12826 * Another possibility is for a custom charnames handler to be in effect which 12827 * translates the input name to an empty string. *cp_count will be set to 0. 12828 * *node_p will be set to a generated NOTHING node. 12829 * 12830 * Still another possibility is for the \N to mean [^\n]. *cp_count will be 12831 * set to 0. *node_p will be set to a generated REG_ANY node. 12832 * 12833 * The fifth possibility is that \N resolves to a sequence of more than one 12834 * code points. *cp_count will be set to the number of code points in the 12835 * sequence. *node_p will be set to a generated node returned by this 12836 * function calling S_reg(). 12837 * 12838 * The final possibility is that it is premature to be calling this function; 12839 * the parse needs to be restarted. This can happen when this changes from 12840 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The 12841 * latter occurs only when the fifth possibility would otherwise be in 12842 * effect, and is because one of those code points requires the pattern to be 12843 * recompiled as UTF-8. The function returns FALSE, and sets the 12844 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this 12845 * happens, the caller needs to desist from continuing parsing, and return 12846 * this information to its caller. This is not set for when there is only one 12847 * code point, as this can be called as part of an ANYOF node, and they can 12848 * store above-Latin1 code points without the pattern having to be in UTF-8. 12849 * 12850 * For non-single-quoted regexes, the tokenizer has resolved character and 12851 * sequence names inside \N{...} into their Unicode values, normalizing the 12852 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the 12853 * hex-represented code points in the sequence. This is done there because 12854 * the names can vary based on what charnames pragma is in scope at the time, 12855 * so we need a way to take a snapshot of what they resolve to at the time of 12856 * the original parse. [perl #56444]. 12857 * 12858 * That parsing is skipped for single-quoted regexes, so here we may get 12859 * '\N{NAME}', which is parsed now. If the single-quoted regex is something 12860 * like '\N{U+41}', that code point is Unicode, and has to be translated into 12861 * the native character set for non-ASCII platforms. The other possibilities 12862 * are already native, so no translation is done. */ 12863 12864 char * endbrace; /* points to '}' following the name */ 12865 char* p = RExC_parse; /* Temporary */ 12866 12867 SV * substitute_parse = NULL; 12868 char *orig_end; 12869 char *save_start; 12870 I32 flags; 12871 12872 GET_RE_DEBUG_FLAGS_DECL; 12873 12874 PERL_ARGS_ASSERT_GROK_BSLASH_N; 12875 12876 GET_RE_DEBUG_FLAGS; 12877 12878 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */ 12879 assert(! (node_p && cp_count)); /* At most 1 should be set */ 12880 12881 if (cp_count) { /* Initialize return for the most common case */ 12882 *cp_count = 1; 12883 } 12884 12885 /* The [^\n] meaning of \N ignores spaces and comments under the /x 12886 * modifier. The other meanings do not, so use a temporary until we find 12887 * out which we are being called with */ 12888 skip_to_be_ignored_text(pRExC_state, &p, 12889 FALSE /* Don't force to /x */ ); 12890 12891 /* Disambiguate between \N meaning a named character versus \N meaning 12892 * [^\n]. The latter is assumed when the {...} following the \N is a legal 12893 * quantifier, or if there is no '{' at all */ 12894 if (*p != '{' || regcurly(p)) { 12895 RExC_parse = p; 12896 if (cp_count) { 12897 *cp_count = -1; 12898 } 12899 12900 if (! node_p) { 12901 return FALSE; 12902 } 12903 12904 *node_p = reg_node(pRExC_state, REG_ANY); 12905 *flagp |= HASWIDTH|SIMPLE; 12906 MARK_NAUGHTY(1); 12907 Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */ 12908 return TRUE; 12909 } 12910 12911 /* The test above made sure that the next real character is a '{', but 12912 * under the /x modifier, it could be separated by space (or a comment and 12913 * \n) and this is not allowed (for consistency with \x{...} and the 12914 * tokenizer handling of \N{NAME}). */ 12915 if (*RExC_parse != '{') { 12916 vFAIL("Missing braces on \\N{}"); 12917 } 12918 12919 RExC_parse++; /* Skip past the '{' */ 12920 12921 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse); 12922 if (! endbrace) { /* no trailing brace */ 12923 vFAIL2("Missing right brace on \\%c{}", 'N'); 12924 } 12925 12926 /* Here, we have decided it should be a named character or sequence. These 12927 * imply Unicode semantics */ 12928 REQUIRE_UNI_RULES(flagp, FALSE); 12929 12930 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to 12931 * nothing at all (not allowed under strict) */ 12932 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') { 12933 RExC_parse = endbrace; 12934 if (strict) { 12935 RExC_parse++; /* Position after the "}" */ 12936 vFAIL("Zero length \\N{}"); 12937 } 12938 12939 if (cp_count) { 12940 *cp_count = 0; 12941 } 12942 nextchar(pRExC_state); 12943 if (! node_p) { 12944 return FALSE; 12945 } 12946 12947 *node_p = reg_node(pRExC_state, NOTHING); 12948 return TRUE; 12949 } 12950 12951 if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) { 12952 12953 /* Here, the name isn't of the form U+.... This can happen if the 12954 * pattern is single-quoted, so didn't get evaluated in toke.c. Now 12955 * is the time to find out what the name means */ 12956 12957 const STRLEN name_len = endbrace - RExC_parse; 12958 SV * value_sv; /* What does this name evaluate to */ 12959 SV ** value_svp; 12960 const U8 * value; /* string of name's value */ 12961 STRLEN value_len; /* and its length */ 12962 12963 /* RExC_unlexed_names is a hash of names that weren't evaluated by 12964 * toke.c, and their values. Make sure is initialized */ 12965 if (! RExC_unlexed_names) { 12966 RExC_unlexed_names = newHV(); 12967 } 12968 12969 /* If we have already seen this name in this pattern, use that. This 12970 * allows us to only call the charnames handler once per name per 12971 * pattern. A broken or malicious handler could return something 12972 * different each time, which could cause the results to vary depending 12973 * on if something gets added or subtracted from the pattern that 12974 * causes the number of passes to change, for example */ 12975 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse, 12976 name_len, 0))) 12977 { 12978 value_sv = *value_svp; 12979 } 12980 else { /* Otherwise we have to go out and get the name */ 12981 const char * error_msg = NULL; 12982 value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace, 12983 UTF, 12984 &error_msg); 12985 if (error_msg) { 12986 RExC_parse = endbrace; 12987 vFAIL(error_msg); 12988 } 12989 12990 /* If no error message, should have gotten a valid return */ 12991 assert (value_sv); 12992 12993 /* Save the name's meaning for later use */ 12994 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len, 12995 value_sv, 0)) 12996 { 12997 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); 12998 } 12999 } 13000 13001 /* Here, we have the value the name evaluates to in 'value_sv' */ 13002 value = (U8 *) SvPV(value_sv, value_len); 13003 13004 /* See if the result is one code point vs 0 or multiple */ 13005 if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv)) 13006 ? UTF8SKIP(value) 13007 : 1)) 13008 { 13009 /* Here, exactly one code point. If that isn't what is wanted, 13010 * fail */ 13011 if (! code_point_p) { 13012 RExC_parse = p; 13013 return FALSE; 13014 } 13015 13016 /* Convert from string to numeric code point */ 13017 *code_point_p = (SvUTF8(value_sv)) 13018 ? valid_utf8_to_uvchr(value, NULL) 13019 : *value; 13020 13021 /* Have parsed this entire single code point \N{...}. *cp_count 13022 * has already been set to 1, so don't do it again. */ 13023 RExC_parse = endbrace; 13024 nextchar(pRExC_state); 13025 return TRUE; 13026 } /* End of is a single code point */ 13027 13028 /* Count the code points, if caller desires. The API says to do this 13029 * even if we will later return FALSE */ 13030 if (cp_count) { 13031 *cp_count = 0; 13032 13033 *cp_count = (SvUTF8(value_sv)) 13034 ? utf8_length(value, value + value_len) 13035 : value_len; 13036 } 13037 13038 /* Fail if caller doesn't want to handle a multi-code-point sequence. 13039 * But don't back the pointer up if the caller wants to know how many 13040 * code points there are (they need to handle it themselves in this 13041 * case). */ 13042 if (! node_p) { 13043 if (! cp_count) { 13044 RExC_parse = p; 13045 } 13046 return FALSE; 13047 } 13048 13049 /* Convert this to a sub-pattern of the form "(?: ... )", and then call 13050 * reg recursively to parse it. That way, it retains its atomicness, 13051 * while not having to worry about any special handling that some code 13052 * points may have. */ 13053 13054 substitute_parse = newSVpvs("?:"); 13055 sv_catsv(substitute_parse, value_sv); 13056 sv_catpv(substitute_parse, ")"); 13057 13058 #ifdef EBCDIC 13059 /* The value should already be native, so no need to convert on EBCDIC 13060 * platforms.*/ 13061 assert(! RExC_recode_x_to_native); 13062 #endif 13063 13064 } 13065 else { /* \N{U+...} */ 13066 Size_t count = 0; /* code point count kept internally */ 13067 13068 /* We can get to here when the input is \N{U+...} or when toke.c has 13069 * converted a name to the \N{U+...} form. This include changing a 13070 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */ 13071 13072 RExC_parse += 2; /* Skip past the 'U+' */ 13073 13074 /* Code points are separated by dots. The '}' terminates the whole 13075 * thing. */ 13076 13077 do { /* Loop until the ending brace */ 13078 UV cp = 0; 13079 char * start_digit; /* The first of the current code point */ 13080 if (! isXDIGIT(*RExC_parse)) { 13081 RExC_parse++; 13082 vFAIL("Invalid hexadecimal number in \\N{U+...}"); 13083 } 13084 13085 start_digit = RExC_parse; 13086 count++; 13087 13088 /* Loop through the hex digits of the current code point */ 13089 do { 13090 /* Adding this digit will shift the result 4 bits. If that 13091 * result would be above the legal max, it's overflow */ 13092 if (cp > MAX_LEGAL_CP >> 4) { 13093 13094 /* Find the end of the code point */ 13095 do { 13096 RExC_parse ++; 13097 } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_'); 13098 13099 /* Be sure to synchronize this message with the similar one 13100 * in utf8.c */ 13101 vFAIL4("Use of code point 0x%.*s is not allowed; the" 13102 " permissible max is 0x%" UVxf, 13103 (int) (RExC_parse - start_digit), start_digit, 13104 MAX_LEGAL_CP); 13105 } 13106 13107 /* Accumulate this (valid) digit into the running total */ 13108 cp = (cp << 4) + READ_XDIGIT(RExC_parse); 13109 13110 /* READ_XDIGIT advanced the input pointer. Ignore a single 13111 * underscore separator */ 13112 if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) { 13113 RExC_parse++; 13114 } 13115 } while (isXDIGIT(*RExC_parse)); 13116 13117 /* Here, have accumulated the next code point */ 13118 if (RExC_parse >= endbrace) { /* If done ... */ 13119 if (count != 1) { 13120 goto do_concat; 13121 } 13122 13123 /* Here, is a single code point; fail if doesn't want that */ 13124 if (! code_point_p) { 13125 RExC_parse = p; 13126 return FALSE; 13127 } 13128 13129 /* A single code point is easy to handle; just return it */ 13130 *code_point_p = UNI_TO_NATIVE(cp); 13131 RExC_parse = endbrace; 13132 nextchar(pRExC_state); 13133 return TRUE; 13134 } 13135 13136 /* Here, the only legal thing would be a multiple character 13137 * sequence (of the form "\N{U+c1.c2. ... }". So the next 13138 * character must be a dot (and the one after that can't be the 13139 * endbrace, or we'd have something like \N{U+100.} ) */ 13140 if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) { 13141 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ 13142 ? UTF8SKIP(RExC_parse) 13143 : 1; 13144 if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */ 13145 RExC_parse = endbrace; 13146 } 13147 vFAIL("Invalid hexadecimal number in \\N{U+...}"); 13148 } 13149 13150 /* Here, looks like its really a multiple character sequence. Fail 13151 * if that's not what the caller wants. But continue with counting 13152 * and error checking if they still want a count */ 13153 if (! node_p && ! cp_count) { 13154 return FALSE; 13155 } 13156 13157 /* What is done here is to convert this to a sub-pattern of the 13158 * form \x{char1}\x{char2}... and then call reg recursively to 13159 * parse it (enclosing in "(?: ... )" ). That way, it retains its 13160 * atomicness, while not having to worry about special handling 13161 * that some code points may have. We don't create a subpattern, 13162 * but go through the motions of code point counting and error 13163 * checking, if the caller doesn't want a node returned. */ 13164 13165 if (node_p && count == 1) { 13166 substitute_parse = newSVpvs("?:"); 13167 } 13168 13169 do_concat: 13170 13171 if (node_p) { 13172 /* Convert to notation the rest of the code understands */ 13173 sv_catpvs(substitute_parse, "\\x{"); 13174 sv_catpvn(substitute_parse, start_digit, 13175 RExC_parse - start_digit); 13176 sv_catpvs(substitute_parse, "}"); 13177 } 13178 13179 /* Move to after the dot (or ending brace the final time through.) 13180 * */ 13181 RExC_parse++; 13182 count++; 13183 13184 } while (RExC_parse < endbrace); 13185 13186 if (! node_p) { /* Doesn't want the node */ 13187 assert (cp_count); 13188 13189 *cp_count = count; 13190 return FALSE; 13191 } 13192 13193 sv_catpvs(substitute_parse, ")"); 13194 13195 #ifdef EBCDIC 13196 /* The values are Unicode, and therefore have to be converted to native 13197 * on a non-Unicode (meaning non-ASCII) platform. */ 13198 RExC_recode_x_to_native = 1; 13199 #endif 13200 13201 } 13202 13203 /* Here, we have the string the name evaluates to, ready to be parsed, 13204 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}" 13205 * constructs. This can be called from within a substitute parse already. 13206 * The error reporting mechanism doesn't work for 2 levels of this, but the 13207 * code above has validated this new construct, so there should be no 13208 * errors generated by the below. And this isn' an exact copy, so the 13209 * mechanism to seamlessly deal with this won't work, so turn off warnings 13210 * during it */ 13211 save_start = RExC_start; 13212 orig_end = RExC_end; 13213 13214 RExC_parse = RExC_start = SvPVX(substitute_parse); 13215 RExC_end = RExC_parse + SvCUR(substitute_parse); 13216 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE; 13217 13218 *node_p = reg(pRExC_state, 1, &flags, depth+1); 13219 13220 /* Restore the saved values */ 13221 RESTORE_WARNINGS; 13222 RExC_start = save_start; 13223 RExC_parse = endbrace; 13224 RExC_end = orig_end; 13225 #ifdef EBCDIC 13226 RExC_recode_x_to_native = 0; 13227 #endif 13228 13229 SvREFCNT_dec_NN(substitute_parse); 13230 13231 if (! *node_p) { 13232 RETURN_FAIL_ON_RESTART(flags, flagp); 13233 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf, 13234 (UV) flags); 13235 } 13236 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); 13237 13238 nextchar(pRExC_state); 13239 13240 return TRUE; 13241 } 13242 13243 13244 PERL_STATIC_INLINE U8 13245 S_compute_EXACTish(RExC_state_t *pRExC_state) 13246 { 13247 U8 op; 13248 13249 PERL_ARGS_ASSERT_COMPUTE_EXACTISH; 13250 13251 if (! FOLD) { 13252 return (LOC) 13253 ? EXACTL 13254 : EXACT; 13255 } 13256 13257 op = get_regex_charset(RExC_flags); 13258 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { 13259 op--; /* /a is same as /u, and map /aa's offset to what /a's would have 13260 been, so there is no hole */ 13261 } 13262 13263 return op + EXACTF; 13264 } 13265 13266 STATIC bool 13267 S_new_regcurly(const char *s, const char *e) 13268 { 13269 /* This is a temporary function designed to match the most lenient form of 13270 * a {m,n} quantifier we ever envision, with either number omitted, and 13271 * spaces anywhere between/before/after them. 13272 * 13273 * If this function fails, then the string it matches is very unlikely to 13274 * ever be considered a valid quantifier, so we can allow the '{' that 13275 * begins it to be considered as a literal */ 13276 13277 bool has_min = FALSE; 13278 bool has_max = FALSE; 13279 13280 PERL_ARGS_ASSERT_NEW_REGCURLY; 13281 13282 if (s >= e || *s++ != '{') 13283 return FALSE; 13284 13285 while (s < e && isSPACE(*s)) { 13286 s++; 13287 } 13288 while (s < e && isDIGIT(*s)) { 13289 has_min = TRUE; 13290 s++; 13291 } 13292 while (s < e && isSPACE(*s)) { 13293 s++; 13294 } 13295 13296 if (*s == ',') { 13297 s++; 13298 while (s < e && isSPACE(*s)) { 13299 s++; 13300 } 13301 while (s < e && isDIGIT(*s)) { 13302 has_max = TRUE; 13303 s++; 13304 } 13305 while (s < e && isSPACE(*s)) { 13306 s++; 13307 } 13308 } 13309 13310 return s < e && *s == '}' && (has_min || has_max); 13311 } 13312 13313 /* Parse backref decimal value, unless it's too big to sensibly be a backref, 13314 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ 13315 13316 static I32 13317 S_backref_value(char *p, char *e) 13318 { 13319 const char* endptr = e; 13320 UV val; 13321 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX) 13322 return (I32)val; 13323 return I32_MAX; 13324 } 13325 13326 13327 /* 13328 - regatom - the lowest level 13329 13330 Try to identify anything special at the start of the current parse position. 13331 If there is, then handle it as required. This may involve generating a 13332 single regop, such as for an assertion; or it may involve recursing, such as 13333 to handle a () structure. 13334 13335 If the string doesn't start with something special then we gobble up 13336 as much literal text as we can. If we encounter a quantifier, we have to 13337 back off the final literal character, as that quantifier applies to just it 13338 and not to the whole string of literals. 13339 13340 Once we have been able to handle whatever type of thing started the 13341 sequence, we return the offset into the regex engine program being compiled 13342 at which any next regnode should be placed. 13343 13344 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN. 13345 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be 13346 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 13347 Otherwise does not return 0. 13348 13349 Note: we have to be careful with escapes, as they can be both literal 13350 and special, and in the case of \10 and friends, context determines which. 13351 13352 A summary of the code structure is: 13353 13354 switch (first_byte) { 13355 cases for each special: 13356 handle this special; 13357 break; 13358 case '\\': 13359 switch (2nd byte) { 13360 cases for each unambiguous special: 13361 handle this special; 13362 break; 13363 cases for each ambigous special/literal: 13364 disambiguate; 13365 if (special) handle here 13366 else goto defchar; 13367 default: // unambiguously literal: 13368 goto defchar; 13369 } 13370 default: // is a literal char 13371 // FALL THROUGH 13372 defchar: 13373 create EXACTish node for literal; 13374 while (more input and node isn't full) { 13375 switch (input_byte) { 13376 cases for each special; 13377 make sure parse pointer is set so that the next call to 13378 regatom will see this special first 13379 goto loopdone; // EXACTish node terminated by prev. char 13380 default: 13381 append char to EXACTISH node; 13382 } 13383 get next input byte; 13384 } 13385 loopdone: 13386 } 13387 return the generated node; 13388 13389 Specifically there are two separate switches for handling 13390 escape sequences, with the one for handling literal escapes requiring 13391 a dummy entry for all of the special escapes that are actually handled 13392 by the other. 13393 13394 */ 13395 13396 STATIC regnode_offset 13397 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 13398 { 13399 dVAR; 13400 regnode_offset ret = 0; 13401 I32 flags = 0; 13402 char *parse_start; 13403 U8 op; 13404 int invert = 0; 13405 U8 arg; 13406 13407 GET_RE_DEBUG_FLAGS_DECL; 13408 13409 *flagp = WORST; /* Tentatively. */ 13410 13411 DEBUG_PARSE("atom"); 13412 13413 PERL_ARGS_ASSERT_REGATOM; 13414 13415 tryagain: 13416 parse_start = RExC_parse; 13417 assert(RExC_parse < RExC_end); 13418 switch ((U8)*RExC_parse) { 13419 case '^': 13420 RExC_seen_zerolen++; 13421 nextchar(pRExC_state); 13422 if (RExC_flags & RXf_PMf_MULTILINE) 13423 ret = reg_node(pRExC_state, MBOL); 13424 else 13425 ret = reg_node(pRExC_state, SBOL); 13426 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ 13427 break; 13428 case '$': 13429 nextchar(pRExC_state); 13430 if (*RExC_parse) 13431 RExC_seen_zerolen++; 13432 if (RExC_flags & RXf_PMf_MULTILINE) 13433 ret = reg_node(pRExC_state, MEOL); 13434 else 13435 ret = reg_node(pRExC_state, SEOL); 13436 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ 13437 break; 13438 case '.': 13439 nextchar(pRExC_state); 13440 if (RExC_flags & RXf_PMf_SINGLELINE) 13441 ret = reg_node(pRExC_state, SANY); 13442 else 13443 ret = reg_node(pRExC_state, REG_ANY); 13444 *flagp |= HASWIDTH|SIMPLE; 13445 MARK_NAUGHTY(1); 13446 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ 13447 break; 13448 case '[': 13449 { 13450 char * const oregcomp_parse = ++RExC_parse; 13451 ret = regclass(pRExC_state, flagp, depth+1, 13452 FALSE, /* means parse the whole char class */ 13453 TRUE, /* allow multi-char folds */ 13454 FALSE, /* don't silence non-portable warnings. */ 13455 (bool) RExC_strict, 13456 TRUE, /* Allow an optimized regnode result */ 13457 NULL); 13458 if (ret == 0) { 13459 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 13460 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf, 13461 (UV) *flagp); 13462 } 13463 if (*RExC_parse != ']') { 13464 RExC_parse = oregcomp_parse; 13465 vFAIL("Unmatched ["); 13466 } 13467 nextchar(pRExC_state); 13468 Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */ 13469 break; 13470 } 13471 case '(': 13472 nextchar(pRExC_state); 13473 ret = reg(pRExC_state, 2, &flags, depth+1); 13474 if (ret == 0) { 13475 if (flags & TRYAGAIN) { 13476 if (RExC_parse >= RExC_end) { 13477 /* Make parent create an empty node if needed. */ 13478 *flagp |= TRYAGAIN; 13479 return(0); 13480 } 13481 goto tryagain; 13482 } 13483 RETURN_FAIL_ON_RESTART(flags, flagp); 13484 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf, 13485 (UV) flags); 13486 } 13487 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); 13488 break; 13489 case '|': 13490 case ')': 13491 if (flags & TRYAGAIN) { 13492 *flagp |= TRYAGAIN; 13493 return 0; 13494 } 13495 vFAIL("Internal urp"); 13496 /* Supposed to be caught earlier. */ 13497 break; 13498 case '?': 13499 case '+': 13500 case '*': 13501 RExC_parse++; 13502 vFAIL("Quantifier follows nothing"); 13503 break; 13504 case '\\': 13505 /* Special Escapes 13506 13507 This switch handles escape sequences that resolve to some kind 13508 of special regop and not to literal text. Escape sequences that 13509 resolve to literal text are handled below in the switch marked 13510 "Literal Escapes". 13511 13512 Every entry in this switch *must* have a corresponding entry 13513 in the literal escape switch. However, the opposite is not 13514 required, as the default for this switch is to jump to the 13515 literal text handling code. 13516 */ 13517 RExC_parse++; 13518 switch ((U8)*RExC_parse) { 13519 /* Special Escapes */ 13520 case 'A': 13521 RExC_seen_zerolen++; 13522 ret = reg_node(pRExC_state, SBOL); 13523 /* SBOL is shared with /^/ so we set the flags so we can tell 13524 * /\A/ from /^/ in split. */ 13525 FLAGS(REGNODE_p(ret)) = 1; 13526 *flagp |= SIMPLE; 13527 goto finish_meta_pat; 13528 case 'G': 13529 ret = reg_node(pRExC_state, GPOS); 13530 RExC_seen |= REG_GPOS_SEEN; 13531 *flagp |= SIMPLE; 13532 goto finish_meta_pat; 13533 case 'K': 13534 RExC_seen_zerolen++; 13535 ret = reg_node(pRExC_state, KEEPS); 13536 *flagp |= SIMPLE; 13537 /* XXX:dmq : disabling in-place substitution seems to 13538 * be necessary here to avoid cases of memory corruption, as 13539 * with: C<$_="x" x 80; s/x\K/y/> -- rgs 13540 */ 13541 RExC_seen |= REG_LOOKBEHIND_SEEN; 13542 goto finish_meta_pat; 13543 case 'Z': 13544 ret = reg_node(pRExC_state, SEOL); 13545 *flagp |= SIMPLE; 13546 RExC_seen_zerolen++; /* Do not optimize RE away */ 13547 goto finish_meta_pat; 13548 case 'z': 13549 ret = reg_node(pRExC_state, EOS); 13550 *flagp |= SIMPLE; 13551 RExC_seen_zerolen++; /* Do not optimize RE away */ 13552 goto finish_meta_pat; 13553 case 'C': 13554 vFAIL("\\C no longer supported"); 13555 case 'X': 13556 ret = reg_node(pRExC_state, CLUMP); 13557 *flagp |= HASWIDTH; 13558 goto finish_meta_pat; 13559 13560 case 'W': 13561 invert = 1; 13562 /* FALLTHROUGH */ 13563 case 'w': 13564 arg = ANYOF_WORDCHAR; 13565 goto join_posix; 13566 13567 case 'B': 13568 invert = 1; 13569 /* FALLTHROUGH */ 13570 case 'b': 13571 { 13572 U8 flags = 0; 13573 regex_charset charset = get_regex_charset(RExC_flags); 13574 13575 RExC_seen_zerolen++; 13576 RExC_seen |= REG_LOOKBEHIND_SEEN; 13577 op = BOUND + charset; 13578 13579 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') { 13580 flags = TRADITIONAL_BOUND; 13581 if (op > BOUNDA) { /* /aa is same as /a */ 13582 op = BOUNDA; 13583 } 13584 } 13585 else { 13586 STRLEN length; 13587 char name = *RExC_parse; 13588 char * endbrace = NULL; 13589 RExC_parse += 2; 13590 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse); 13591 13592 if (! endbrace) { 13593 vFAIL2("Missing right brace on \\%c{}", name); 13594 } 13595 /* XXX Need to decide whether to take spaces or not. Should be 13596 * consistent with \p{}, but that currently is SPACE, which 13597 * means vertical too, which seems wrong 13598 * while (isBLANK(*RExC_parse)) { 13599 RExC_parse++; 13600 }*/ 13601 if (endbrace == RExC_parse) { 13602 RExC_parse++; /* After the '}' */ 13603 vFAIL2("Empty \\%c{}", name); 13604 } 13605 length = endbrace - RExC_parse; 13606 /*while (isBLANK(*(RExC_parse + length - 1))) { 13607 length--; 13608 }*/ 13609 switch (*RExC_parse) { 13610 case 'g': 13611 if ( length != 1 13612 && (memNEs(RExC_parse + 1, length - 1, "cb"))) 13613 { 13614 goto bad_bound_type; 13615 } 13616 flags = GCB_BOUND; 13617 break; 13618 case 'l': 13619 if (length != 2 || *(RExC_parse + 1) != 'b') { 13620 goto bad_bound_type; 13621 } 13622 flags = LB_BOUND; 13623 break; 13624 case 's': 13625 if (length != 2 || *(RExC_parse + 1) != 'b') { 13626 goto bad_bound_type; 13627 } 13628 flags = SB_BOUND; 13629 break; 13630 case 'w': 13631 if (length != 2 || *(RExC_parse + 1) != 'b') { 13632 goto bad_bound_type; 13633 } 13634 flags = WB_BOUND; 13635 break; 13636 default: 13637 bad_bound_type: 13638 RExC_parse = endbrace; 13639 vFAIL2utf8f( 13640 "'%" UTF8f "' is an unknown bound type", 13641 UTF8fARG(UTF, length, endbrace - length)); 13642 NOT_REACHED; /*NOTREACHED*/ 13643 } 13644 RExC_parse = endbrace; 13645 REQUIRE_UNI_RULES(flagp, 0); 13646 13647 if (op == BOUND) { 13648 op = BOUNDU; 13649 } 13650 else if (op >= BOUNDA) { /* /aa is same as /a */ 13651 op = BOUNDU; 13652 length += 4; 13653 13654 /* Don't have to worry about UTF-8, in this message because 13655 * to get here the contents of the \b must be ASCII */ 13656 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */ 13657 "Using /u for '%.*s' instead of /%s", 13658 (unsigned) length, 13659 endbrace - length + 1, 13660 (charset == REGEX_ASCII_RESTRICTED_CHARSET) 13661 ? ASCII_RESTRICT_PAT_MODS 13662 : ASCII_MORE_RESTRICT_PAT_MODS); 13663 } 13664 } 13665 13666 if (op == BOUND) { 13667 RExC_seen_d_op = TRUE; 13668 } 13669 else if (op == BOUNDL) { 13670 RExC_contains_locale = 1; 13671 } 13672 13673 if (invert) { 13674 op += NBOUND - BOUND; 13675 } 13676 13677 ret = reg_node(pRExC_state, op); 13678 FLAGS(REGNODE_p(ret)) = flags; 13679 13680 *flagp |= SIMPLE; 13681 13682 goto finish_meta_pat; 13683 } 13684 13685 case 'D': 13686 invert = 1; 13687 /* FALLTHROUGH */ 13688 case 'd': 13689 arg = ANYOF_DIGIT; 13690 if (! DEPENDS_SEMANTICS) { 13691 goto join_posix; 13692 } 13693 13694 /* \d doesn't have any matches in the upper Latin1 range, hence /d 13695 * is equivalent to /u. Changing to /u saves some branches at 13696 * runtime */ 13697 op = POSIXU; 13698 goto join_posix_op_known; 13699 13700 case 'R': 13701 ret = reg_node(pRExC_state, LNBREAK); 13702 *flagp |= HASWIDTH|SIMPLE; 13703 goto finish_meta_pat; 13704 13705 case 'H': 13706 invert = 1; 13707 /* FALLTHROUGH */ 13708 case 'h': 13709 arg = ANYOF_BLANK; 13710 op = POSIXU; 13711 goto join_posix_op_known; 13712 13713 case 'V': 13714 invert = 1; 13715 /* FALLTHROUGH */ 13716 case 'v': 13717 arg = ANYOF_VERTWS; 13718 op = POSIXU; 13719 goto join_posix_op_known; 13720 13721 case 'S': 13722 invert = 1; 13723 /* FALLTHROUGH */ 13724 case 's': 13725 arg = ANYOF_SPACE; 13726 13727 join_posix: 13728 13729 op = POSIXD + get_regex_charset(RExC_flags); 13730 if (op > POSIXA) { /* /aa is same as /a */ 13731 op = POSIXA; 13732 } 13733 else if (op == POSIXL) { 13734 RExC_contains_locale = 1; 13735 } 13736 else if (op == POSIXD) { 13737 RExC_seen_d_op = TRUE; 13738 } 13739 13740 join_posix_op_known: 13741 13742 if (invert) { 13743 op += NPOSIXD - POSIXD; 13744 } 13745 13746 ret = reg_node(pRExC_state, op); 13747 FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg); 13748 13749 *flagp |= HASWIDTH|SIMPLE; 13750 /* FALLTHROUGH */ 13751 13752 finish_meta_pat: 13753 if ( UCHARAT(RExC_parse + 1) == '{' 13754 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end))) 13755 { 13756 RExC_parse += 2; 13757 vFAIL("Unescaped left brace in regex is illegal here"); 13758 } 13759 nextchar(pRExC_state); 13760 Set_Node_Length(REGNODE_p(ret), 2); /* MJD */ 13761 break; 13762 case 'p': 13763 case 'P': 13764 RExC_parse--; 13765 13766 ret = regclass(pRExC_state, flagp, depth+1, 13767 TRUE, /* means just parse this element */ 13768 FALSE, /* don't allow multi-char folds */ 13769 FALSE, /* don't silence non-portable warnings. It 13770 would be a bug if these returned 13771 non-portables */ 13772 (bool) RExC_strict, 13773 TRUE, /* Allow an optimized regnode result */ 13774 NULL); 13775 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 13776 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if 13777 * multi-char folds are allowed. */ 13778 if (!ret) 13779 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf, 13780 (UV) *flagp); 13781 13782 RExC_parse--; 13783 13784 Set_Node_Offset(REGNODE_p(ret), parse_start); 13785 Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2); 13786 nextchar(pRExC_state); 13787 break; 13788 case 'N': 13789 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the 13790 * \N{...} evaluates to a sequence of more than one code points). 13791 * The function call below returns a regnode, which is our result. 13792 * The parameters cause it to fail if the \N{} evaluates to a 13793 * single code point; we handle those like any other literal. The 13794 * reason that the multicharacter case is handled here and not as 13795 * part of the EXACtish code is because of quantifiers. In 13796 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it 13797 * this way makes that Just Happen. dmq. 13798 * join_exact() will join this up with adjacent EXACTish nodes 13799 * later on, if appropriate. */ 13800 ++RExC_parse; 13801 if (grok_bslash_N(pRExC_state, 13802 &ret, /* Want a regnode returned */ 13803 NULL, /* Fail if evaluates to a single code 13804 point */ 13805 NULL, /* Don't need a count of how many code 13806 points */ 13807 flagp, 13808 RExC_strict, 13809 depth) 13810 ) { 13811 break; 13812 } 13813 13814 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 13815 13816 /* Here, evaluates to a single code point. Go get that */ 13817 RExC_parse = parse_start; 13818 goto defchar; 13819 13820 case 'k': /* Handle \k<NAME> and \k'NAME' */ 13821 parse_named_seq: 13822 { 13823 char ch; 13824 if ( RExC_parse >= RExC_end - 1 13825 || (( ch = RExC_parse[1]) != '<' 13826 && ch != '\'' 13827 && ch != '{')) 13828 { 13829 RExC_parse++; 13830 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ 13831 vFAIL2("Sequence %.2s... not terminated", parse_start); 13832 } else { 13833 RExC_parse += 2; 13834 ret = handle_named_backref(pRExC_state, 13835 flagp, 13836 parse_start, 13837 (ch == '<') 13838 ? '>' 13839 : (ch == '{') 13840 ? '}' 13841 : '\''); 13842 } 13843 break; 13844 } 13845 case 'g': 13846 case '1': case '2': case '3': case '4': 13847 case '5': case '6': case '7': case '8': case '9': 13848 { 13849 I32 num; 13850 bool hasbrace = 0; 13851 13852 if (*RExC_parse == 'g') { 13853 bool isrel = 0; 13854 13855 RExC_parse++; 13856 if (*RExC_parse == '{') { 13857 RExC_parse++; 13858 hasbrace = 1; 13859 } 13860 if (*RExC_parse == '-') { 13861 RExC_parse++; 13862 isrel = 1; 13863 } 13864 if (hasbrace && !isDIGIT(*RExC_parse)) { 13865 if (isrel) RExC_parse--; 13866 RExC_parse -= 2; 13867 goto parse_named_seq; 13868 } 13869 13870 if (RExC_parse >= RExC_end) { 13871 goto unterminated_g; 13872 } 13873 num = S_backref_value(RExC_parse, RExC_end); 13874 if (num == 0) 13875 vFAIL("Reference to invalid group 0"); 13876 else if (num == I32_MAX) { 13877 if (isDIGIT(*RExC_parse)) 13878 vFAIL("Reference to nonexistent group"); 13879 else 13880 unterminated_g: 13881 vFAIL("Unterminated \\g... pattern"); 13882 } 13883 13884 if (isrel) { 13885 num = RExC_npar - num; 13886 if (num < 1) 13887 vFAIL("Reference to nonexistent or unclosed group"); 13888 } 13889 } 13890 else { 13891 num = S_backref_value(RExC_parse, RExC_end); 13892 /* bare \NNN might be backref or octal - if it is larger 13893 * than or equal RExC_npar then it is assumed to be an 13894 * octal escape. Note RExC_npar is +1 from the actual 13895 * number of parens. */ 13896 /* Note we do NOT check if num == I32_MAX here, as that is 13897 * handled by the RExC_npar check */ 13898 13899 if ( 13900 /* any numeric escape < 10 is always a backref */ 13901 num > 9 13902 /* any numeric escape < RExC_npar is a backref */ 13903 && num >= RExC_npar 13904 /* cannot be an octal escape if it starts with 8 */ 13905 && *RExC_parse != '8' 13906 /* cannot be an octal escape it it starts with 9 */ 13907 && *RExC_parse != '9' 13908 ) { 13909 /* Probably not meant to be a backref, instead likely 13910 * to be an octal character escape, e.g. \35 or \777. 13911 * The above logic should make it obvious why using 13912 * octal escapes in patterns is problematic. - Yves */ 13913 RExC_parse = parse_start; 13914 goto defchar; 13915 } 13916 } 13917 13918 /* At this point RExC_parse points at a numeric escape like 13919 * \12 or \88 or something similar, which we should NOT treat 13920 * as an octal escape. It may or may not be a valid backref 13921 * escape. For instance \88888888 is unlikely to be a valid 13922 * backref. */ 13923 while (isDIGIT(*RExC_parse)) 13924 RExC_parse++; 13925 if (hasbrace) { 13926 if (*RExC_parse != '}') 13927 vFAIL("Unterminated \\g{...} pattern"); 13928 RExC_parse++; 13929 } 13930 if (num >= (I32)RExC_npar) { 13931 13932 /* It might be a forward reference; we can't fail until we 13933 * know, by completing the parse to get all the groups, and 13934 * then reparsing */ 13935 if (ALL_PARENS_COUNTED) { 13936 if (num >= RExC_total_parens) { 13937 vFAIL("Reference to nonexistent group"); 13938 } 13939 } 13940 else { 13941 REQUIRE_PARENS_PASS; 13942 } 13943 } 13944 RExC_sawback = 1; 13945 ret = reganode(pRExC_state, 13946 ((! FOLD) 13947 ? REF 13948 : (ASCII_FOLD_RESTRICTED) 13949 ? REFFA 13950 : (AT_LEAST_UNI_SEMANTICS) 13951 ? REFFU 13952 : (LOC) 13953 ? REFFL 13954 : REFF), 13955 num); 13956 if (OP(REGNODE_p(ret)) == REFF) { 13957 RExC_seen_d_op = TRUE; 13958 } 13959 *flagp |= HASWIDTH; 13960 13961 /* override incorrect value set in reganode MJD */ 13962 Set_Node_Offset(REGNODE_p(ret), parse_start); 13963 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1); 13964 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 13965 FALSE /* Don't force to /x */ ); 13966 } 13967 break; 13968 case '\0': 13969 if (RExC_parse >= RExC_end) 13970 FAIL("Trailing \\"); 13971 /* FALLTHROUGH */ 13972 default: 13973 /* Do not generate "unrecognized" warnings here, we fall 13974 back into the quick-grab loop below */ 13975 RExC_parse = parse_start; 13976 goto defchar; 13977 } /* end of switch on a \foo sequence */ 13978 break; 13979 13980 case '#': 13981 13982 /* '#' comments should have been spaced over before this function was 13983 * called */ 13984 assert((RExC_flags & RXf_PMf_EXTENDED) == 0); 13985 /* 13986 if (RExC_flags & RXf_PMf_EXTENDED) { 13987 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); 13988 if (RExC_parse < RExC_end) 13989 goto tryagain; 13990 } 13991 */ 13992 13993 /* FALLTHROUGH */ 13994 13995 default: 13996 defchar: { 13997 13998 /* Here, we have determined that the next thing is probably a 13999 * literal character. RExC_parse points to the first byte of its 14000 * definition. (It still may be an escape sequence that evaluates 14001 * to a single character) */ 14002 14003 STRLEN len = 0; 14004 UV ender = 0; 14005 char *p; 14006 char *s; 14007 14008 /* This allows us to fill a node with just enough spare so that if the final 14009 * character folds, its expansion is guaranteed to fit */ 14010 #define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE) 14011 14012 char *s0; 14013 U8 upper_parse = MAX_NODE_STRING_SIZE; 14014 14015 /* We start out as an EXACT node, even if under /i, until we find a 14016 * character which is in a fold. The algorithm now segregates into 14017 * separate nodes, characters that fold from those that don't under 14018 * /i. (This hopefully will create nodes that are fixed strings 14019 * even under /i, giving the optimizer something to grab on to.) 14020 * So, if a node has something in it and the next character is in 14021 * the opposite category, that node is closed up, and the function 14022 * returns. Then regatom is called again, and a new node is 14023 * created for the new category. */ 14024 U8 node_type = EXACT; 14025 14026 /* Assume the node will be fully used; the excess is given back at 14027 * the end. We can't make any other length assumptions, as a byte 14028 * input sequence could shrink down. */ 14029 Ptrdiff_t initial_size = STR_SZ(256); 14030 14031 bool next_is_quantifier; 14032 char * oldp = NULL; 14033 14034 /* We can convert EXACTF nodes to EXACTFU if they contain only 14035 * characters that match identically regardless of the target 14036 * string's UTF8ness. The reason to do this is that EXACTF is not 14037 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at 14038 * runtime. 14039 * 14040 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they 14041 * contain only above-Latin1 characters (hence must be in UTF8), 14042 * which don't participate in folds with Latin1-range characters, 14043 * as the latter's folds aren't known until runtime. */ 14044 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC); 14045 14046 /* Single-character EXACTish nodes are almost always SIMPLE. This 14047 * allows us to override this as encountered */ 14048 U8 maybe_SIMPLE = SIMPLE; 14049 14050 /* Does this node contain something that can't match unless the 14051 * target string is (also) in UTF-8 */ 14052 bool requires_utf8_target = FALSE; 14053 14054 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */ 14055 bool has_ss = FALSE; 14056 14057 /* So is the MICRO SIGN */ 14058 bool has_micro_sign = FALSE; 14059 14060 /* Allocate an EXACT node. The node_type may change below to 14061 * another EXACTish node, but since the size of the node doesn't 14062 * change, it works */ 14063 ret = regnode_guts(pRExC_state, node_type, initial_size, "exact"); 14064 FILL_NODE(ret, node_type); 14065 RExC_emit++; 14066 14067 s = STRING(REGNODE_p(ret)); 14068 14069 s0 = s; 14070 14071 reparse: 14072 14073 /* This breaks under rare circumstances. If folding, we do not 14074 * want to split a node at a character that is a non-final in a 14075 * multi-char fold, as an input string could just happen to want to 14076 * match across the node boundary. The code at the end of the loop 14077 * looks for this, and backs off until it finds not such a 14078 * character, but it is possible (though extremely, extremely 14079 * unlikely) for all characters in the node to be non-final fold 14080 * ones, in which case we just leave the node fully filled, and 14081 * hope that it doesn't match the string in just the wrong place */ 14082 14083 assert( ! UTF /* Is at the beginning of a character */ 14084 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse)) 14085 || UTF8_IS_START(UCHARAT(RExC_parse))); 14086 14087 /* Here, we have a literal character. Find the maximal string of 14088 * them in the input that we can fit into a single EXACTish node. 14089 * We quit at the first non-literal or when the node gets full, or 14090 * under /i the categorization of folding/non-folding character 14091 * changes */ 14092 for (p = RExC_parse; len < upper_parse && p < RExC_end; ) { 14093 14094 /* In most cases each iteration adds one byte to the output. 14095 * The exceptions override this */ 14096 Size_t added_len = 1; 14097 14098 oldp = p; 14099 14100 /* White space has already been ignored */ 14101 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0 14102 || ! is_PATWS_safe((p), RExC_end, UTF)); 14103 14104 switch ((U8)*p) { 14105 case '^': 14106 case '$': 14107 case '.': 14108 case '[': 14109 case '(': 14110 case ')': 14111 case '|': 14112 goto loopdone; 14113 case '\\': 14114 /* Literal Escapes Switch 14115 14116 This switch is meant to handle escape sequences that 14117 resolve to a literal character. 14118 14119 Every escape sequence that represents something 14120 else, like an assertion or a char class, is handled 14121 in the switch marked 'Special Escapes' above in this 14122 routine, but also has an entry here as anything that 14123 isn't explicitly mentioned here will be treated as 14124 an unescaped equivalent literal. 14125 */ 14126 14127 switch ((U8)*++p) { 14128 14129 /* These are all the special escapes. */ 14130 case 'A': /* Start assertion */ 14131 case 'b': case 'B': /* Word-boundary assertion*/ 14132 case 'C': /* Single char !DANGEROUS! */ 14133 case 'd': case 'D': /* digit class */ 14134 case 'g': case 'G': /* generic-backref, pos assertion */ 14135 case 'h': case 'H': /* HORIZWS */ 14136 case 'k': case 'K': /* named backref, keep marker */ 14137 case 'p': case 'P': /* Unicode property */ 14138 case 'R': /* LNBREAK */ 14139 case 's': case 'S': /* space class */ 14140 case 'v': case 'V': /* VERTWS */ 14141 case 'w': case 'W': /* word class */ 14142 case 'X': /* eXtended Unicode "combining 14143 character sequence" */ 14144 case 'z': case 'Z': /* End of line/string assertion */ 14145 --p; 14146 goto loopdone; 14147 14148 /* Anything after here is an escape that resolves to a 14149 literal. (Except digits, which may or may not) 14150 */ 14151 case 'n': 14152 ender = '\n'; 14153 p++; 14154 break; 14155 case 'N': /* Handle a single-code point named character. */ 14156 RExC_parse = p + 1; 14157 if (! grok_bslash_N(pRExC_state, 14158 NULL, /* Fail if evaluates to 14159 anything other than a 14160 single code point */ 14161 &ender, /* The returned single code 14162 point */ 14163 NULL, /* Don't need a count of 14164 how many code points */ 14165 flagp, 14166 RExC_strict, 14167 depth) 14168 ) { 14169 if (*flagp & NEED_UTF8) 14170 FAIL("panic: grok_bslash_N set NEED_UTF8"); 14171 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 14172 14173 /* Here, it wasn't a single code point. Go close 14174 * up this EXACTish node. The switch() prior to 14175 * this switch handles the other cases */ 14176 RExC_parse = p = oldp; 14177 goto loopdone; 14178 } 14179 p = RExC_parse; 14180 RExC_parse = parse_start; 14181 14182 /* The \N{} means the pattern, if previously /d, 14183 * becomes /u. That means it can't be an EXACTF node, 14184 * but an EXACTFU */ 14185 if (node_type == EXACTF) { 14186 node_type = EXACTFU; 14187 14188 /* If the node already contains something that 14189 * differs between EXACTF and EXACTFU, reparse it 14190 * as EXACTFU */ 14191 if (! maybe_exactfu) { 14192 len = 0; 14193 s = s0; 14194 goto reparse; 14195 } 14196 } 14197 14198 break; 14199 case 'r': 14200 ender = '\r'; 14201 p++; 14202 break; 14203 case 't': 14204 ender = '\t'; 14205 p++; 14206 break; 14207 case 'f': 14208 ender = '\f'; 14209 p++; 14210 break; 14211 case 'e': 14212 ender = ESC_NATIVE; 14213 p++; 14214 break; 14215 case 'a': 14216 ender = '\a'; 14217 p++; 14218 break; 14219 case 'o': 14220 { 14221 UV result; 14222 const char* error_msg; 14223 14224 bool valid = grok_bslash_o(&p, 14225 RExC_end, 14226 &result, 14227 &error_msg, 14228 TO_OUTPUT_WARNINGS(p), 14229 (bool) RExC_strict, 14230 TRUE, /* Output warnings 14231 for non- 14232 portables */ 14233 UTF); 14234 if (! valid) { 14235 RExC_parse = p; /* going to die anyway; point 14236 to exact spot of failure */ 14237 vFAIL(error_msg); 14238 } 14239 UPDATE_WARNINGS_LOC(p - 1); 14240 ender = result; 14241 break; 14242 } 14243 case 'x': 14244 { 14245 UV result = UV_MAX; /* initialize to erroneous 14246 value */ 14247 const char* error_msg; 14248 14249 bool valid = grok_bslash_x(&p, 14250 RExC_end, 14251 &result, 14252 &error_msg, 14253 TO_OUTPUT_WARNINGS(p), 14254 (bool) RExC_strict, 14255 TRUE, /* Silence warnings 14256 for non- 14257 portables */ 14258 UTF); 14259 if (! valid) { 14260 RExC_parse = p; /* going to die anyway; point 14261 to exact spot of failure */ 14262 vFAIL(error_msg); 14263 } 14264 UPDATE_WARNINGS_LOC(p - 1); 14265 ender = result; 14266 14267 if (ender < 0x100) { 14268 #ifdef EBCDIC 14269 if (RExC_recode_x_to_native) { 14270 ender = LATIN1_TO_NATIVE(ender); 14271 } 14272 #endif 14273 } 14274 break; 14275 } 14276 case 'c': 14277 p++; 14278 ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p)); 14279 UPDATE_WARNINGS_LOC(p); 14280 p++; 14281 break; 14282 case '8': case '9': /* must be a backreference */ 14283 --p; 14284 /* we have an escape like \8 which cannot be an octal escape 14285 * so we exit the loop, and let the outer loop handle this 14286 * escape which may or may not be a legitimate backref. */ 14287 goto loopdone; 14288 case '1': case '2': case '3':case '4': 14289 case '5': case '6': case '7': 14290 /* When we parse backslash escapes there is ambiguity 14291 * between backreferences and octal escapes. Any escape 14292 * from \1 - \9 is a backreference, any multi-digit 14293 * escape which does not start with 0 and which when 14294 * evaluated as decimal could refer to an already 14295 * parsed capture buffer is a back reference. Anything 14296 * else is octal. 14297 * 14298 * Note this implies that \118 could be interpreted as 14299 * 118 OR as "\11" . "8" depending on whether there 14300 * were 118 capture buffers defined already in the 14301 * pattern. */ 14302 14303 /* NOTE, RExC_npar is 1 more than the actual number of 14304 * parens we have seen so far, hence the "<" as opposed 14305 * to "<=" */ 14306 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar) 14307 { /* Not to be treated as an octal constant, go 14308 find backref */ 14309 --p; 14310 goto loopdone; 14311 } 14312 /* FALLTHROUGH */ 14313 case '0': 14314 { 14315 I32 flags = PERL_SCAN_SILENT_ILLDIGIT; 14316 STRLEN numlen = 3; 14317 ender = grok_oct(p, &numlen, &flags, NULL); 14318 p += numlen; 14319 if ( isDIGIT(*p) /* like \08, \178 */ 14320 && ckWARN(WARN_REGEXP) 14321 && numlen < 3) 14322 { 14323 reg_warn_non_literal_string( 14324 p + 1, 14325 form_short_octal_warning(p, numlen)); 14326 } 14327 } 14328 break; 14329 case '\0': 14330 if (p >= RExC_end) 14331 FAIL("Trailing \\"); 14332 /* FALLTHROUGH */ 14333 default: 14334 if (isALPHANUMERIC(*p)) { 14335 /* An alpha followed by '{' is going to fail next 14336 * iteration, so don't output this warning in that 14337 * case */ 14338 if (! isALPHA(*p) || *(p + 1) != '{') { 14339 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s" 14340 " passed through", p); 14341 } 14342 } 14343 goto normal_default; 14344 } /* End of switch on '\' */ 14345 break; 14346 case '{': 14347 /* Trying to gain new uses for '{' without breaking too 14348 * much existing code is hard. The solution currently 14349 * adopted is: 14350 * 1) If there is no ambiguity that a '{' should always 14351 * be taken literally, at the start of a construct, we 14352 * just do so. 14353 * 2) If the literal '{' conflicts with our desired use 14354 * of it as a metacharacter, we die. The deprecation 14355 * cycles for this have come and gone. 14356 * 3) If there is ambiguity, we raise a simple warning. 14357 * This could happen, for example, if the user 14358 * intended it to introduce a quantifier, but slightly 14359 * misspelled the quantifier. Without this warning, 14360 * the quantifier would silently be taken as a literal 14361 * string of characters instead of a meta construct */ 14362 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) { 14363 if ( RExC_strict 14364 || ( p > parse_start + 1 14365 && isALPHA_A(*(p - 1)) 14366 && *(p - 2) == '\\') 14367 || new_regcurly(p, RExC_end)) 14368 { 14369 RExC_parse = p + 1; 14370 vFAIL("Unescaped left brace in regex is " 14371 "illegal here"); 14372 } 14373 ckWARNreg(p + 1, "Unescaped left brace in regex is" 14374 " passed through"); 14375 } 14376 goto normal_default; 14377 case '}': 14378 case ']': 14379 if (p > RExC_parse && RExC_strict) { 14380 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p); 14381 } 14382 /*FALLTHROUGH*/ 14383 default: /* A literal character */ 14384 normal_default: 14385 if (! UTF8_IS_INVARIANT(*p) && UTF) { 14386 STRLEN numlen; 14387 ender = utf8n_to_uvchr((U8*)p, RExC_end - p, 14388 &numlen, UTF8_ALLOW_DEFAULT); 14389 p += numlen; 14390 } 14391 else 14392 ender = (U8) *p++; 14393 break; 14394 } /* End of switch on the literal */ 14395 14396 /* Here, have looked at the literal character, and <ender> 14397 * contains its ordinal; <p> points to the character after it. 14398 * */ 14399 14400 if (ender > 255) { 14401 REQUIRE_UTF8(flagp); 14402 } 14403 14404 /* We need to check if the next non-ignored thing is a 14405 * quantifier. Move <p> to after anything that should be 14406 * ignored, which, as a side effect, positions <p> for the next 14407 * loop iteration */ 14408 skip_to_be_ignored_text(pRExC_state, &p, 14409 FALSE /* Don't force to /x */ ); 14410 14411 /* If the next thing is a quantifier, it applies to this 14412 * character only, which means that this character has to be in 14413 * its own node and can't just be appended to the string in an 14414 * existing node, so if there are already other characters in 14415 * the node, close the node with just them, and set up to do 14416 * this character again next time through, when it will be the 14417 * only thing in its new node */ 14418 14419 next_is_quantifier = LIKELY(p < RExC_end) 14420 && UNLIKELY(ISMULT2(p)); 14421 14422 if (next_is_quantifier && LIKELY(len)) { 14423 p = oldp; 14424 goto loopdone; 14425 } 14426 14427 /* Ready to add 'ender' to the node */ 14428 14429 if (! FOLD) { /* The simple case, just append the literal */ 14430 14431 not_fold_common: 14432 if (UVCHR_IS_INVARIANT(ender) || ! UTF) { 14433 *(s++) = (char) ender; 14434 } 14435 else { 14436 U8 * new_s = uvchr_to_utf8((U8*)s, ender); 14437 added_len = (char *) new_s - s; 14438 s = (char *) new_s; 14439 14440 if (ender > 255) { 14441 requires_utf8_target = TRUE; 14442 } 14443 } 14444 } 14445 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) { 14446 14447 /* Here are folding under /l, and the code point is 14448 * problematic. If this is the first character in the 14449 * node, change the node type to folding. Otherwise, if 14450 * this is the first problematic character, close up the 14451 * existing node, so can start a new node with this one */ 14452 if (! len) { 14453 node_type = EXACTFL; 14454 RExC_contains_locale = 1; 14455 } 14456 else if (node_type == EXACT) { 14457 p = oldp; 14458 goto loopdone; 14459 } 14460 14461 /* This problematic code point means we can't simplify 14462 * things */ 14463 maybe_exactfu = FALSE; 14464 14465 /* Here, we are adding a problematic fold character. 14466 * "Problematic" in this context means that its fold isn't 14467 * known until runtime. (The non-problematic code points 14468 * are the above-Latin1 ones that fold to also all 14469 * above-Latin1. Their folds don't vary no matter what the 14470 * locale is.) But here we have characters whose fold 14471 * depends on the locale. We just add in the unfolded 14472 * character, and wait until runtime to fold it */ 14473 goto not_fold_common; 14474 } 14475 else /* regular fold; see if actually is in a fold */ 14476 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender)) 14477 || (ender > 255 14478 && ! _invlist_contains_cp(PL_in_some_fold, ender))) 14479 { 14480 /* Here, folding, but the character isn't in a fold. 14481 * 14482 * Start a new node if previous characters in the node were 14483 * folded */ 14484 if (len && node_type != EXACT) { 14485 p = oldp; 14486 goto loopdone; 14487 } 14488 14489 /* Here, continuing a node with non-folded characters. Add 14490 * this one */ 14491 goto not_fold_common; 14492 } 14493 else { /* Here, does participate in some fold */ 14494 14495 /* If this is the first character in the node, change its 14496 * type to folding. Otherwise, if this is the first 14497 * folding character in the node, close up the existing 14498 * node, so can start a new node with this one. */ 14499 if (! len) { 14500 node_type = compute_EXACTish(pRExC_state); 14501 } 14502 else if (node_type == EXACT) { 14503 p = oldp; 14504 goto loopdone; 14505 } 14506 14507 if (UTF) { /* Use the folded value */ 14508 if (UVCHR_IS_INVARIANT(ender)) { 14509 *(s)++ = (U8) toFOLD(ender); 14510 } 14511 else { 14512 ender = _to_uni_fold_flags( 14513 ender, 14514 (U8 *) s, 14515 &added_len, 14516 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) 14517 ? FOLD_FLAGS_NOMIX_ASCII 14518 : 0)); 14519 s += added_len; 14520 14521 if ( ender > 255 14522 && LIKELY(ender != GREEK_SMALL_LETTER_MU)) 14523 { 14524 /* U+B5 folds to the MU, so its possible for a 14525 * non-UTF-8 target to match it */ 14526 requires_utf8_target = TRUE; 14527 } 14528 } 14529 } 14530 else { 14531 14532 /* Here is non-UTF8. First, see if the character's 14533 * fold differs between /d and /u. */ 14534 if (PL_fold[ender] != PL_fold_latin1[ender]) { 14535 maybe_exactfu = FALSE; 14536 } 14537 14538 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 14539 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 14540 || UNICODE_DOT_DOT_VERSION > 0) 14541 14542 /* On non-ancient Unicode versions, this includes the 14543 * multi-char fold SHARP S to 'ss' */ 14544 14545 if ( UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S) 14546 || ( isALPHA_FOLD_EQ(ender, 's') 14547 && len > 0 14548 && isALPHA_FOLD_EQ(*(s-1), 's'))) 14549 { 14550 /* Here, we have one of the following: 14551 * a) a SHARP S. This folds to 'ss' only under 14552 * /u rules. If we are in that situation, 14553 * fold the SHARP S to 'ss'. See the comments 14554 * for join_exact() as to why we fold this 14555 * non-UTF at compile time, and no others. 14556 * b) 'ss'. When under /u, there's nothing 14557 * special needed to be done here. The 14558 * previous iteration handled the first 's', 14559 * and this iteration will handle the second. 14560 * If, on the otherhand it's not /u, we have 14561 * to exclude the possibility of moving to /u, 14562 * so that we won't generate an unwanted 14563 * match, unless, at runtime, the target 14564 * string is in UTF-8. 14565 * */ 14566 14567 has_ss = TRUE; 14568 maybe_exactfu = FALSE; /* Can't generate an 14569 EXACTFU node (unless we 14570 already are in one) */ 14571 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) { 14572 maybe_SIMPLE = 0; 14573 if (node_type == EXACTFU) { 14574 *(s++) = 's'; 14575 14576 /* Let the code below add in the extra 's' */ 14577 ender = 's'; 14578 added_len = 2; 14579 } 14580 } 14581 } 14582 #endif 14583 14584 else if (UNLIKELY(ender == MICRO_SIGN)) { 14585 has_micro_sign = TRUE; 14586 } 14587 14588 *(s++) = (DEPENDS_SEMANTICS) 14589 ? (char) toFOLD(ender) 14590 14591 /* Under /u, the fold of any character in 14592 * the 0-255 range happens to be its 14593 * lowercase equivalent, except for LATIN 14594 * SMALL LETTER SHARP S, which was handled 14595 * above, and the MICRO SIGN, whose fold 14596 * requires UTF-8 to represent. */ 14597 : (char) toLOWER_L1(ender); 14598 } 14599 } /* End of adding current character to the node */ 14600 14601 len += added_len; 14602 14603 if (next_is_quantifier) { 14604 14605 /* Here, the next input is a quantifier, and to get here, 14606 * the current character is the only one in the node. */ 14607 goto loopdone; 14608 } 14609 14610 } /* End of loop through literal characters */ 14611 14612 /* Here we have either exhausted the input or ran out of room in 14613 * the node. (If we encountered a character that can't be in the 14614 * node, transfer is made directly to <loopdone>, and so we 14615 * wouldn't have fallen off the end of the loop.) In the latter 14616 * case, we artificially have to split the node into two, because 14617 * we just don't have enough space to hold everything. This 14618 * creates a problem if the final character participates in a 14619 * multi-character fold in the non-final position, as a match that 14620 * should have occurred won't, due to the way nodes are matched, 14621 * and our artificial boundary. So back off until we find a non- 14622 * problematic character -- one that isn't at the beginning or 14623 * middle of such a fold. (Either it doesn't participate in any 14624 * folds, or appears only in the final position of all the folds it 14625 * does participate in.) A better solution with far fewer false 14626 * positives, and that would fill the nodes more completely, would 14627 * be to actually have available all the multi-character folds to 14628 * test against, and to back-off only far enough to be sure that 14629 * this node isn't ending with a partial one. <upper_parse> is set 14630 * further below (if we need to reparse the node) to include just 14631 * up through that final non-problematic character that this code 14632 * identifies, so when it is set to less than the full node, we can 14633 * skip the rest of this */ 14634 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { 14635 PERL_UINT_FAST8_T backup_count = 0; 14636 14637 const STRLEN full_len = len; 14638 14639 assert(len >= MAX_NODE_STRING_SIZE); 14640 14641 /* Here, <s> points to just beyond where we have output the 14642 * final character of the node. Look backwards through the 14643 * string until find a non- problematic character */ 14644 14645 if (! UTF) { 14646 14647 /* This has no multi-char folds to non-UTF characters */ 14648 if (ASCII_FOLD_RESTRICTED) { 14649 goto loopdone; 14650 } 14651 14652 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { 14653 backup_count++; 14654 } 14655 len = s - s0 + 1; 14656 } 14657 else { 14658 14659 /* Point to the first byte of the final character */ 14660 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0); 14661 14662 while (s >= s0) { /* Search backwards until find 14663 a non-problematic char */ 14664 if (UTF8_IS_INVARIANT(*s)) { 14665 14666 /* There are no ascii characters that participate 14667 * in multi-char folds under /aa. In EBCDIC, the 14668 * non-ascii invariants are all control characters, 14669 * so don't ever participate in any folds. */ 14670 if (ASCII_FOLD_RESTRICTED 14671 || ! IS_NON_FINAL_FOLD(*s)) 14672 { 14673 break; 14674 } 14675 } 14676 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { 14677 if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE( 14678 *s, *(s+1)))) 14679 { 14680 break; 14681 } 14682 } 14683 else if (! _invlist_contains_cp( 14684 PL_NonFinalFold, 14685 valid_utf8_to_uvchr((U8 *) s, NULL))) 14686 { 14687 break; 14688 } 14689 14690 /* Here, the current character is problematic in that 14691 * it does occur in the non-final position of some 14692 * fold, so try the character before it, but have to 14693 * special case the very first byte in the string, so 14694 * we don't read outside the string */ 14695 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); 14696 backup_count++; 14697 } /* End of loop backwards through the string */ 14698 14699 /* If there were only problematic characters in the string, 14700 * <s> will point to before s0, in which case the length 14701 * should be 0, otherwise include the length of the 14702 * non-problematic character just found */ 14703 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); 14704 } 14705 14706 /* Here, have found the final character, if any, that is 14707 * non-problematic as far as ending the node without splitting 14708 * it across a potential multi-char fold. <len> contains the 14709 * number of bytes in the node up-to and including that 14710 * character, or is 0 if there is no such character, meaning 14711 * the whole node contains only problematic characters. In 14712 * this case, give up and just take the node as-is. We can't 14713 * do any better */ 14714 if (len == 0) { 14715 len = full_len; 14716 14717 } else { 14718 14719 /* Here, the node does contain some characters that aren't 14720 * problematic. If we didn't have to backup any, then the 14721 * final character in the node is non-problematic, and we 14722 * can take the node as-is */ 14723 if (backup_count == 0) { 14724 goto loopdone; 14725 } 14726 else if (backup_count == 1) { 14727 14728 /* If the final character is problematic, but the 14729 * penultimate is not, back-off that last character to 14730 * later start a new node with it */ 14731 p = oldp; 14732 goto loopdone; 14733 } 14734 14735 /* Here, the final non-problematic character is earlier 14736 * in the input than the penultimate character. What we do 14737 * is reparse from the beginning, going up only as far as 14738 * this final ok one, thus guaranteeing that the node ends 14739 * in an acceptable character. The reason we reparse is 14740 * that we know how far in the character is, but we don't 14741 * know how to correlate its position with the input parse. 14742 * An alternate implementation would be to build that 14743 * correlation as we go along during the original parse, 14744 * but that would entail extra work for every node, whereas 14745 * this code gets executed only when the string is too 14746 * large for the node, and the final two characters are 14747 * problematic, an infrequent occurrence. Yet another 14748 * possible strategy would be to save the tail of the 14749 * string, and the next time regatom is called, initialize 14750 * with that. The problem with this is that unless you 14751 * back off one more character, you won't be guaranteed 14752 * regatom will get called again, unless regbranch, 14753 * regpiece ... are also changed. If you do back off that 14754 * extra character, so that there is input guaranteed to 14755 * force calling regatom, you can't handle the case where 14756 * just the first character in the node is acceptable. I 14757 * (khw) decided to try this method which doesn't have that 14758 * pitfall; if performance issues are found, we can do a 14759 * combination of the current approach plus that one */ 14760 upper_parse = len; 14761 len = 0; 14762 s = s0; 14763 goto reparse; 14764 } 14765 } /* End of verifying node ends with an appropriate char */ 14766 14767 loopdone: /* Jumped to when encounters something that shouldn't be 14768 in the node */ 14769 14770 /* Free up any over-allocated space; cast is to silence bogus 14771 * warning in MS VC */ 14772 change_engine_size(pRExC_state, 14773 - (Ptrdiff_t) (initial_size - STR_SZ(len))); 14774 14775 /* I (khw) don't know if you can get here with zero length, but the 14776 * old code handled this situation by creating a zero-length EXACT 14777 * node. Might as well be NOTHING instead */ 14778 if (len == 0) { 14779 OP(REGNODE_p(ret)) = NOTHING; 14780 } 14781 else { 14782 14783 /* If the node type is EXACT here, check to see if it 14784 * should be EXACTL, or EXACT_ONLY8. */ 14785 if (node_type == EXACT) { 14786 if (LOC) { 14787 node_type = EXACTL; 14788 } 14789 else if (requires_utf8_target) { 14790 node_type = EXACT_ONLY8; 14791 } 14792 } else if (FOLD) { 14793 if ( UNLIKELY(has_micro_sign || has_ss) 14794 && (node_type == EXACTFU || ( node_type == EXACTF 14795 && maybe_exactfu))) 14796 { /* These two conditions are problematic in non-UTF-8 14797 EXACTFU nodes. */ 14798 assert(! UTF); 14799 node_type = EXACTFUP; 14800 } 14801 else if (node_type == EXACTFL) { 14802 14803 /* 'maybe_exactfu' is deliberately set above to 14804 * indicate this node type, where all code points in it 14805 * are above 255 */ 14806 if (maybe_exactfu) { 14807 node_type = EXACTFLU8; 14808 } 14809 else if (UNLIKELY( 14810 _invlist_contains_cp(PL_HasMultiCharFold, ender))) 14811 { 14812 /* A character that folds to more than one will 14813 * match multiple characters, so can't be SIMPLE. 14814 * We don't have to worry about this with EXACTFLU8 14815 * nodes just above, as they have already been 14816 * folded (since the fold doesn't vary at run 14817 * time). Here, if the final character in the node 14818 * folds to multiple, it can't be simple. (This 14819 * only has an effect if the node has only a single 14820 * character, hence the final one, as elsewhere we 14821 * turn off simple for nodes whose length > 1 */ 14822 maybe_SIMPLE = 0; 14823 } 14824 } 14825 else if (node_type == EXACTF) { /* Means is /di */ 14826 14827 /* If 'maybe_exactfu' is clear, then we need to stay 14828 * /di. If it is set, it means there are no code 14829 * points that match differently depending on UTF8ness 14830 * of the target string, so it can become an EXACTFU 14831 * node */ 14832 if (! maybe_exactfu) { 14833 RExC_seen_d_op = TRUE; 14834 } 14835 else if ( isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's') 14836 || isALPHA_FOLD_EQ(ender, 's')) 14837 { 14838 /* But, if the node begins or ends in an 's' we 14839 * have to defer changing it into an EXACTFU, as 14840 * the node could later get joined with another one 14841 * that ends or begins with 's' creating an 'ss' 14842 * sequence which would then wrongly match the 14843 * sharp s without the target being UTF-8. We 14844 * create a special node that we resolve later when 14845 * we join nodes together */ 14846 14847 node_type = EXACTFU_S_EDGE; 14848 } 14849 else { 14850 node_type = EXACTFU; 14851 } 14852 } 14853 14854 if (requires_utf8_target && node_type == EXACTFU) { 14855 node_type = EXACTFU_ONLY8; 14856 } 14857 } 14858 14859 OP(REGNODE_p(ret)) = node_type; 14860 STR_LEN(REGNODE_p(ret)) = len; 14861 RExC_emit += STR_SZ(len); 14862 14863 /* If the node isn't a single character, it can't be SIMPLE */ 14864 if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) { 14865 maybe_SIMPLE = 0; 14866 } 14867 14868 *flagp |= HASWIDTH | maybe_SIMPLE; 14869 } 14870 14871 Set_Node_Length(REGNODE_p(ret), p - parse_start - 1); 14872 RExC_parse = p; 14873 14874 { 14875 /* len is STRLEN which is unsigned, need to copy to signed */ 14876 IV iv = len; 14877 if (iv < 0) 14878 vFAIL("Internal disaster"); 14879 } 14880 14881 } /* End of label 'defchar:' */ 14882 break; 14883 } /* End of giant switch on input character */ 14884 14885 /* Position parse to next real character */ 14886 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 14887 FALSE /* Don't force to /x */ ); 14888 if ( *RExC_parse == '{' 14889 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse)) 14890 { 14891 if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) { 14892 RExC_parse++; 14893 vFAIL("Unescaped left brace in regex is illegal here"); 14894 } 14895 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is" 14896 " passed through"); 14897 } 14898 14899 return(ret); 14900 } 14901 14902 14903 STATIC void 14904 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) 14905 { 14906 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It 14907 * sets up the bitmap and any flags, removing those code points from the 14908 * inversion list, setting it to NULL should it become completely empty */ 14909 14910 dVAR; 14911 14912 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; 14913 assert(PL_regkind[OP(node)] == ANYOF); 14914 14915 /* There is no bitmap for this node type */ 14916 if (OP(node) == ANYOFH) { 14917 return; 14918 } 14919 14920 ANYOF_BITMAP_ZERO(node); 14921 if (*invlist_ptr) { 14922 14923 /* This gets set if we actually need to modify things */ 14924 bool change_invlist = FALSE; 14925 14926 UV start, end; 14927 14928 /* Start looking through *invlist_ptr */ 14929 invlist_iterinit(*invlist_ptr); 14930 while (invlist_iternext(*invlist_ptr, &start, &end)) { 14931 UV high; 14932 int i; 14933 14934 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) { 14935 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP; 14936 } 14937 14938 /* Quit if are above what we should change */ 14939 if (start >= NUM_ANYOF_CODE_POINTS) { 14940 break; 14941 } 14942 14943 change_invlist = TRUE; 14944 14945 /* Set all the bits in the range, up to the max that we are doing */ 14946 high = (end < NUM_ANYOF_CODE_POINTS - 1) 14947 ? end 14948 : NUM_ANYOF_CODE_POINTS - 1; 14949 for (i = start; i <= (int) high; i++) { 14950 if (! ANYOF_BITMAP_TEST(node, i)) { 14951 ANYOF_BITMAP_SET(node, i); 14952 } 14953 } 14954 } 14955 invlist_iterfinish(*invlist_ptr); 14956 14957 /* Done with loop; remove any code points that are in the bitmap from 14958 * *invlist_ptr; similarly for code points above the bitmap if we have 14959 * a flag to match all of them anyways */ 14960 if (change_invlist) { 14961 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr); 14962 } 14963 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { 14964 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr); 14965 } 14966 14967 /* If have completely emptied it, remove it completely */ 14968 if (_invlist_len(*invlist_ptr) == 0) { 14969 SvREFCNT_dec_NN(*invlist_ptr); 14970 *invlist_ptr = NULL; 14971 } 14972 } 14973 } 14974 14975 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. 14976 Character classes ([:foo:]) can also be negated ([:^foo:]). 14977 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. 14978 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, 14979 but trigger failures because they are currently unimplemented. */ 14980 14981 #define POSIXCC_DONE(c) ((c) == ':') 14982 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') 14983 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) 14984 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';') 14985 14986 #define WARNING_PREFIX "Assuming NOT a POSIX class since " 14987 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one" 14988 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon" 14989 14990 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1) 14991 14992 /* 'posix_warnings' and 'warn_text' are names of variables in the following 14993 * routine. q.v. */ 14994 #define ADD_POSIX_WARNING(p, text) STMT_START { \ 14995 if (posix_warnings) { \ 14996 if (! RExC_warn_text ) RExC_warn_text = \ 14997 (AV *) sv_2mortal((SV *) newAV()); \ 14998 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \ 14999 WARNING_PREFIX \ 15000 text \ 15001 REPORT_LOCATION, \ 15002 REPORT_LOCATION_ARGS(p))); \ 15003 } \ 15004 } STMT_END 15005 #define CLEAR_POSIX_WARNINGS() \ 15006 STMT_START { \ 15007 if (posix_warnings && RExC_warn_text) \ 15008 av_clear(RExC_warn_text); \ 15009 } STMT_END 15010 15011 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \ 15012 STMT_START { \ 15013 CLEAR_POSIX_WARNINGS(); \ 15014 return ret; \ 15015 } STMT_END 15016 15017 STATIC int 15018 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, 15019 15020 const char * const s, /* Where the putative posix class begins. 15021 Normally, this is one past the '['. This 15022 parameter exists so it can be somewhere 15023 besides RExC_parse. */ 15024 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or 15025 NULL */ 15026 AV ** posix_warnings, /* Where to place any generated warnings, or 15027 NULL */ 15028 const bool check_only /* Don't die if error */ 15029 ) 15030 { 15031 /* This parses what the caller thinks may be one of the three POSIX 15032 * constructs: 15033 * 1) a character class, like [:blank:] 15034 * 2) a collating symbol, like [. .] 15035 * 3) an equivalence class, like [= =] 15036 * In the latter two cases, it croaks if it finds a syntactically legal 15037 * one, as these are not handled by Perl. 15038 * 15039 * The main purpose is to look for a POSIX character class. It returns: 15040 * a) the class number 15041 * if it is a completely syntactically and semantically legal class. 15042 * 'updated_parse_ptr', if not NULL, is set to point to just after the 15043 * closing ']' of the class 15044 * b) OOB_NAMEDCLASS 15045 * if it appears that one of the three POSIX constructs was meant, but 15046 * its specification was somehow defective. 'updated_parse_ptr', if 15047 * not NULL, is set to point to the character just after the end 15048 * character of the class. See below for handling of warnings. 15049 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS 15050 * if it doesn't appear that a POSIX construct was intended. 15051 * 'updated_parse_ptr' is not changed. No warnings nor errors are 15052 * raised. 15053 * 15054 * In b) there may be errors or warnings generated. If 'check_only' is 15055 * TRUE, then any errors are discarded. Warnings are returned to the 15056 * caller via an AV* created into '*posix_warnings' if it is not NULL. If 15057 * instead it is NULL, warnings are suppressed. 15058 * 15059 * The reason for this function, and its complexity is that a bracketed 15060 * character class can contain just about anything. But it's easy to 15061 * mistype the very specific posix class syntax but yielding a valid 15062 * regular bracketed class, so it silently gets compiled into something 15063 * quite unintended. 15064 * 15065 * The solution adopted here maintains backward compatibility except that 15066 * it adds a warning if it looks like a posix class was intended but 15067 * improperly specified. The warning is not raised unless what is input 15068 * very closely resembles one of the 14 legal posix classes. To do this, 15069 * it uses fuzzy parsing. It calculates how many single-character edits it 15070 * would take to transform what was input into a legal posix class. Only 15071 * if that number is quite small does it think that the intention was a 15072 * posix class. Obviously these are heuristics, and there will be cases 15073 * where it errs on one side or another, and they can be tweaked as 15074 * experience informs. 15075 * 15076 * The syntax for a legal posix class is: 15077 * 15078 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/ 15079 * 15080 * What this routine considers syntactically to be an intended posix class 15081 * is this (the comments indicate some restrictions that the pattern 15082 * doesn't show): 15083 * 15084 * qr/(?x: \[? # The left bracket, possibly 15085 * # omitted 15086 * \h* # possibly followed by blanks 15087 * (?: \^ \h* )? # possibly a misplaced caret 15088 * [:;]? # The opening class character, 15089 * # possibly omitted. A typo 15090 * # semi-colon can also be used. 15091 * \h* 15092 * \^? # possibly a correctly placed 15093 * # caret, but not if there was also 15094 * # a misplaced one 15095 * \h* 15096 * .{3,15} # The class name. If there are 15097 * # deviations from the legal syntax, 15098 * # its edit distance must be close 15099 * # to a real class name in order 15100 * # for it to be considered to be 15101 * # an intended posix class. 15102 * \h* 15103 * [[:punct:]]? # The closing class character, 15104 * # possibly omitted. If not a colon 15105 * # nor semi colon, the class name 15106 * # must be even closer to a valid 15107 * # one 15108 * \h* 15109 * \]? # The right bracket, possibly 15110 * # omitted. 15111 * )/ 15112 * 15113 * In the above, \h must be ASCII-only. 15114 * 15115 * These are heuristics, and can be tweaked as field experience dictates. 15116 * There will be cases when someone didn't intend to specify a posix class 15117 * that this warns as being so. The goal is to minimize these, while 15118 * maximizing the catching of things intended to be a posix class that 15119 * aren't parsed as such. 15120 */ 15121 15122 const char* p = s; 15123 const char * const e = RExC_end; 15124 unsigned complement = 0; /* If to complement the class */ 15125 bool found_problem = FALSE; /* Assume OK until proven otherwise */ 15126 bool has_opening_bracket = FALSE; 15127 bool has_opening_colon = FALSE; 15128 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find 15129 valid class */ 15130 const char * possible_end = NULL; /* used for a 2nd parse pass */ 15131 const char* name_start; /* ptr to class name first char */ 15132 15133 /* If the number of single-character typos the input name is away from a 15134 * legal name is no more than this number, it is considered to have meant 15135 * the legal name */ 15136 int max_distance = 2; 15137 15138 /* to store the name. The size determines the maximum length before we 15139 * decide that no posix class was intended. Should be at least 15140 * sizeof("alphanumeric") */ 15141 UV input_text[15]; 15142 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric"); 15143 15144 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX; 15145 15146 CLEAR_POSIX_WARNINGS(); 15147 15148 if (p >= e) { 15149 return NOT_MEANT_TO_BE_A_POSIX_CLASS; 15150 } 15151 15152 if (*(p - 1) != '[') { 15153 ADD_POSIX_WARNING(p, "it doesn't start with a '['"); 15154 found_problem = TRUE; 15155 } 15156 else { 15157 has_opening_bracket = TRUE; 15158 } 15159 15160 /* They could be confused and think you can put spaces between the 15161 * components */ 15162 if (isBLANK(*p)) { 15163 found_problem = TRUE; 15164 15165 do { 15166 p++; 15167 } while (p < e && isBLANK(*p)); 15168 15169 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 15170 } 15171 15172 /* For [. .] and [= =]. These are quite different internally from [: :], 15173 * so they are handled separately. */ 15174 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']' 15175 and 1 for at least one char in it 15176 */ 15177 { 15178 const char open_char = *p; 15179 const char * temp_ptr = p + 1; 15180 15181 /* These two constructs are not handled by perl, and if we find a 15182 * syntactically valid one, we croak. khw, who wrote this code, finds 15183 * this explanation of them very unclear: 15184 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html 15185 * And searching the rest of the internet wasn't very helpful either. 15186 * It looks like just about any byte can be in these constructs, 15187 * depending on the locale. But unless the pattern is being compiled 15188 * under /l, which is very rare, Perl runs under the C or POSIX locale. 15189 * In that case, it looks like [= =] isn't allowed at all, and that 15190 * [. .] could be any single code point, but for longer strings the 15191 * constituent characters would have to be the ASCII alphabetics plus 15192 * the minus-hyphen. Any sensible locale definition would limit itself 15193 * to these. And any portable one definitely should. Trying to parse 15194 * the general case is a nightmare (see [perl #127604]). So, this code 15195 * looks only for interiors of these constructs that match: 15196 * qr/.|[-\w]{2,}/ 15197 * Using \w relaxes the apparent rules a little, without adding much 15198 * danger of mistaking something else for one of these constructs. 15199 * 15200 * [. .] in some implementations described on the internet is usable to 15201 * escape a character that otherwise is special in bracketed character 15202 * classes. For example [.].] means a literal right bracket instead of 15203 * the ending of the class 15204 * 15205 * [= =] can legitimately contain a [. .] construct, but we don't 15206 * handle this case, as that [. .] construct will later get parsed 15207 * itself and croak then. And [= =] is checked for even when not under 15208 * /l, as Perl has long done so. 15209 * 15210 * The code below relies on there being a trailing NUL, so it doesn't 15211 * have to keep checking if the parse ptr < e. 15212 */ 15213 if (temp_ptr[1] == open_char) { 15214 temp_ptr++; 15215 } 15216 else while ( temp_ptr < e 15217 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-')) 15218 { 15219 temp_ptr++; 15220 } 15221 15222 if (*temp_ptr == open_char) { 15223 temp_ptr++; 15224 if (*temp_ptr == ']') { 15225 temp_ptr++; 15226 if (! found_problem && ! check_only) { 15227 RExC_parse = (char *) temp_ptr; 15228 vFAIL3("POSIX syntax [%c %c] is reserved for future " 15229 "extensions", open_char, open_char); 15230 } 15231 15232 /* Here, the syntax wasn't completely valid, or else the call 15233 * is to check-only */ 15234 if (updated_parse_ptr) { 15235 *updated_parse_ptr = (char *) temp_ptr; 15236 } 15237 15238 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS); 15239 } 15240 } 15241 15242 /* If we find something that started out to look like one of these 15243 * constructs, but isn't, we continue below so that it can be checked 15244 * for being a class name with a typo of '.' or '=' instead of a colon. 15245 * */ 15246 } 15247 15248 /* Here, we think there is a possibility that a [: :] class was meant, and 15249 * we have the first real character. It could be they think the '^' comes 15250 * first */ 15251 if (*p == '^') { 15252 found_problem = TRUE; 15253 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon"); 15254 complement = 1; 15255 p++; 15256 15257 if (isBLANK(*p)) { 15258 found_problem = TRUE; 15259 15260 do { 15261 p++; 15262 } while (p < e && isBLANK(*p)); 15263 15264 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 15265 } 15266 } 15267 15268 /* But the first character should be a colon, which they could have easily 15269 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to 15270 * distinguish from a colon, so treat that as a colon). */ 15271 if (*p == ':') { 15272 p++; 15273 has_opening_colon = TRUE; 15274 } 15275 else if (*p == ';') { 15276 found_problem = TRUE; 15277 p++; 15278 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); 15279 has_opening_colon = TRUE; 15280 } 15281 else { 15282 found_problem = TRUE; 15283 ADD_POSIX_WARNING(p, "there must be a starting ':'"); 15284 15285 /* Consider an initial punctuation (not one of the recognized ones) to 15286 * be a left terminator */ 15287 if (*p != '^' && *p != ']' && isPUNCT(*p)) { 15288 p++; 15289 } 15290 } 15291 15292 /* They may think that you can put spaces between the components */ 15293 if (isBLANK(*p)) { 15294 found_problem = TRUE; 15295 15296 do { 15297 p++; 15298 } while (p < e && isBLANK(*p)); 15299 15300 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 15301 } 15302 15303 if (*p == '^') { 15304 15305 /* We consider something like [^:^alnum:]] to not have been intended to 15306 * be a posix class, but XXX maybe we should */ 15307 if (complement) { 15308 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 15309 } 15310 15311 complement = 1; 15312 p++; 15313 } 15314 15315 /* Again, they may think that you can put spaces between the components */ 15316 if (isBLANK(*p)) { 15317 found_problem = TRUE; 15318 15319 do { 15320 p++; 15321 } while (p < e && isBLANK(*p)); 15322 15323 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 15324 } 15325 15326 if (*p == ']') { 15327 15328 /* XXX This ']' may be a typo, and something else was meant. But 15329 * treating it as such creates enough complications, that that 15330 * possibility isn't currently considered here. So we assume that the 15331 * ']' is what is intended, and if we've already found an initial '[', 15332 * this leaves this construct looking like [:] or [:^], which almost 15333 * certainly weren't intended to be posix classes */ 15334 if (has_opening_bracket) { 15335 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 15336 } 15337 15338 /* But this function can be called when we parse the colon for 15339 * something like qr/[alpha:]]/, so we back up to look for the 15340 * beginning */ 15341 p--; 15342 15343 if (*p == ';') { 15344 found_problem = TRUE; 15345 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); 15346 } 15347 else if (*p != ':') { 15348 15349 /* XXX We are currently very restrictive here, so this code doesn't 15350 * consider the possibility that, say, /[alpha.]]/ was intended to 15351 * be a posix class. */ 15352 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 15353 } 15354 15355 /* Here we have something like 'foo:]'. There was no initial colon, 15356 * and we back up over 'foo. XXX Unlike the going forward case, we 15357 * don't handle typos of non-word chars in the middle */ 15358 has_opening_colon = FALSE; 15359 p--; 15360 15361 while (p > RExC_start && isWORDCHAR(*p)) { 15362 p--; 15363 } 15364 p++; 15365 15366 /* Here, we have positioned ourselves to where we think the first 15367 * character in the potential class is */ 15368 } 15369 15370 /* Now the interior really starts. There are certain key characters that 15371 * can end the interior, or these could just be typos. To catch both 15372 * cases, we may have to do two passes. In the first pass, we keep on 15373 * going unless we come to a sequence that matches 15374 * qr/ [[:punct:]] [[:blank:]]* \] /xa 15375 * This means it takes a sequence to end the pass, so two typos in a row if 15376 * that wasn't what was intended. If the class is perfectly formed, just 15377 * this one pass is needed. We also stop if there are too many characters 15378 * being accumulated, but this number is deliberately set higher than any 15379 * real class. It is set high enough so that someone who thinks that 15380 * 'alphanumeric' is a correct name would get warned that it wasn't. 15381 * While doing the pass, we keep track of where the key characters were in 15382 * it. If we don't find an end to the class, and one of the key characters 15383 * was found, we redo the pass, but stop when we get to that character. 15384 * Thus the key character was considered a typo in the first pass, but a 15385 * terminator in the second. If two key characters are found, we stop at 15386 * the second one in the first pass. Again this can miss two typos, but 15387 * catches a single one 15388 * 15389 * In the first pass, 'possible_end' starts as NULL, and then gets set to 15390 * point to the first key character. For the second pass, it starts as -1. 15391 * */ 15392 15393 name_start = p; 15394 parse_name: 15395 { 15396 bool has_blank = FALSE; 15397 bool has_upper = FALSE; 15398 bool has_terminating_colon = FALSE; 15399 bool has_terminating_bracket = FALSE; 15400 bool has_semi_colon = FALSE; 15401 unsigned int name_len = 0; 15402 int punct_count = 0; 15403 15404 while (p < e) { 15405 15406 /* Squeeze out blanks when looking up the class name below */ 15407 if (isBLANK(*p) ) { 15408 has_blank = TRUE; 15409 found_problem = TRUE; 15410 p++; 15411 continue; 15412 } 15413 15414 /* The name will end with a punctuation */ 15415 if (isPUNCT(*p)) { 15416 const char * peek = p + 1; 15417 15418 /* Treat any non-']' punctuation followed by a ']' (possibly 15419 * with intervening blanks) as trying to terminate the class. 15420 * ']]' is very likely to mean a class was intended (but 15421 * missing the colon), but the warning message that gets 15422 * generated shows the error position better if we exit the 15423 * loop at the bottom (eventually), so skip it here. */ 15424 if (*p != ']') { 15425 if (peek < e && isBLANK(*peek)) { 15426 has_blank = TRUE; 15427 found_problem = TRUE; 15428 do { 15429 peek++; 15430 } while (peek < e && isBLANK(*peek)); 15431 } 15432 15433 if (peek < e && *peek == ']') { 15434 has_terminating_bracket = TRUE; 15435 if (*p == ':') { 15436 has_terminating_colon = TRUE; 15437 } 15438 else if (*p == ';') { 15439 has_semi_colon = TRUE; 15440 has_terminating_colon = TRUE; 15441 } 15442 else { 15443 found_problem = TRUE; 15444 } 15445 p = peek + 1; 15446 goto try_posix; 15447 } 15448 } 15449 15450 /* Here we have punctuation we thought didn't end the class. 15451 * Keep track of the position of the key characters that are 15452 * more likely to have been class-enders */ 15453 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') { 15454 15455 /* Allow just one such possible class-ender not actually 15456 * ending the class. */ 15457 if (possible_end) { 15458 break; 15459 } 15460 possible_end = p; 15461 } 15462 15463 /* If we have too many punctuation characters, no use in 15464 * keeping going */ 15465 if (++punct_count > max_distance) { 15466 break; 15467 } 15468 15469 /* Treat the punctuation as a typo. */ 15470 input_text[name_len++] = *p; 15471 p++; 15472 } 15473 else if (isUPPER(*p)) { /* Use lowercase for lookup */ 15474 input_text[name_len++] = toLOWER(*p); 15475 has_upper = TRUE; 15476 found_problem = TRUE; 15477 p++; 15478 } else if (! UTF || UTF8_IS_INVARIANT(*p)) { 15479 input_text[name_len++] = *p; 15480 p++; 15481 } 15482 else { 15483 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL); 15484 p+= UTF8SKIP(p); 15485 } 15486 15487 /* The declaration of 'input_text' is how long we allow a potential 15488 * class name to be, before saying they didn't mean a class name at 15489 * all */ 15490 if (name_len >= C_ARRAY_LENGTH(input_text)) { 15491 break; 15492 } 15493 } 15494 15495 /* We get to here when the possible class name hasn't been properly 15496 * terminated before: 15497 * 1) we ran off the end of the pattern; or 15498 * 2) found two characters, each of which might have been intended to 15499 * be the name's terminator 15500 * 3) found so many punctuation characters in the purported name, 15501 * that the edit distance to a valid one is exceeded 15502 * 4) we decided it was more characters than anyone could have 15503 * intended to be one. */ 15504 15505 found_problem = TRUE; 15506 15507 /* In the final two cases, we know that looking up what we've 15508 * accumulated won't lead to a match, even a fuzzy one. */ 15509 if ( name_len >= C_ARRAY_LENGTH(input_text) 15510 || punct_count > max_distance) 15511 { 15512 /* If there was an intermediate key character that could have been 15513 * an intended end, redo the parse, but stop there */ 15514 if (possible_end && possible_end != (char *) -1) { 15515 possible_end = (char *) -1; /* Special signal value to say 15516 we've done a first pass */ 15517 p = name_start; 15518 goto parse_name; 15519 } 15520 15521 /* Otherwise, it can't have meant to have been a class */ 15522 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 15523 } 15524 15525 /* If we ran off the end, and the final character was a punctuation 15526 * one, back up one, to look at that final one just below. Later, we 15527 * will restore the parse pointer if appropriate */ 15528 if (name_len && p == e && isPUNCT(*(p-1))) { 15529 p--; 15530 name_len--; 15531 } 15532 15533 if (p < e && isPUNCT(*p)) { 15534 if (*p == ']') { 15535 has_terminating_bracket = TRUE; 15536 15537 /* If this is a 2nd ']', and the first one is just below this 15538 * one, consider that to be the real terminator. This gives a 15539 * uniform and better positioning for the warning message */ 15540 if ( possible_end 15541 && possible_end != (char *) -1 15542 && *possible_end == ']' 15543 && name_len && input_text[name_len - 1] == ']') 15544 { 15545 name_len--; 15546 p = possible_end; 15547 15548 /* And this is actually equivalent to having done the 2nd 15549 * pass now, so set it to not try again */ 15550 possible_end = (char *) -1; 15551 } 15552 } 15553 else { 15554 if (*p == ':') { 15555 has_terminating_colon = TRUE; 15556 } 15557 else if (*p == ';') { 15558 has_semi_colon = TRUE; 15559 has_terminating_colon = TRUE; 15560 } 15561 p++; 15562 } 15563 } 15564 15565 try_posix: 15566 15567 /* Here, we have a class name to look up. We can short circuit the 15568 * stuff below for short names that can't possibly be meant to be a 15569 * class name. (We can do this on the first pass, as any second pass 15570 * will yield an even shorter name) */ 15571 if (name_len < 3) { 15572 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 15573 } 15574 15575 /* Find which class it is. Initially switch on the length of the name. 15576 * */ 15577 switch (name_len) { 15578 case 4: 15579 if (memEQs(name_start, 4, "word")) { 15580 /* this is not POSIX, this is the Perl \w */ 15581 class_number = ANYOF_WORDCHAR; 15582 } 15583 break; 15584 case 5: 15585 /* Names all of length 5: alnum alpha ascii blank cntrl digit 15586 * graph lower print punct space upper 15587 * Offset 4 gives the best switch position. */ 15588 switch (name_start[4]) { 15589 case 'a': 15590 if (memBEGINs(name_start, 5, "alph")) /* alpha */ 15591 class_number = ANYOF_ALPHA; 15592 break; 15593 case 'e': 15594 if (memBEGINs(name_start, 5, "spac")) /* space */ 15595 class_number = ANYOF_SPACE; 15596 break; 15597 case 'h': 15598 if (memBEGINs(name_start, 5, "grap")) /* graph */ 15599 class_number = ANYOF_GRAPH; 15600 break; 15601 case 'i': 15602 if (memBEGINs(name_start, 5, "asci")) /* ascii */ 15603 class_number = ANYOF_ASCII; 15604 break; 15605 case 'k': 15606 if (memBEGINs(name_start, 5, "blan")) /* blank */ 15607 class_number = ANYOF_BLANK; 15608 break; 15609 case 'l': 15610 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */ 15611 class_number = ANYOF_CNTRL; 15612 break; 15613 case 'm': 15614 if (memBEGINs(name_start, 5, "alnu")) /* alnum */ 15615 class_number = ANYOF_ALPHANUMERIC; 15616 break; 15617 case 'r': 15618 if (memBEGINs(name_start, 5, "lowe")) /* lower */ 15619 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; 15620 else if (memBEGINs(name_start, 5, "uppe")) /* upper */ 15621 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; 15622 break; 15623 case 't': 15624 if (memBEGINs(name_start, 5, "digi")) /* digit */ 15625 class_number = ANYOF_DIGIT; 15626 else if (memBEGINs(name_start, 5, "prin")) /* print */ 15627 class_number = ANYOF_PRINT; 15628 else if (memBEGINs(name_start, 5, "punc")) /* punct */ 15629 class_number = ANYOF_PUNCT; 15630 break; 15631 } 15632 break; 15633 case 6: 15634 if (memEQs(name_start, 6, "xdigit")) 15635 class_number = ANYOF_XDIGIT; 15636 break; 15637 } 15638 15639 /* If the name exactly matches a posix class name the class number will 15640 * here be set to it, and the input almost certainly was meant to be a 15641 * posix class, so we can skip further checking. If instead the syntax 15642 * is exactly correct, but the name isn't one of the legal ones, we 15643 * will return that as an error below. But if neither of these apply, 15644 * it could be that no posix class was intended at all, or that one 15645 * was, but there was a typo. We tease these apart by doing fuzzy 15646 * matching on the name */ 15647 if (class_number == OOB_NAMEDCLASS && found_problem) { 15648 const UV posix_names[][6] = { 15649 { 'a', 'l', 'n', 'u', 'm' }, 15650 { 'a', 'l', 'p', 'h', 'a' }, 15651 { 'a', 's', 'c', 'i', 'i' }, 15652 { 'b', 'l', 'a', 'n', 'k' }, 15653 { 'c', 'n', 't', 'r', 'l' }, 15654 { 'd', 'i', 'g', 'i', 't' }, 15655 { 'g', 'r', 'a', 'p', 'h' }, 15656 { 'l', 'o', 'w', 'e', 'r' }, 15657 { 'p', 'r', 'i', 'n', 't' }, 15658 { 'p', 'u', 'n', 'c', 't' }, 15659 { 's', 'p', 'a', 'c', 'e' }, 15660 { 'u', 'p', 'p', 'e', 'r' }, 15661 { 'w', 'o', 'r', 'd' }, 15662 { 'x', 'd', 'i', 'g', 'i', 't' } 15663 }; 15664 /* The names of the above all have added NULs to make them the same 15665 * size, so we need to also have the real lengths */ 15666 const UV posix_name_lengths[] = { 15667 sizeof("alnum") - 1, 15668 sizeof("alpha") - 1, 15669 sizeof("ascii") - 1, 15670 sizeof("blank") - 1, 15671 sizeof("cntrl") - 1, 15672 sizeof("digit") - 1, 15673 sizeof("graph") - 1, 15674 sizeof("lower") - 1, 15675 sizeof("print") - 1, 15676 sizeof("punct") - 1, 15677 sizeof("space") - 1, 15678 sizeof("upper") - 1, 15679 sizeof("word") - 1, 15680 sizeof("xdigit")- 1 15681 }; 15682 unsigned int i; 15683 int temp_max = max_distance; /* Use a temporary, so if we 15684 reparse, we haven't changed the 15685 outer one */ 15686 15687 /* Use a smaller max edit distance if we are missing one of the 15688 * delimiters */ 15689 if ( has_opening_bracket + has_opening_colon < 2 15690 || has_terminating_bracket + has_terminating_colon < 2) 15691 { 15692 temp_max--; 15693 } 15694 15695 /* See if the input name is close to a legal one */ 15696 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) { 15697 15698 /* Short circuit call if the lengths are too far apart to be 15699 * able to match */ 15700 if (abs( (int) (name_len - posix_name_lengths[i])) 15701 > temp_max) 15702 { 15703 continue; 15704 } 15705 15706 if (edit_distance(input_text, 15707 posix_names[i], 15708 name_len, 15709 posix_name_lengths[i], 15710 temp_max 15711 ) 15712 > -1) 15713 { /* If it is close, it probably was intended to be a class */ 15714 goto probably_meant_to_be; 15715 } 15716 } 15717 15718 /* Here the input name is not close enough to a valid class name 15719 * for us to consider it to be intended to be a posix class. If 15720 * we haven't already done so, and the parse found a character that 15721 * could have been terminators for the name, but which we absorbed 15722 * as typos during the first pass, repeat the parse, signalling it 15723 * to stop at that character */ 15724 if (possible_end && possible_end != (char *) -1) { 15725 possible_end = (char *) -1; 15726 p = name_start; 15727 goto parse_name; 15728 } 15729 15730 /* Here neither pass found a close-enough class name */ 15731 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 15732 } 15733 15734 probably_meant_to_be: 15735 15736 /* Here we think that a posix specification was intended. Update any 15737 * parse pointer */ 15738 if (updated_parse_ptr) { 15739 *updated_parse_ptr = (char *) p; 15740 } 15741 15742 /* If a posix class name was intended but incorrectly specified, we 15743 * output or return the warnings */ 15744 if (found_problem) { 15745 15746 /* We set flags for these issues in the parse loop above instead of 15747 * adding them to the list of warnings, because we can parse it 15748 * twice, and we only want one warning instance */ 15749 if (has_upper) { 15750 ADD_POSIX_WARNING(p, "the name must be all lowercase letters"); 15751 } 15752 if (has_blank) { 15753 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 15754 } 15755 if (has_semi_colon) { 15756 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); 15757 } 15758 else if (! has_terminating_colon) { 15759 ADD_POSIX_WARNING(p, "there is no terminating ':'"); 15760 } 15761 if (! has_terminating_bracket) { 15762 ADD_POSIX_WARNING(p, "there is no terminating ']'"); 15763 } 15764 15765 if ( posix_warnings 15766 && RExC_warn_text 15767 && av_top_index(RExC_warn_text) > -1) 15768 { 15769 *posix_warnings = RExC_warn_text; 15770 } 15771 } 15772 else if (class_number != OOB_NAMEDCLASS) { 15773 /* If it is a known class, return the class. The class number 15774 * #defines are structured so each complement is +1 to the normal 15775 * one */ 15776 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement); 15777 } 15778 else if (! check_only) { 15779 15780 /* Here, it is an unrecognized class. This is an error (unless the 15781 * call is to check only, which we've already handled above) */ 15782 const char * const complement_string = (complement) 15783 ? "^" 15784 : ""; 15785 RExC_parse = (char *) p; 15786 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown", 15787 complement_string, 15788 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start)); 15789 } 15790 } 15791 15792 return OOB_NAMEDCLASS; 15793 } 15794 #undef ADD_POSIX_WARNING 15795 15796 STATIC unsigned int 15797 S_regex_set_precedence(const U8 my_operator) { 15798 15799 /* Returns the precedence in the (?[...]) construct of the input operator, 15800 * specified by its character representation. The precedence follows 15801 * general Perl rules, but it extends this so that ')' and ']' have (low) 15802 * precedence even though they aren't really operators */ 15803 15804 switch (my_operator) { 15805 case '!': 15806 return 5; 15807 case '&': 15808 return 4; 15809 case '^': 15810 case '|': 15811 case '+': 15812 case '-': 15813 return 3; 15814 case ')': 15815 return 2; 15816 case ']': 15817 return 1; 15818 } 15819 15820 NOT_REACHED; /* NOTREACHED */ 15821 return 0; /* Silence compiler warning */ 15822 } 15823 15824 STATIC regnode_offset 15825 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, 15826 I32 *flagp, U32 depth, 15827 char * const oregcomp_parse) 15828 { 15829 /* Handle the (?[...]) construct to do set operations */ 15830 15831 U8 curchar; /* Current character being parsed */ 15832 UV start, end; /* End points of code point ranges */ 15833 SV* final = NULL; /* The end result inversion list */ 15834 SV* result_string; /* 'final' stringified */ 15835 AV* stack; /* stack of operators and operands not yet 15836 resolved */ 15837 AV* fence_stack = NULL; /* A stack containing the positions in 15838 'stack' of where the undealt-with left 15839 parens would be if they were actually 15840 put there */ 15841 /* The 'volatile' is a workaround for an optimiser bug 15842 * in Solaris Studio 12.3. See RT #127455 */ 15843 volatile IV fence = 0; /* Position of where most recent undealt- 15844 with left paren in stack is; -1 if none. 15845 */ 15846 STRLEN len; /* Temporary */ 15847 regnode_offset node; /* Temporary, and final regnode returned by 15848 this function */ 15849 const bool save_fold = FOLD; /* Temporary */ 15850 char *save_end, *save_parse; /* Temporaries */ 15851 const bool in_locale = LOC; /* we turn off /l during processing */ 15852 15853 GET_RE_DEBUG_FLAGS_DECL; 15854 15855 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; 15856 15857 DEBUG_PARSE("xcls"); 15858 15859 if (in_locale) { 15860 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); 15861 } 15862 15863 /* The use of this operator implies /u. This is required so that the 15864 * compile time values are valid in all runtime cases */ 15865 REQUIRE_UNI_RULES(flagp, 0); 15866 15867 ckWARNexperimental(RExC_parse, 15868 WARN_EXPERIMENTAL__REGEX_SETS, 15869 "The regex_sets feature is experimental"); 15870 15871 /* Everything in this construct is a metacharacter. Operands begin with 15872 * either a '\' (for an escape sequence), or a '[' for a bracketed 15873 * character class. Any other character should be an operator, or 15874 * parenthesis for grouping. Both types of operands are handled by calling 15875 * regclass() to parse them. It is called with a parameter to indicate to 15876 * return the computed inversion list. The parsing here is implemented via 15877 * a stack. Each entry on the stack is a single character representing one 15878 * of the operators; or else a pointer to an operand inversion list. */ 15879 15880 #define IS_OPERATOR(a) SvIOK(a) 15881 #define IS_OPERAND(a) (! IS_OPERATOR(a)) 15882 15883 /* The stack is kept in Łukasiewicz order. (That's pronounced similar 15884 * to luke-a-shave-itch (or -itz), but people who didn't want to bother 15885 * with pronouncing it called it Reverse Polish instead, but now that YOU 15886 * know how to pronounce it you can use the correct term, thus giving due 15887 * credit to the person who invented it, and impressing your geek friends. 15888 * Wikipedia says that the pronounciation of "Ł" has been changing so that 15889 * it is now more like an English initial W (as in wonk) than an L.) 15890 * 15891 * This means that, for example, 'a | b & c' is stored on the stack as 15892 * 15893 * c [4] 15894 * b [3] 15895 * & [2] 15896 * a [1] 15897 * | [0] 15898 * 15899 * where the numbers in brackets give the stack [array] element number. 15900 * In this implementation, parentheses are not stored on the stack. 15901 * Instead a '(' creates a "fence" so that the part of the stack below the 15902 * fence is invisible except to the corresponding ')' (this allows us to 15903 * replace testing for parens, by using instead subtraction of the fence 15904 * position). As new operands are processed they are pushed onto the stack 15905 * (except as noted in the next paragraph). New operators of higher 15906 * precedence than the current final one are inserted on the stack before 15907 * the lhs operand (so that when the rhs is pushed next, everything will be 15908 * in the correct positions shown above. When an operator of equal or 15909 * lower precedence is encountered in parsing, all the stacked operations 15910 * of equal or higher precedence are evaluated, leaving the result as the 15911 * top entry on the stack. This makes higher precedence operations 15912 * evaluate before lower precedence ones, and causes operations of equal 15913 * precedence to left associate. 15914 * 15915 * The only unary operator '!' is immediately pushed onto the stack when 15916 * encountered. When an operand is encountered, if the top of the stack is 15917 * a '!", the complement is immediately performed, and the '!' popped. The 15918 * resulting value is treated as a new operand, and the logic in the 15919 * previous paragraph is executed. Thus in the expression 15920 * [a] + ! [b] 15921 * the stack looks like 15922 * 15923 * ! 15924 * a 15925 * + 15926 * 15927 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack 15928 * becomes 15929 * 15930 * !b 15931 * a 15932 * + 15933 * 15934 * A ')' is treated as an operator with lower precedence than all the 15935 * aforementioned ones, which causes all operations on the stack above the 15936 * corresponding '(' to be evaluated down to a single resultant operand. 15937 * Then the fence for the '(' is removed, and the operand goes through the 15938 * algorithm above, without the fence. 15939 * 15940 * A separate stack is kept of the fence positions, so that the position of 15941 * the latest so-far unbalanced '(' is at the top of it. 15942 * 15943 * The ']' ending the construct is treated as the lowest operator of all, 15944 * so that everything gets evaluated down to a single operand, which is the 15945 * result */ 15946 15947 sv_2mortal((SV *)(stack = newAV())); 15948 sv_2mortal((SV *)(fence_stack = newAV())); 15949 15950 while (RExC_parse < RExC_end) { 15951 I32 top_index; /* Index of top-most element in 'stack' */ 15952 SV** top_ptr; /* Pointer to top 'stack' element */ 15953 SV* current = NULL; /* To contain the current inversion list 15954 operand */ 15955 SV* only_to_avoid_leaks; 15956 15957 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 15958 TRUE /* Force /x */ ); 15959 if (RExC_parse >= RExC_end) { /* Fail */ 15960 break; 15961 } 15962 15963 curchar = UCHARAT(RExC_parse); 15964 15965 redo_curchar: 15966 15967 #ifdef ENABLE_REGEX_SETS_DEBUGGING 15968 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */ 15969 DEBUG_U(dump_regex_sets_structures(pRExC_state, 15970 stack, fence, fence_stack)); 15971 #endif 15972 15973 top_index = av_tindex_skip_len_mg(stack); 15974 15975 switch (curchar) { 15976 SV** stacked_ptr; /* Ptr to something already on 'stack' */ 15977 char stacked_operator; /* The topmost operator on the 'stack'. */ 15978 SV* lhs; /* Operand to the left of the operator */ 15979 SV* rhs; /* Operand to the right of the operator */ 15980 SV* fence_ptr; /* Pointer to top element of the fence 15981 stack */ 15982 15983 case '(': 15984 15985 if ( RExC_parse < RExC_end - 2 15986 && UCHARAT(RExC_parse + 1) == '?' 15987 && UCHARAT(RExC_parse + 2) == '^') 15988 { 15989 /* If is a '(?', could be an embedded '(?^flags:(?[...])'. 15990 * This happens when we have some thing like 15991 * 15992 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; 15993 * ... 15994 * qr/(?[ \p{Digit} & $thai_or_lao ])/; 15995 * 15996 * Here we would be handling the interpolated 15997 * '$thai_or_lao'. We handle this by a recursive call to 15998 * ourselves which returns the inversion list the 15999 * interpolated expression evaluates to. We use the flags 16000 * from the interpolated pattern. */ 16001 U32 save_flags = RExC_flags; 16002 const char * save_parse; 16003 16004 RExC_parse += 2; /* Skip past the '(?' */ 16005 save_parse = RExC_parse; 16006 16007 /* Parse the flags for the '(?'. We already know the first 16008 * flag to parse is a '^' */ 16009 parse_lparen_question_flags(pRExC_state); 16010 16011 if ( RExC_parse >= RExC_end - 4 16012 || UCHARAT(RExC_parse) != ':' 16013 || UCHARAT(++RExC_parse) != '(' 16014 || UCHARAT(++RExC_parse) != '?' 16015 || UCHARAT(++RExC_parse) != '[') 16016 { 16017 16018 /* In combination with the above, this moves the 16019 * pointer to the point just after the first erroneous 16020 * character. */ 16021 if (RExC_parse >= RExC_end - 4) { 16022 RExC_parse = RExC_end; 16023 } 16024 else if (RExC_parse != save_parse) { 16025 RExC_parse += (UTF) 16026 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) 16027 : 1; 16028 } 16029 vFAIL("Expecting '(?flags:(?[...'"); 16030 } 16031 16032 /* Recurse, with the meat of the embedded expression */ 16033 RExC_parse++; 16034 if (! handle_regex_sets(pRExC_state, ¤t, flagp, 16035 depth+1, oregcomp_parse)) 16036 { 16037 RETURN_FAIL_ON_RESTART(*flagp, flagp); 16038 } 16039 16040 /* Here, 'current' contains the embedded expression's 16041 * inversion list, and RExC_parse points to the trailing 16042 * ']'; the next character should be the ')' */ 16043 RExC_parse++; 16044 if (UCHARAT(RExC_parse) != ')') 16045 vFAIL("Expecting close paren for nested extended charclass"); 16046 16047 /* Then the ')' matching the original '(' handled by this 16048 * case: statement */ 16049 RExC_parse++; 16050 if (UCHARAT(RExC_parse) != ')') 16051 vFAIL("Expecting close paren for wrapper for nested extended charclass"); 16052 16053 RExC_flags = save_flags; 16054 goto handle_operand; 16055 } 16056 16057 /* A regular '('. Look behind for illegal syntax */ 16058 if (top_index - fence >= 0) { 16059 /* If the top entry on the stack is an operator, it had 16060 * better be a '!', otherwise the entry below the top 16061 * operand should be an operator */ 16062 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE)) 16063 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!') 16064 || ( IS_OPERAND(*top_ptr) 16065 && ( top_index - fence < 1 16066 || ! (stacked_ptr = av_fetch(stack, 16067 top_index - 1, 16068 FALSE)) 16069 || ! IS_OPERATOR(*stacked_ptr)))) 16070 { 16071 RExC_parse++; 16072 vFAIL("Unexpected '(' with no preceding operator"); 16073 } 16074 } 16075 16076 /* Stack the position of this undealt-with left paren */ 16077 av_push(fence_stack, newSViv(fence)); 16078 fence = top_index + 1; 16079 break; 16080 16081 case '\\': 16082 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if 16083 * multi-char folds are allowed. */ 16084 if (!regclass(pRExC_state, flagp, depth+1, 16085 TRUE, /* means parse just the next thing */ 16086 FALSE, /* don't allow multi-char folds */ 16087 FALSE, /* don't silence non-portable warnings. */ 16088 TRUE, /* strict */ 16089 FALSE, /* Require return to be an ANYOF */ 16090 ¤t)) 16091 { 16092 RETURN_FAIL_ON_RESTART(*flagp, flagp); 16093 goto regclass_failed; 16094 } 16095 16096 /* regclass() will return with parsing just the \ sequence, 16097 * leaving the parse pointer at the next thing to parse */ 16098 RExC_parse--; 16099 goto handle_operand; 16100 16101 case '[': /* Is a bracketed character class */ 16102 { 16103 /* See if this is a [:posix:] class. */ 16104 bool is_posix_class = (OOB_NAMEDCLASS 16105 < handle_possible_posix(pRExC_state, 16106 RExC_parse + 1, 16107 NULL, 16108 NULL, 16109 TRUE /* checking only */)); 16110 /* If it is a posix class, leave the parse pointer at the '[' 16111 * to fool regclass() into thinking it is part of a 16112 * '[[:posix:]]'. */ 16113 if (! is_posix_class) { 16114 RExC_parse++; 16115 } 16116 16117 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if 16118 * multi-char folds are allowed. */ 16119 if (!regclass(pRExC_state, flagp, depth+1, 16120 is_posix_class, /* parse the whole char 16121 class only if not a 16122 posix class */ 16123 FALSE, /* don't allow multi-char folds */ 16124 TRUE, /* silence non-portable warnings. */ 16125 TRUE, /* strict */ 16126 FALSE, /* Require return to be an ANYOF */ 16127 ¤t)) 16128 { 16129 RETURN_FAIL_ON_RESTART(*flagp, flagp); 16130 goto regclass_failed; 16131 } 16132 16133 if (! current) { 16134 break; 16135 } 16136 16137 /* function call leaves parse pointing to the ']', except if we 16138 * faked it */ 16139 if (is_posix_class) { 16140 RExC_parse--; 16141 } 16142 16143 goto handle_operand; 16144 } 16145 16146 case ']': 16147 if (top_index >= 1) { 16148 goto join_operators; 16149 } 16150 16151 /* Only a single operand on the stack: are done */ 16152 goto done; 16153 16154 case ')': 16155 if (av_tindex_skip_len_mg(fence_stack) < 0) { 16156 if (UCHARAT(RExC_parse - 1) == ']') { 16157 break; 16158 } 16159 RExC_parse++; 16160 vFAIL("Unexpected ')'"); 16161 } 16162 16163 /* If nothing after the fence, is missing an operand */ 16164 if (top_index - fence < 0) { 16165 RExC_parse++; 16166 goto bad_syntax; 16167 } 16168 /* If at least two things on the stack, treat this as an 16169 * operator */ 16170 if (top_index - fence >= 1) { 16171 goto join_operators; 16172 } 16173 16174 /* Here only a single thing on the fenced stack, and there is a 16175 * fence. Get rid of it */ 16176 fence_ptr = av_pop(fence_stack); 16177 assert(fence_ptr); 16178 fence = SvIV(fence_ptr); 16179 SvREFCNT_dec_NN(fence_ptr); 16180 fence_ptr = NULL; 16181 16182 if (fence < 0) { 16183 fence = 0; 16184 } 16185 16186 /* Having gotten rid of the fence, we pop the operand at the 16187 * stack top and process it as a newly encountered operand */ 16188 current = av_pop(stack); 16189 if (IS_OPERAND(current)) { 16190 goto handle_operand; 16191 } 16192 16193 RExC_parse++; 16194 goto bad_syntax; 16195 16196 case '&': 16197 case '|': 16198 case '+': 16199 case '-': 16200 case '^': 16201 16202 /* These binary operators should have a left operand already 16203 * parsed */ 16204 if ( top_index - fence < 0 16205 || top_index - fence == 1 16206 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) 16207 || ! IS_OPERAND(*top_ptr)) 16208 { 16209 goto unexpected_binary; 16210 } 16211 16212 /* If only the one operand is on the part of the stack visible 16213 * to us, we just place this operator in the proper position */ 16214 if (top_index - fence < 2) { 16215 16216 /* Place the operator before the operand */ 16217 16218 SV* lhs = av_pop(stack); 16219 av_push(stack, newSVuv(curchar)); 16220 av_push(stack, lhs); 16221 break; 16222 } 16223 16224 /* But if there is something else on the stack, we need to 16225 * process it before this new operator if and only if the 16226 * stacked operation has equal or higher precedence than the 16227 * new one */ 16228 16229 join_operators: 16230 16231 /* The operator on the stack is supposed to be below both its 16232 * operands */ 16233 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE)) 16234 || IS_OPERAND(*stacked_ptr)) 16235 { 16236 /* But if not, it's legal and indicates we are completely 16237 * done if and only if we're currently processing a ']', 16238 * which should be the final thing in the expression */ 16239 if (curchar == ']') { 16240 goto done; 16241 } 16242 16243 unexpected_binary: 16244 RExC_parse++; 16245 vFAIL2("Unexpected binary operator '%c' with no " 16246 "preceding operand", curchar); 16247 } 16248 stacked_operator = (char) SvUV(*stacked_ptr); 16249 16250 if (regex_set_precedence(curchar) 16251 > regex_set_precedence(stacked_operator)) 16252 { 16253 /* Here, the new operator has higher precedence than the 16254 * stacked one. This means we need to add the new one to 16255 * the stack to await its rhs operand (and maybe more 16256 * stuff). We put it before the lhs operand, leaving 16257 * untouched the stacked operator and everything below it 16258 * */ 16259 lhs = av_pop(stack); 16260 assert(IS_OPERAND(lhs)); 16261 16262 av_push(stack, newSVuv(curchar)); 16263 av_push(stack, lhs); 16264 break; 16265 } 16266 16267 /* Here, the new operator has equal or lower precedence than 16268 * what's already there. This means the operation already 16269 * there should be performed now, before the new one. */ 16270 16271 rhs = av_pop(stack); 16272 if (! IS_OPERAND(rhs)) { 16273 16274 /* This can happen when a ! is not followed by an operand, 16275 * like in /(?[\t &!])/ */ 16276 goto bad_syntax; 16277 } 16278 16279 lhs = av_pop(stack); 16280 16281 if (! IS_OPERAND(lhs)) { 16282 16283 /* This can happen when there is an empty (), like in 16284 * /(?[[0]+()+])/ */ 16285 goto bad_syntax; 16286 } 16287 16288 switch (stacked_operator) { 16289 case '&': 16290 _invlist_intersection(lhs, rhs, &rhs); 16291 break; 16292 16293 case '|': 16294 case '+': 16295 _invlist_union(lhs, rhs, &rhs); 16296 break; 16297 16298 case '-': 16299 _invlist_subtract(lhs, rhs, &rhs); 16300 break; 16301 16302 case '^': /* The union minus the intersection */ 16303 { 16304 SV* i = NULL; 16305 SV* u = NULL; 16306 16307 _invlist_union(lhs, rhs, &u); 16308 _invlist_intersection(lhs, rhs, &i); 16309 _invlist_subtract(u, i, &rhs); 16310 SvREFCNT_dec_NN(i); 16311 SvREFCNT_dec_NN(u); 16312 break; 16313 } 16314 } 16315 SvREFCNT_dec(lhs); 16316 16317 /* Here, the higher precedence operation has been done, and the 16318 * result is in 'rhs'. We overwrite the stacked operator with 16319 * the result. Then we redo this code to either push the new 16320 * operator onto the stack or perform any higher precedence 16321 * stacked operation */ 16322 only_to_avoid_leaks = av_pop(stack); 16323 SvREFCNT_dec(only_to_avoid_leaks); 16324 av_push(stack, rhs); 16325 goto redo_curchar; 16326 16327 case '!': /* Highest priority, right associative */ 16328 16329 /* If what's already at the top of the stack is another '!", 16330 * they just cancel each other out */ 16331 if ( (top_ptr = av_fetch(stack, top_index, FALSE)) 16332 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!')) 16333 { 16334 only_to_avoid_leaks = av_pop(stack); 16335 SvREFCNT_dec(only_to_avoid_leaks); 16336 } 16337 else { /* Otherwise, since it's right associative, just push 16338 onto the stack */ 16339 av_push(stack, newSVuv(curchar)); 16340 } 16341 break; 16342 16343 default: 16344 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; 16345 if (RExC_parse >= RExC_end) { 16346 break; 16347 } 16348 vFAIL("Unexpected character"); 16349 16350 handle_operand: 16351 16352 /* Here 'current' is the operand. If something is already on the 16353 * stack, we have to check if it is a !. But first, the code above 16354 * may have altered the stack in the time since we earlier set 16355 * 'top_index'. */ 16356 16357 top_index = av_tindex_skip_len_mg(stack); 16358 if (top_index - fence >= 0) { 16359 /* If the top entry on the stack is an operator, it had better 16360 * be a '!', otherwise the entry below the top operand should 16361 * be an operator */ 16362 top_ptr = av_fetch(stack, top_index, FALSE); 16363 assert(top_ptr); 16364 if (IS_OPERATOR(*top_ptr)) { 16365 16366 /* The only permissible operator at the top of the stack is 16367 * '!', which is applied immediately to this operand. */ 16368 curchar = (char) SvUV(*top_ptr); 16369 if (curchar != '!') { 16370 SvREFCNT_dec(current); 16371 vFAIL2("Unexpected binary operator '%c' with no " 16372 "preceding operand", curchar); 16373 } 16374 16375 _invlist_invert(current); 16376 16377 only_to_avoid_leaks = av_pop(stack); 16378 SvREFCNT_dec(only_to_avoid_leaks); 16379 16380 /* And we redo with the inverted operand. This allows 16381 * handling multiple ! in a row */ 16382 goto handle_operand; 16383 } 16384 /* Single operand is ok only for the non-binary ')' 16385 * operator */ 16386 else if ((top_index - fence == 0 && curchar != ')') 16387 || (top_index - fence > 0 16388 && (! (stacked_ptr = av_fetch(stack, 16389 top_index - 1, 16390 FALSE)) 16391 || IS_OPERAND(*stacked_ptr)))) 16392 { 16393 SvREFCNT_dec(current); 16394 vFAIL("Operand with no preceding operator"); 16395 } 16396 } 16397 16398 /* Here there was nothing on the stack or the top element was 16399 * another operand. Just add this new one */ 16400 av_push(stack, current); 16401 16402 } /* End of switch on next parse token */ 16403 16404 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; 16405 } /* End of loop parsing through the construct */ 16406 16407 vFAIL("Syntax error in (?[...])"); 16408 16409 done: 16410 16411 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') { 16412 if (RExC_parse < RExC_end) { 16413 RExC_parse++; 16414 } 16415 16416 vFAIL("Unexpected ']' with no following ')' in (?[..."); 16417 } 16418 16419 if (av_tindex_skip_len_mg(fence_stack) >= 0) { 16420 vFAIL("Unmatched ("); 16421 } 16422 16423 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */ 16424 || ((final = av_pop(stack)) == NULL) 16425 || ! IS_OPERAND(final) 16426 || ! is_invlist(final) 16427 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */ 16428 { 16429 bad_syntax: 16430 SvREFCNT_dec(final); 16431 vFAIL("Incomplete expression within '(?[ ])'"); 16432 } 16433 16434 /* Here, 'final' is the resultant inversion list from evaluating the 16435 * expression. Return it if so requested */ 16436 if (return_invlist) { 16437 *return_invlist = final; 16438 return END; 16439 } 16440 16441 /* Otherwise generate a resultant node, based on 'final'. regclass() is 16442 * expecting a string of ranges and individual code points */ 16443 invlist_iterinit(final); 16444 result_string = newSVpvs(""); 16445 while (invlist_iternext(final, &start, &end)) { 16446 if (start == end) { 16447 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start); 16448 } 16449 else { 16450 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}", 16451 start, end); 16452 } 16453 } 16454 16455 /* About to generate an ANYOF (or similar) node from the inversion list we 16456 * have calculated */ 16457 save_parse = RExC_parse; 16458 RExC_parse = SvPV(result_string, len); 16459 save_end = RExC_end; 16460 RExC_end = RExC_parse + len; 16461 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE; 16462 16463 /* We turn off folding around the call, as the class we have constructed 16464 * already has all folding taken into consideration, and we don't want 16465 * regclass() to add to that */ 16466 RExC_flags &= ~RXf_PMf_FOLD; 16467 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char 16468 * folds are allowed. */ 16469 node = regclass(pRExC_state, flagp, depth+1, 16470 FALSE, /* means parse the whole char class */ 16471 FALSE, /* don't allow multi-char folds */ 16472 TRUE, /* silence non-portable warnings. The above may very 16473 well have generated non-portable code points, but 16474 they're valid on this machine */ 16475 FALSE, /* similarly, no need for strict */ 16476 FALSE, /* Require return to be an ANYOF */ 16477 NULL 16478 ); 16479 16480 RESTORE_WARNINGS; 16481 RExC_parse = save_parse + 1; 16482 RExC_end = save_end; 16483 SvREFCNT_dec_NN(final); 16484 SvREFCNT_dec_NN(result_string); 16485 16486 if (save_fold) { 16487 RExC_flags |= RXf_PMf_FOLD; 16488 } 16489 16490 if (!node) { 16491 RETURN_FAIL_ON_RESTART(*flagp, flagp); 16492 goto regclass_failed; 16493 } 16494 16495 /* Fix up the node type if we are in locale. (We have pretended we are 16496 * under /u for the purposes of regclass(), as this construct will only 16497 * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so 16498 * as to cause any warnings about bad locales to be output in regexec.c), 16499 * and add the flag that indicates to check if not in a UTF-8 locale. The 16500 * reason we above forbid optimization into something other than an ANYOF 16501 * node is simply to minimize the number of code changes in regexec.c. 16502 * Otherwise we would have to create new EXACTish node types and deal with 16503 * them. This decision could be revisited should this construct become 16504 * popular. 16505 * 16506 * (One might think we could look at the resulting ANYOF node and suppress 16507 * the flag if everything is above 255, as those would be UTF-8 only, 16508 * but this isn't true, as the components that led to that result could 16509 * have been locale-affected, and just happen to cancel each other out 16510 * under UTF-8 locales.) */ 16511 if (in_locale) { 16512 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); 16513 16514 assert(OP(REGNODE_p(node)) == ANYOF); 16515 16516 OP(REGNODE_p(node)) = ANYOFL; 16517 ANYOF_FLAGS(REGNODE_p(node)) 16518 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; 16519 } 16520 16521 nextchar(pRExC_state); 16522 Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */ 16523 return node; 16524 16525 regclass_failed: 16526 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf, 16527 (UV) *flagp); 16528 } 16529 16530 #ifdef ENABLE_REGEX_SETS_DEBUGGING 16531 16532 STATIC void 16533 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state, 16534 AV * stack, const IV fence, AV * fence_stack) 16535 { /* Dumps the stacks in handle_regex_sets() */ 16536 16537 const SSize_t stack_top = av_tindex_skip_len_mg(stack); 16538 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack); 16539 SSize_t i; 16540 16541 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES; 16542 16543 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse); 16544 16545 if (stack_top < 0) { 16546 PerlIO_printf(Perl_debug_log, "Nothing on stack\n"); 16547 } 16548 else { 16549 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence); 16550 for (i = stack_top; i >= 0; i--) { 16551 SV ** element_ptr = av_fetch(stack, i, FALSE); 16552 if (! element_ptr) { 16553 } 16554 16555 if (IS_OPERATOR(*element_ptr)) { 16556 PerlIO_printf(Perl_debug_log, "[%d]: %c\n", 16557 (int) i, (int) SvIV(*element_ptr)); 16558 } 16559 else { 16560 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i); 16561 sv_dump(*element_ptr); 16562 } 16563 } 16564 } 16565 16566 if (fence_stack_top < 0) { 16567 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n"); 16568 } 16569 else { 16570 PerlIO_printf(Perl_debug_log, "Fence_stack: \n"); 16571 for (i = fence_stack_top; i >= 0; i--) { 16572 SV ** element_ptr = av_fetch(fence_stack, i, FALSE); 16573 if (! element_ptr) { 16574 } 16575 16576 PerlIO_printf(Perl_debug_log, "[%d]: %d\n", 16577 (int) i, (int) SvIV(*element_ptr)); 16578 } 16579 } 16580 } 16581 16582 #endif 16583 16584 #undef IS_OPERATOR 16585 #undef IS_OPERAND 16586 16587 STATIC void 16588 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) 16589 { 16590 /* This adds the Latin1/above-Latin1 folding rules. 16591 * 16592 * This should be called only for a Latin1-range code points, cp, which is 16593 * known to be involved in a simple fold with other code points above 16594 * Latin1. It would give false results if /aa has been specified. 16595 * Multi-char folds are outside the scope of this, and must be handled 16596 * specially. */ 16597 16598 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; 16599 16600 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp)); 16601 16602 /* The rules that are valid for all Unicode versions are hard-coded in */ 16603 switch (cp) { 16604 case 'k': 16605 case 'K': 16606 *invlist = 16607 add_cp_to_invlist(*invlist, KELVIN_SIGN); 16608 break; 16609 case 's': 16610 case 'S': 16611 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); 16612 break; 16613 case MICRO_SIGN: 16614 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); 16615 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); 16616 break; 16617 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: 16618 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: 16619 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); 16620 break; 16621 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: 16622 *invlist = add_cp_to_invlist(*invlist, 16623 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); 16624 break; 16625 16626 default: /* Other code points are checked against the data for the 16627 current Unicode version */ 16628 { 16629 Size_t folds_count; 16630 unsigned int first_fold; 16631 const unsigned int * remaining_folds; 16632 UV folded_cp; 16633 16634 if (isASCII(cp)) { 16635 folded_cp = toFOLD(cp); 16636 } 16637 else { 16638 U8 dummy_fold[UTF8_MAXBYTES_CASE+1]; 16639 Size_t dummy_len; 16640 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0); 16641 } 16642 16643 if (folded_cp > 255) { 16644 *invlist = add_cp_to_invlist(*invlist, folded_cp); 16645 } 16646 16647 folds_count = _inverse_folds(folded_cp, &first_fold, 16648 &remaining_folds); 16649 if (folds_count == 0) { 16650 16651 /* Use deprecated warning to increase the chances of this being 16652 * output */ 16653 ckWARN2reg_d(RExC_parse, 16654 "Perl folding rules are not up-to-date for 0x%02X;" 16655 " please use the perlbug utility to report;", cp); 16656 } 16657 else { 16658 unsigned int i; 16659 16660 if (first_fold > 255) { 16661 *invlist = add_cp_to_invlist(*invlist, first_fold); 16662 } 16663 for (i = 0; i < folds_count - 1; i++) { 16664 if (remaining_folds[i] > 255) { 16665 *invlist = add_cp_to_invlist(*invlist, 16666 remaining_folds[i]); 16667 } 16668 } 16669 } 16670 break; 16671 } 16672 } 16673 } 16674 16675 STATIC void 16676 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings) 16677 { 16678 /* Output the elements of the array given by '*posix_warnings' as REGEXP 16679 * warnings. */ 16680 16681 SV * msg; 16682 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP)); 16683 16684 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS; 16685 16686 if (! TO_OUTPUT_WARNINGS(RExC_parse)) { 16687 return; 16688 } 16689 16690 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) { 16691 if (first_is_fatal) { /* Avoid leaking this */ 16692 av_undef(posix_warnings); /* This isn't necessary if the 16693 array is mortal, but is a 16694 fail-safe */ 16695 (void) sv_2mortal(msg); 16696 PREPARE_TO_DIE; 16697 } 16698 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg)); 16699 SvREFCNT_dec_NN(msg); 16700 } 16701 16702 UPDATE_WARNINGS_LOC(RExC_parse); 16703 } 16704 16705 STATIC AV * 16706 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count) 16707 { 16708 /* This adds the string scalar <multi_string> to the array 16709 * <multi_char_matches>. <multi_string> is known to have exactly 16710 * <cp_count> code points in it. This is used when constructing a 16711 * bracketed character class and we find something that needs to match more 16712 * than a single character. 16713 * 16714 * <multi_char_matches> is actually an array of arrays. Each top-level 16715 * element is an array that contains all the strings known so far that are 16716 * the same length. And that length (in number of code points) is the same 16717 * as the index of the top-level array. Hence, the [2] element is an 16718 * array, each element thereof is a string containing TWO code points; 16719 * while element [3] is for strings of THREE characters, and so on. Since 16720 * this is for multi-char strings there can never be a [0] nor [1] element. 16721 * 16722 * When we rewrite the character class below, we will do so such that the 16723 * longest strings are written first, so that it prefers the longest 16724 * matching strings first. This is done even if it turns out that any 16725 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom 16726 * Christiansen has agreed that this is ok. This makes the test for the 16727 * ligature 'ffi' come before the test for 'ff', for example */ 16728 16729 AV* this_array; 16730 AV** this_array_ptr; 16731 16732 PERL_ARGS_ASSERT_ADD_MULTI_MATCH; 16733 16734 if (! multi_char_matches) { 16735 multi_char_matches = newAV(); 16736 } 16737 16738 if (av_exists(multi_char_matches, cp_count)) { 16739 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE); 16740 this_array = *this_array_ptr; 16741 } 16742 else { 16743 this_array = newAV(); 16744 av_store(multi_char_matches, cp_count, 16745 (SV*) this_array); 16746 } 16747 av_push(this_array, multi_string); 16748 16749 return multi_char_matches; 16750 } 16751 16752 /* The names of properties whose definitions are not known at compile time are 16753 * stored in this SV, after a constant heading. So if the length has been 16754 * changed since initialization, then there is a run-time definition. */ 16755 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ 16756 (SvCUR(listsv) != initial_listsv_len) 16757 16758 /* There is a restricted set of white space characters that are legal when 16759 * ignoring white space in a bracketed character class. This generates the 16760 * code to skip them. 16761 * 16762 * There is a line below that uses the same white space criteria but is outside 16763 * this macro. Both here and there must use the same definition */ 16764 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \ 16765 STMT_START { \ 16766 if (do_skip) { \ 16767 while (isBLANK_A(UCHARAT(p))) \ 16768 { \ 16769 p++; \ 16770 } \ 16771 } \ 16772 } STMT_END 16773 16774 STATIC regnode_offset 16775 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, 16776 const bool stop_at_1, /* Just parse the next thing, don't 16777 look for a full character class */ 16778 bool allow_mutiple_chars, 16779 const bool silence_non_portable, /* Don't output warnings 16780 about too large 16781 characters */ 16782 const bool strict, 16783 bool optimizable, /* ? Allow a non-ANYOF return 16784 node */ 16785 SV** ret_invlist /* Return an inversion list, not a node */ 16786 ) 16787 { 16788 /* parse a bracketed class specification. Most of these will produce an 16789 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an 16790 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex 16791 * under /i with multi-character folds: it will be rewritten following the 16792 * paradigm of this example, where the <multi-fold>s are characters which 16793 * fold to multiple character sequences: 16794 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i 16795 * gets effectively rewritten as: 16796 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i 16797 * reg() gets called (recursively) on the rewritten version, and this 16798 * function will return what it constructs. (Actually the <multi-fold>s 16799 * aren't physically removed from the [abcdefghi], it's just that they are 16800 * ignored in the recursion by means of a flag: 16801 * <RExC_in_multi_char_class>.) 16802 * 16803 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS 16804 * characters, with the corresponding bit set if that character is in the 16805 * list. For characters above this, an inversion list is used. There 16806 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not 16807 * determinable at compile time 16808 * 16809 * On success, returns the offset at which any next node should be placed 16810 * into the regex engine program being compiled. 16811 * 16812 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs 16813 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to 16814 * UTF-8 16815 */ 16816 16817 dVAR; 16818 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; 16819 IV range = 0; 16820 UV value = OOB_UNICODE, save_value = OOB_UNICODE; 16821 regnode_offset ret = -1; /* Initialized to an illegal value */ 16822 STRLEN numlen; 16823 int namedclass = OOB_NAMEDCLASS; 16824 char *rangebegin = NULL; 16825 SV *listsv = NULL; /* List of \p{user-defined} whose definitions 16826 aren't available at the time this was called */ 16827 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more 16828 than just initialized. */ 16829 SV* properties = NULL; /* Code points that match \p{} \P{} */ 16830 SV* posixes = NULL; /* Code points that match classes like [:word:], 16831 extended beyond the Latin1 range. These have to 16832 be kept separate from other code points for much 16833 of this function because their handling is 16834 different under /i, and for most classes under 16835 /d as well */ 16836 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept 16837 separate for a while from the non-complemented 16838 versions because of complications with /d 16839 matching */ 16840 SV* simple_posixes = NULL; /* But under some conditions, the classes can be 16841 treated more simply than the general case, 16842 leading to less compilation and execution 16843 work */ 16844 UV element_count = 0; /* Number of distinct elements in the class. 16845 Optimizations may be possible if this is tiny */ 16846 AV * multi_char_matches = NULL; /* Code points that fold to more than one 16847 character; used under /i */ 16848 UV n; 16849 char * stop_ptr = RExC_end; /* where to stop parsing */ 16850 16851 /* ignore unescaped whitespace? */ 16852 const bool skip_white = cBOOL( ret_invlist 16853 || (RExC_flags & RXf_PMf_EXTENDED_MORE)); 16854 16855 /* inversion list of code points this node matches only when the target 16856 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under 16857 * /d) */ 16858 SV* upper_latin1_only_utf8_matches = NULL; 16859 16860 /* Inversion list of code points this node matches regardless of things 16861 * like locale, folding, utf8ness of the target string */ 16862 SV* cp_list = NULL; 16863 16864 /* Like cp_list, but code points on this list need to be checked for things 16865 * that fold to/from them under /i */ 16866 SV* cp_foldable_list = NULL; 16867 16868 /* Like cp_list, but code points on this list are valid only when the 16869 * runtime locale is UTF-8 */ 16870 SV* only_utf8_locale_list = NULL; 16871 16872 /* In a range, if one of the endpoints is non-character-set portable, 16873 * meaning that it hard-codes a code point that may mean a different 16874 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a 16875 * mnemonic '\t' which each mean the same character no matter which 16876 * character set the platform is on. */ 16877 unsigned int non_portable_endpoint = 0; 16878 16879 /* Is the range unicode? which means on a platform that isn't 1-1 native 16880 * to Unicode (i.e. non-ASCII), each code point in it should be considered 16881 * to be a Unicode value. */ 16882 bool unicode_range = FALSE; 16883 bool invert = FALSE; /* Is this class to be complemented */ 16884 16885 bool warn_super = ALWAYS_WARN_SUPER; 16886 16887 const char * orig_parse = RExC_parse; 16888 16889 /* This variable is used to mark where the end in the input is of something 16890 * that looks like a POSIX construct but isn't. During the parse, when 16891 * something looks like it could be such a construct is encountered, it is 16892 * checked for being one, but not if we've already checked this area of the 16893 * input. Only after this position is reached do we check again */ 16894 char *not_posix_region_end = RExC_parse - 1; 16895 16896 AV* posix_warnings = NULL; 16897 const bool do_posix_warnings = ckWARN(WARN_REGEXP); 16898 U8 op = END; /* The returned node-type, initialized to an impossible 16899 one. */ 16900 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */ 16901 U32 posixl = 0; /* bit field of posix classes matched under /l */ 16902 16903 16904 /* Flags as to what things aren't knowable until runtime. (Note that these are 16905 * mutually exclusive.) */ 16906 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that 16907 haven't been defined as of yet */ 16908 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is 16909 UTF-8 or not */ 16910 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and 16911 what gets folded */ 16912 U32 has_runtime_dependency = 0; /* OR of the above flags */ 16913 16914 GET_RE_DEBUG_FLAGS_DECL; 16915 16916 PERL_ARGS_ASSERT_REGCLASS; 16917 #ifndef DEBUGGING 16918 PERL_UNUSED_ARG(depth); 16919 #endif 16920 16921 16922 /* If wants an inversion list returned, we can't optimize to something 16923 * else. */ 16924 if (ret_invlist) { 16925 optimizable = FALSE; 16926 } 16927 16928 DEBUG_PARSE("clas"); 16929 16930 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \ 16931 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \ 16932 && UNICODE_DOT_DOT_VERSION == 0) 16933 allow_mutiple_chars = FALSE; 16934 #endif 16935 16936 /* We include the /i status at the beginning of this so that we can 16937 * know it at runtime */ 16938 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD))); 16939 initial_listsv_len = SvCUR(listsv); 16940 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ 16941 16942 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); 16943 16944 assert(RExC_parse <= RExC_end); 16945 16946 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */ 16947 RExC_parse++; 16948 invert = TRUE; 16949 allow_mutiple_chars = FALSE; 16950 MARK_NAUGHTY(1); 16951 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); 16952 } 16953 16954 /* Check that they didn't say [:posix:] instead of [[:posix:]] */ 16955 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) { 16956 int maybe_class = handle_possible_posix(pRExC_state, 16957 RExC_parse, 16958 ¬_posix_region_end, 16959 NULL, 16960 TRUE /* checking only */); 16961 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) { 16962 ckWARN4reg(not_posix_region_end, 16963 "POSIX syntax [%c %c] belongs inside character classes%s", 16964 *RExC_parse, *RExC_parse, 16965 (maybe_class == OOB_NAMEDCLASS) 16966 ? ((POSIXCC_NOTYET(*RExC_parse)) 16967 ? " (but this one isn't implemented)" 16968 : " (but this one isn't fully valid)") 16969 : "" 16970 ); 16971 } 16972 } 16973 16974 /* If the caller wants us to just parse a single element, accomplish this 16975 * by faking the loop ending condition */ 16976 if (stop_at_1 && RExC_end > RExC_parse) { 16977 stop_ptr = RExC_parse + 1; 16978 } 16979 16980 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */ 16981 if (UCHARAT(RExC_parse) == ']') 16982 goto charclassloop; 16983 16984 while (1) { 16985 16986 if ( posix_warnings 16987 && av_tindex_skip_len_mg(posix_warnings) >= 0 16988 && RExC_parse > not_posix_region_end) 16989 { 16990 /* Warnings about posix class issues are considered tentative until 16991 * we are far enough along in the parse that we can no longer 16992 * change our mind, at which point we output them. This is done 16993 * each time through the loop so that a later class won't zap them 16994 * before they have been dealt with. */ 16995 output_posix_warnings(pRExC_state, posix_warnings); 16996 } 16997 16998 if (RExC_parse >= stop_ptr) { 16999 break; 17000 } 17001 17002 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); 17003 17004 if (UCHARAT(RExC_parse) == ']') { 17005 break; 17006 } 17007 17008 charclassloop: 17009 17010 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ 17011 save_value = value; 17012 save_prevvalue = prevvalue; 17013 17014 if (!range) { 17015 rangebegin = RExC_parse; 17016 element_count++; 17017 non_portable_endpoint = 0; 17018 } 17019 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) { 17020 value = utf8n_to_uvchr((U8*)RExC_parse, 17021 RExC_end - RExC_parse, 17022 &numlen, UTF8_ALLOW_DEFAULT); 17023 RExC_parse += numlen; 17024 } 17025 else 17026 value = UCHARAT(RExC_parse++); 17027 17028 if (value == '[') { 17029 char * posix_class_end; 17030 namedclass = handle_possible_posix(pRExC_state, 17031 RExC_parse, 17032 &posix_class_end, 17033 do_posix_warnings ? &posix_warnings : NULL, 17034 FALSE /* die if error */); 17035 if (namedclass > OOB_NAMEDCLASS) { 17036 17037 /* If there was an earlier attempt to parse this particular 17038 * posix class, and it failed, it was a false alarm, as this 17039 * successful one proves */ 17040 if ( posix_warnings 17041 && av_tindex_skip_len_mg(posix_warnings) >= 0 17042 && not_posix_region_end >= RExC_parse 17043 && not_posix_region_end <= posix_class_end) 17044 { 17045 av_undef(posix_warnings); 17046 } 17047 17048 RExC_parse = posix_class_end; 17049 } 17050 else if (namedclass == OOB_NAMEDCLASS) { 17051 not_posix_region_end = posix_class_end; 17052 } 17053 else { 17054 namedclass = OOB_NAMEDCLASS; 17055 } 17056 } 17057 else if ( RExC_parse - 1 > not_posix_region_end 17058 && MAYBE_POSIXCC(value)) 17059 { 17060 (void) handle_possible_posix( 17061 pRExC_state, 17062 RExC_parse - 1, /* -1 because parse has already been 17063 advanced */ 17064 ¬_posix_region_end, 17065 do_posix_warnings ? &posix_warnings : NULL, 17066 TRUE /* checking only */); 17067 } 17068 else if ( strict && ! skip_white 17069 && ( _generic_isCC(value, _CC_VERTSPACE) 17070 || is_VERTWS_cp_high(value))) 17071 { 17072 vFAIL("Literal vertical space in [] is illegal except under /x"); 17073 } 17074 else if (value == '\\') { 17075 /* Is a backslash; get the code point of the char after it */ 17076 17077 if (RExC_parse >= RExC_end) { 17078 vFAIL("Unmatched ["); 17079 } 17080 17081 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) { 17082 value = utf8n_to_uvchr((U8*)RExC_parse, 17083 RExC_end - RExC_parse, 17084 &numlen, UTF8_ALLOW_DEFAULT); 17085 RExC_parse += numlen; 17086 } 17087 else 17088 value = UCHARAT(RExC_parse++); 17089 17090 /* Some compilers cannot handle switching on 64-bit integer 17091 * values, therefore value cannot be an UV. Yes, this will 17092 * be a problem later if we want switch on Unicode. 17093 * A similar issue a little bit later when switching on 17094 * namedclass. --jhi */ 17095 17096 /* If the \ is escaping white space when white space is being 17097 * skipped, it means that that white space is wanted literally, and 17098 * is already in 'value'. Otherwise, need to translate the escape 17099 * into what it signifies. */ 17100 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) { 17101 17102 case 'w': namedclass = ANYOF_WORDCHAR; break; 17103 case 'W': namedclass = ANYOF_NWORDCHAR; break; 17104 case 's': namedclass = ANYOF_SPACE; break; 17105 case 'S': namedclass = ANYOF_NSPACE; break; 17106 case 'd': namedclass = ANYOF_DIGIT; break; 17107 case 'D': namedclass = ANYOF_NDIGIT; break; 17108 case 'v': namedclass = ANYOF_VERTWS; break; 17109 case 'V': namedclass = ANYOF_NVERTWS; break; 17110 case 'h': namedclass = ANYOF_HORIZWS; break; 17111 case 'H': namedclass = ANYOF_NHORIZWS; break; 17112 case 'N': /* Handle \N{NAME} in class */ 17113 { 17114 const char * const backslash_N_beg = RExC_parse - 2; 17115 int cp_count; 17116 17117 if (! grok_bslash_N(pRExC_state, 17118 NULL, /* No regnode */ 17119 &value, /* Yes single value */ 17120 &cp_count, /* Multiple code pt count */ 17121 flagp, 17122 strict, 17123 depth) 17124 ) { 17125 17126 if (*flagp & NEED_UTF8) 17127 FAIL("panic: grok_bslash_N set NEED_UTF8"); 17128 17129 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 17130 17131 if (cp_count < 0) { 17132 vFAIL("\\N in a character class must be a named character: \\N{...}"); 17133 } 17134 else if (cp_count == 0) { 17135 ckWARNreg(RExC_parse, 17136 "Ignoring zero length \\N{} in character class"); 17137 } 17138 else { /* cp_count > 1 */ 17139 assert(cp_count > 1); 17140 if (! RExC_in_multi_char_class) { 17141 if ( ! allow_mutiple_chars 17142 || invert 17143 || range 17144 || *RExC_parse == '-') 17145 { 17146 if (strict) { 17147 RExC_parse--; 17148 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character"); 17149 } 17150 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class"); 17151 break; /* <value> contains the first code 17152 point. Drop out of the switch to 17153 process it */ 17154 } 17155 else { 17156 SV * multi_char_N = newSVpvn(backslash_N_beg, 17157 RExC_parse - backslash_N_beg); 17158 multi_char_matches 17159 = add_multi_match(multi_char_matches, 17160 multi_char_N, 17161 cp_count); 17162 } 17163 } 17164 } /* End of cp_count != 1 */ 17165 17166 /* This element should not be processed further in this 17167 * class */ 17168 element_count--; 17169 value = save_value; 17170 prevvalue = save_prevvalue; 17171 continue; /* Back to top of loop to get next char */ 17172 } 17173 17174 /* Here, is a single code point, and <value> contains it */ 17175 unicode_range = TRUE; /* \N{} are Unicode */ 17176 } 17177 break; 17178 case 'p': 17179 case 'P': 17180 { 17181 char *e; 17182 17183 /* \p means they want Unicode semantics */ 17184 REQUIRE_UNI_RULES(flagp, 0); 17185 17186 if (RExC_parse >= RExC_end) 17187 vFAIL2("Empty \\%c", (U8)value); 17188 if (*RExC_parse == '{') { 17189 const U8 c = (U8)value; 17190 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse); 17191 if (!e) { 17192 RExC_parse++; 17193 vFAIL2("Missing right brace on \\%c{}", c); 17194 } 17195 17196 RExC_parse++; 17197 17198 /* White space is allowed adjacent to the braces and after 17199 * any '^', even when not under /x */ 17200 while (isSPACE(*RExC_parse)) { 17201 RExC_parse++; 17202 } 17203 17204 if (UCHARAT(RExC_parse) == '^') { 17205 17206 /* toggle. (The rhs xor gets the single bit that 17207 * differs between P and p; the other xor inverts just 17208 * that bit) */ 17209 value ^= 'P' ^ 'p'; 17210 17211 RExC_parse++; 17212 while (isSPACE(*RExC_parse)) { 17213 RExC_parse++; 17214 } 17215 } 17216 17217 if (e == RExC_parse) 17218 vFAIL2("Empty \\%c{}", c); 17219 17220 n = e - RExC_parse; 17221 while (isSPACE(*(RExC_parse + n - 1))) 17222 n--; 17223 17224 } /* The \p isn't immediately followed by a '{' */ 17225 else if (! isALPHA(*RExC_parse)) { 17226 RExC_parse += (UTF) 17227 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) 17228 : 1; 17229 vFAIL2("Character following \\%c must be '{' or a " 17230 "single-character Unicode property name", 17231 (U8) value); 17232 } 17233 else { 17234 e = RExC_parse; 17235 n = 1; 17236 } 17237 { 17238 char* name = RExC_parse; 17239 17240 /* Any message returned about expanding the definition */ 17241 SV* msg = newSVpvs_flags("", SVs_TEMP); 17242 17243 /* If set TRUE, the property is user-defined as opposed to 17244 * official Unicode */ 17245 bool user_defined = FALSE; 17246 17247 SV * prop_definition = parse_uniprop_string( 17248 name, n, UTF, FOLD, 17249 FALSE, /* This is compile-time */ 17250 17251 /* We can't defer this defn when 17252 * the full result is required in 17253 * this call */ 17254 ! cBOOL(ret_invlist), 17255 17256 &user_defined, 17257 msg, 17258 0 /* Base level */ 17259 ); 17260 if (SvCUR(msg)) { /* Assumes any error causes a msg */ 17261 assert(prop_definition == NULL); 17262 RExC_parse = e + 1; 17263 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole 17264 thing so, or else the display is 17265 mojibake */ 17266 RExC_utf8 = TRUE; 17267 } 17268 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */ 17269 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg), 17270 SvCUR(msg), SvPVX(msg))); 17271 } 17272 17273 if (! is_invlist(prop_definition)) { 17274 17275 /* Here, the definition isn't known, so we have gotten 17276 * returned a string that will be evaluated if and when 17277 * encountered at runtime. We add it to the list of 17278 * such properties, along with whether it should be 17279 * complemented or not */ 17280 if (value == 'P') { 17281 sv_catpvs(listsv, "!"); 17282 } 17283 else { 17284 sv_catpvs(listsv, "+"); 17285 } 17286 sv_catsv(listsv, prop_definition); 17287 17288 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY; 17289 17290 /* We don't know yet what this matches, so have to flag 17291 * it */ 17292 anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP; 17293 } 17294 else { 17295 assert (prop_definition && is_invlist(prop_definition)); 17296 17297 /* Here we do have the complete property definition 17298 * 17299 * Temporary workaround for [perl #133136]. For this 17300 * precise input that is in the .t that is failing, 17301 * load utf8.pm, which is what the test wants, so that 17302 * that .t passes */ 17303 if ( memEQs(RExC_start, e + 1 - RExC_start, 17304 "foo\\p{Alnum}") 17305 && ! hv_common(GvHVn(PL_incgv), 17306 NULL, 17307 "utf8.pm", sizeof("utf8.pm") - 1, 17308 0, HV_FETCH_ISEXISTS, NULL, 0)) 17309 { 17310 require_pv("utf8.pm"); 17311 } 17312 17313 if (! user_defined && 17314 /* We warn on matching an above-Unicode code point 17315 * if the match would return true, except don't 17316 * warn for \p{All}, which has exactly one element 17317 * = 0 */ 17318 (_invlist_contains_cp(prop_definition, 0x110000) 17319 && (! (_invlist_len(prop_definition) == 1 17320 && *invlist_array(prop_definition) == 0)))) 17321 { 17322 warn_super = TRUE; 17323 } 17324 17325 /* Invert if asking for the complement */ 17326 if (value == 'P') { 17327 _invlist_union_complement_2nd(properties, 17328 prop_definition, 17329 &properties); 17330 } 17331 else { 17332 _invlist_union(properties, prop_definition, &properties); 17333 } 17334 } 17335 } 17336 17337 RExC_parse = e + 1; 17338 namedclass = ANYOF_UNIPROP; /* no official name, but it's 17339 named */ 17340 } 17341 break; 17342 case 'n': value = '\n'; break; 17343 case 'r': value = '\r'; break; 17344 case 't': value = '\t'; break; 17345 case 'f': value = '\f'; break; 17346 case 'b': value = '\b'; break; 17347 case 'e': value = ESC_NATIVE; break; 17348 case 'a': value = '\a'; break; 17349 case 'o': 17350 RExC_parse--; /* function expects to be pointed at the 'o' */ 17351 { 17352 const char* error_msg; 17353 bool valid = grok_bslash_o(&RExC_parse, 17354 RExC_end, 17355 &value, 17356 &error_msg, 17357 TO_OUTPUT_WARNINGS(RExC_parse), 17358 strict, 17359 silence_non_portable, 17360 UTF); 17361 if (! valid) { 17362 vFAIL(error_msg); 17363 } 17364 UPDATE_WARNINGS_LOC(RExC_parse - 1); 17365 } 17366 non_portable_endpoint++; 17367 break; 17368 case 'x': 17369 RExC_parse--; /* function expects to be pointed at the 'x' */ 17370 { 17371 const char* error_msg; 17372 bool valid = grok_bslash_x(&RExC_parse, 17373 RExC_end, 17374 &value, 17375 &error_msg, 17376 TO_OUTPUT_WARNINGS(RExC_parse), 17377 strict, 17378 silence_non_portable, 17379 UTF); 17380 if (! valid) { 17381 vFAIL(error_msg); 17382 } 17383 UPDATE_WARNINGS_LOC(RExC_parse - 1); 17384 } 17385 non_portable_endpoint++; 17386 break; 17387 case 'c': 17388 value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse)); 17389 UPDATE_WARNINGS_LOC(RExC_parse); 17390 RExC_parse++; 17391 non_portable_endpoint++; 17392 break; 17393 case '0': case '1': case '2': case '3': case '4': 17394 case '5': case '6': case '7': 17395 { 17396 /* Take 1-3 octal digits */ 17397 I32 flags = PERL_SCAN_SILENT_ILLDIGIT; 17398 numlen = (strict) ? 4 : 3; 17399 value = grok_oct(--RExC_parse, &numlen, &flags, NULL); 17400 RExC_parse += numlen; 17401 if (numlen != 3) { 17402 if (strict) { 17403 RExC_parse += (UTF) 17404 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) 17405 : 1; 17406 vFAIL("Need exactly 3 octal digits"); 17407 } 17408 else if ( numlen < 3 /* like \08, \178 */ 17409 && RExC_parse < RExC_end 17410 && isDIGIT(*RExC_parse) 17411 && ckWARN(WARN_REGEXP)) 17412 { 17413 reg_warn_non_literal_string( 17414 RExC_parse + 1, 17415 form_short_octal_warning(RExC_parse, numlen)); 17416 } 17417 } 17418 non_portable_endpoint++; 17419 break; 17420 } 17421 default: 17422 /* Allow \_ to not give an error */ 17423 if (isWORDCHAR(value) && value != '_') { 17424 if (strict) { 17425 vFAIL2("Unrecognized escape \\%c in character class", 17426 (int)value); 17427 } 17428 else { 17429 ckWARN2reg(RExC_parse, 17430 "Unrecognized escape \\%c in character class passed through", 17431 (int)value); 17432 } 17433 } 17434 break; 17435 } /* End of switch on char following backslash */ 17436 } /* end of handling backslash escape sequences */ 17437 17438 /* Here, we have the current token in 'value' */ 17439 17440 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ 17441 U8 classnum; 17442 17443 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a 17444 * literal, as is the character that began the false range, i.e. 17445 * the 'a' in the examples */ 17446 if (range) { 17447 const int w = (RExC_parse >= rangebegin) 17448 ? RExC_parse - rangebegin 17449 : 0; 17450 if (strict) { 17451 vFAIL2utf8f( 17452 "False [] range \"%" UTF8f "\"", 17453 UTF8fARG(UTF, w, rangebegin)); 17454 } 17455 else { 17456 ckWARN2reg(RExC_parse, 17457 "False [] range \"%" UTF8f "\"", 17458 UTF8fARG(UTF, w, rangebegin)); 17459 cp_list = add_cp_to_invlist(cp_list, '-'); 17460 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, 17461 prevvalue); 17462 } 17463 17464 range = 0; /* this was not a true range */ 17465 element_count += 2; /* So counts for three values */ 17466 } 17467 17468 classnum = namedclass_to_classnum(namedclass); 17469 17470 if (LOC && namedclass < ANYOF_POSIXL_MAX 17471 #ifndef HAS_ISASCII 17472 && classnum != _CC_ASCII 17473 #endif 17474 ) { 17475 SV* scratch_list = NULL; 17476 17477 /* What the Posix classes (like \w, [:space:]) match isn't 17478 * generally knowable under locale until actual match time. A 17479 * special node is used for these which has extra space for a 17480 * bitmap, with a bit reserved for each named class that is to 17481 * be matched against. (This isn't needed for \p{} and 17482 * pseudo-classes, as they are not affected by locale, and 17483 * hence are dealt with separately.) However, if a named class 17484 * and its complement are both present, then it matches 17485 * everything, and there is no runtime dependency. Odd numbers 17486 * are the complements of the next lower number, so xor works. 17487 * (Note that something like [\w\D] should match everything, 17488 * because \d should be a proper subset of \w. But rather than 17489 * trust that the locale is well behaved, we leave this to 17490 * runtime to sort out) */ 17491 if (POSIXL_TEST(posixl, namedclass ^ 1)) { 17492 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX); 17493 POSIXL_ZERO(posixl); 17494 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY; 17495 anyof_flags &= ~ANYOF_MATCHES_POSIXL; 17496 continue; /* We could ignore the rest of the class, but 17497 best to parse it for any errors */ 17498 } 17499 else { /* Here, isn't the complement of any already parsed 17500 class */ 17501 POSIXL_SET(posixl, namedclass); 17502 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; 17503 anyof_flags |= ANYOF_MATCHES_POSIXL; 17504 17505 /* The above-Latin1 characters are not subject to locale 17506 * rules. Just add them to the unconditionally-matched 17507 * list */ 17508 17509 /* Get the list of the above-Latin1 code points this 17510 * matches */ 17511 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, 17512 PL_XPosix_ptrs[classnum], 17513 17514 /* Odd numbers are complements, 17515 * like NDIGIT, NASCII, ... */ 17516 namedclass % 2 != 0, 17517 &scratch_list); 17518 /* Checking if 'cp_list' is NULL first saves an extra 17519 * clone. Its reference count will be decremented at the 17520 * next union, etc, or if this is the only instance, at the 17521 * end of the routine */ 17522 if (! cp_list) { 17523 cp_list = scratch_list; 17524 } 17525 else { 17526 _invlist_union(cp_list, scratch_list, &cp_list); 17527 SvREFCNT_dec_NN(scratch_list); 17528 } 17529 continue; /* Go get next character */ 17530 } 17531 } 17532 else { 17533 17534 /* Here, is not /l, or is a POSIX class for which /l doesn't 17535 * matter (or is a Unicode property, which is skipped here). */ 17536 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ 17537 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ 17538 17539 /* Here, should be \h, \H, \v, or \V. None of /d, /i 17540 * nor /l make a difference in what these match, 17541 * therefore we just add what they match to cp_list. */ 17542 if (classnum != _CC_VERTSPACE) { 17543 assert( namedclass == ANYOF_HORIZWS 17544 || namedclass == ANYOF_NHORIZWS); 17545 17546 /* It turns out that \h is just a synonym for 17547 * XPosixBlank */ 17548 classnum = _CC_BLANK; 17549 } 17550 17551 _invlist_union_maybe_complement_2nd( 17552 cp_list, 17553 PL_XPosix_ptrs[classnum], 17554 namedclass % 2 != 0, /* Complement if odd 17555 (NHORIZWS, NVERTWS) 17556 */ 17557 &cp_list); 17558 } 17559 } 17560 else if ( AT_LEAST_UNI_SEMANTICS 17561 || classnum == _CC_ASCII 17562 || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT 17563 || classnum == _CC_XDIGIT))) 17564 { 17565 /* We usually have to worry about /d affecting what POSIX 17566 * classes match, with special code needed because we won't 17567 * know until runtime what all matches. But there is no 17568 * extra work needed under /u and /a; and [:ascii:] is 17569 * unaffected by /d; and :digit: and :xdigit: don't have 17570 * runtime differences under /d. So we can special case 17571 * these, and avoid some extra work below, and at runtime. 17572 * */ 17573 _invlist_union_maybe_complement_2nd( 17574 simple_posixes, 17575 ((AT_LEAST_ASCII_RESTRICTED) 17576 ? PL_Posix_ptrs[classnum] 17577 : PL_XPosix_ptrs[classnum]), 17578 namedclass % 2 != 0, 17579 &simple_posixes); 17580 } 17581 else { /* Garden variety class. If is NUPPER, NALPHA, ... 17582 complement and use nposixes */ 17583 SV** posixes_ptr = namedclass % 2 == 0 17584 ? &posixes 17585 : &nposixes; 17586 _invlist_union_maybe_complement_2nd( 17587 *posixes_ptr, 17588 PL_XPosix_ptrs[classnum], 17589 namedclass % 2 != 0, 17590 posixes_ptr); 17591 } 17592 } 17593 } /* end of namedclass \blah */ 17594 17595 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); 17596 17597 /* If 'range' is set, 'value' is the ending of a range--check its 17598 * validity. (If value isn't a single code point in the case of a 17599 * range, we should have figured that out above in the code that 17600 * catches false ranges). Later, we will handle each individual code 17601 * point in the range. If 'range' isn't set, this could be the 17602 * beginning of a range, so check for that by looking ahead to see if 17603 * the next real character to be processed is the range indicator--the 17604 * minus sign */ 17605 17606 if (range) { 17607 #ifdef EBCDIC 17608 /* For unicode ranges, we have to test that the Unicode as opposed 17609 * to the native values are not decreasing. (Above 255, there is 17610 * no difference between native and Unicode) */ 17611 if (unicode_range && prevvalue < 255 && value < 255) { 17612 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) { 17613 goto backwards_range; 17614 } 17615 } 17616 else 17617 #endif 17618 if (prevvalue > value) /* b-a */ { 17619 int w; 17620 #ifdef EBCDIC 17621 backwards_range: 17622 #endif 17623 w = RExC_parse - rangebegin; 17624 vFAIL2utf8f( 17625 "Invalid [] range \"%" UTF8f "\"", 17626 UTF8fARG(UTF, w, rangebegin)); 17627 NOT_REACHED; /* NOTREACHED */ 17628 } 17629 } 17630 else { 17631 prevvalue = value; /* save the beginning of the potential range */ 17632 if (! stop_at_1 /* Can't be a range if parsing just one thing */ 17633 && *RExC_parse == '-') 17634 { 17635 char* next_char_ptr = RExC_parse + 1; 17636 17637 /* Get the next real char after the '-' */ 17638 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr); 17639 17640 /* If the '-' is at the end of the class (just before the ']', 17641 * it is a literal minus; otherwise it is a range */ 17642 if (next_char_ptr < RExC_end && *next_char_ptr != ']') { 17643 RExC_parse = next_char_ptr; 17644 17645 /* a bad range like \w-, [:word:]- ? */ 17646 if (namedclass > OOB_NAMEDCLASS) { 17647 if (strict || ckWARN(WARN_REGEXP)) { 17648 const int w = RExC_parse >= rangebegin 17649 ? RExC_parse - rangebegin 17650 : 0; 17651 if (strict) { 17652 vFAIL4("False [] range \"%*.*s\"", 17653 w, w, rangebegin); 17654 } 17655 else { 17656 vWARN4(RExC_parse, 17657 "False [] range \"%*.*s\"", 17658 w, w, rangebegin); 17659 } 17660 } 17661 cp_list = add_cp_to_invlist(cp_list, '-'); 17662 element_count++; 17663 } else 17664 range = 1; /* yeah, it's a range! */ 17665 continue; /* but do it the next time */ 17666 } 17667 } 17668 } 17669 17670 if (namedclass > OOB_NAMEDCLASS) { 17671 continue; 17672 } 17673 17674 /* Here, we have a single value this time through the loop, and 17675 * <prevvalue> is the beginning of the range, if any; or <value> if 17676 * not. */ 17677 17678 /* non-Latin1 code point implies unicode semantics. */ 17679 if (value > 255) { 17680 REQUIRE_UNI_RULES(flagp, 0); 17681 } 17682 17683 /* Ready to process either the single value, or the completed range. 17684 * For single-valued non-inverted ranges, we consider the possibility 17685 * of multi-char folds. (We made a conscious decision to not do this 17686 * for the other cases because it can often lead to non-intuitive 17687 * results. For example, you have the peculiar case that: 17688 * "s s" =~ /^[^\xDF]+$/i => Y 17689 * "ss" =~ /^[^\xDF]+$/i => N 17690 * 17691 * See [perl #89750] */ 17692 if (FOLD && allow_mutiple_chars && value == prevvalue) { 17693 if ( value == LATIN_SMALL_LETTER_SHARP_S 17694 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, 17695 value))) 17696 { 17697 /* Here <value> is indeed a multi-char fold. Get what it is */ 17698 17699 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; 17700 STRLEN foldlen; 17701 17702 UV folded = _to_uni_fold_flags( 17703 value, 17704 foldbuf, 17705 &foldlen, 17706 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED 17707 ? FOLD_FLAGS_NOMIX_ASCII 17708 : 0) 17709 ); 17710 17711 /* Here, <folded> should be the first character of the 17712 * multi-char fold of <value>, with <foldbuf> containing the 17713 * whole thing. But, if this fold is not allowed (because of 17714 * the flags), <fold> will be the same as <value>, and should 17715 * be processed like any other character, so skip the special 17716 * handling */ 17717 if (folded != value) { 17718 17719 /* Skip if we are recursed, currently parsing the class 17720 * again. Otherwise add this character to the list of 17721 * multi-char folds. */ 17722 if (! RExC_in_multi_char_class) { 17723 STRLEN cp_count = utf8_length(foldbuf, 17724 foldbuf + foldlen); 17725 SV* multi_fold = sv_2mortal(newSVpvs("")); 17726 17727 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value); 17728 17729 multi_char_matches 17730 = add_multi_match(multi_char_matches, 17731 multi_fold, 17732 cp_count); 17733 17734 } 17735 17736 /* This element should not be processed further in this 17737 * class */ 17738 element_count--; 17739 value = save_value; 17740 prevvalue = save_prevvalue; 17741 continue; 17742 } 17743 } 17744 } 17745 17746 if (strict && ckWARN(WARN_REGEXP)) { 17747 if (range) { 17748 17749 /* If the range starts above 255, everything is portable and 17750 * likely to be so for any forseeable character set, so don't 17751 * warn. */ 17752 if (unicode_range && non_portable_endpoint && prevvalue < 256) { 17753 vWARN(RExC_parse, "Both or neither range ends should be Unicode"); 17754 } 17755 else if (prevvalue != value) { 17756 17757 /* Under strict, ranges that stop and/or end in an ASCII 17758 * printable should have each end point be a portable value 17759 * for it (preferably like 'A', but we don't warn if it is 17760 * a (portable) Unicode name or code point), and the range 17761 * must be be all digits or all letters of the same case. 17762 * Otherwise, the range is non-portable and unclear as to 17763 * what it contains */ 17764 if ( (isPRINT_A(prevvalue) || isPRINT_A(value)) 17765 && ( non_portable_endpoint 17766 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value)) 17767 || (isLOWER_A(prevvalue) && isLOWER_A(value)) 17768 || (isUPPER_A(prevvalue) && isUPPER_A(value)) 17769 ))) { 17770 vWARN(RExC_parse, "Ranges of ASCII printables should" 17771 " be some subset of \"0-9\"," 17772 " \"A-Z\", or \"a-z\""); 17773 } 17774 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) { 17775 SSize_t index_start; 17776 SSize_t index_final; 17777 17778 /* But the nature of Unicode and languages mean we 17779 * can't do the same checks for above-ASCII ranges, 17780 * except in the case of digit ones. These should 17781 * contain only digits from the same group of 10. The 17782 * ASCII case is handled just above. Hence here, the 17783 * range could be a range of digits. First some 17784 * unlikely special cases. Grandfather in that a range 17785 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad 17786 * if its starting value is one of the 10 digits prior 17787 * to it. This is because it is an alternate way of 17788 * writing 19D1, and some people may expect it to be in 17789 * that group. But it is bad, because it won't give 17790 * the expected results. In Unicode 5.2 it was 17791 * considered to be in that group (of 11, hence), but 17792 * this was fixed in the next version */ 17793 17794 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) { 17795 goto warn_bad_digit_range; 17796 } 17797 else if (UNLIKELY( prevvalue >= 0x1D7CE 17798 && value <= 0x1D7FF)) 17799 { 17800 /* This is the only other case currently in Unicode 17801 * where the algorithm below fails. The code 17802 * points just above are the end points of a single 17803 * range containing only decimal digits. It is 5 17804 * different series of 0-9. All other ranges of 17805 * digits currently in Unicode are just a single 17806 * series. (And mktables will notify us if a later 17807 * Unicode version breaks this.) 17808 * 17809 * If the range being checked is at most 9 long, 17810 * and the digit values represented are in 17811 * numerical order, they are from the same series. 17812 * */ 17813 if ( value - prevvalue > 9 17814 || ((( value - 0x1D7CE) % 10) 17815 <= (prevvalue - 0x1D7CE) % 10)) 17816 { 17817 goto warn_bad_digit_range; 17818 } 17819 } 17820 else { 17821 17822 /* For all other ranges of digits in Unicode, the 17823 * algorithm is just to check if both end points 17824 * are in the same series, which is the same range. 17825 * */ 17826 index_start = _invlist_search( 17827 PL_XPosix_ptrs[_CC_DIGIT], 17828 prevvalue); 17829 17830 /* Warn if the range starts and ends with a digit, 17831 * and they are not in the same group of 10. */ 17832 if ( index_start >= 0 17833 && ELEMENT_RANGE_MATCHES_INVLIST(index_start) 17834 && (index_final = 17835 _invlist_search(PL_XPosix_ptrs[_CC_DIGIT], 17836 value)) != index_start 17837 && index_final >= 0 17838 && ELEMENT_RANGE_MATCHES_INVLIST(index_final)) 17839 { 17840 warn_bad_digit_range: 17841 vWARN(RExC_parse, "Ranges of digits should be" 17842 " from the same group of" 17843 " 10"); 17844 } 17845 } 17846 } 17847 } 17848 } 17849 if ((! range || prevvalue == value) && non_portable_endpoint) { 17850 if (isPRINT_A(value)) { 17851 char literal[3]; 17852 unsigned d = 0; 17853 if (isBACKSLASHED_PUNCT(value)) { 17854 literal[d++] = '\\'; 17855 } 17856 literal[d++] = (char) value; 17857 literal[d++] = '\0'; 17858 17859 vWARN4(RExC_parse, 17860 "\"%.*s\" is more clearly written simply as \"%s\"", 17861 (int) (RExC_parse - rangebegin), 17862 rangebegin, 17863 literal 17864 ); 17865 } 17866 else if isMNEMONIC_CNTRL(value) { 17867 vWARN4(RExC_parse, 17868 "\"%.*s\" is more clearly written simply as \"%s\"", 17869 (int) (RExC_parse - rangebegin), 17870 rangebegin, 17871 cntrl_to_mnemonic((U8) value) 17872 ); 17873 } 17874 } 17875 } 17876 17877 /* Deal with this element of the class */ 17878 17879 #ifndef EBCDIC 17880 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 17881 prevvalue, value); 17882 #else 17883 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones 17884 * that don't require special handling, we can just add the range like 17885 * we do for ASCII platforms */ 17886 if ((UNLIKELY(prevvalue == 0) && value >= 255) 17887 || ! (prevvalue < 256 17888 && (unicode_range 17889 || (! non_portable_endpoint 17890 && ((isLOWER_A(prevvalue) && isLOWER_A(value)) 17891 || (isUPPER_A(prevvalue) 17892 && isUPPER_A(value))))))) 17893 { 17894 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 17895 prevvalue, value); 17896 } 17897 else { 17898 /* Here, requires special handling. This can be because it is a 17899 * range whose code points are considered to be Unicode, and so 17900 * must be individually translated into native, or because its a 17901 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in 17902 * EBCDIC, but we have defined them to include only the "expected" 17903 * upper or lower case ASCII alphabetics. Subranges above 255 are 17904 * the same in native and Unicode, so can be added as a range */ 17905 U8 start = NATIVE_TO_LATIN1(prevvalue); 17906 unsigned j; 17907 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255; 17908 for (j = start; j <= end; j++) { 17909 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j)); 17910 } 17911 if (value > 255) { 17912 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 17913 256, value); 17914 } 17915 } 17916 #endif 17917 17918 range = 0; /* this range (if it was one) is done now */ 17919 } /* End of loop through all the text within the brackets */ 17920 17921 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) { 17922 output_posix_warnings(pRExC_state, posix_warnings); 17923 } 17924 17925 /* If anything in the class expands to more than one character, we have to 17926 * deal with them by building up a substitute parse string, and recursively 17927 * calling reg() on it, instead of proceeding */ 17928 if (multi_char_matches) { 17929 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP); 17930 I32 cp_count; 17931 STRLEN len; 17932 char *save_end = RExC_end; 17933 char *save_parse = RExC_parse; 17934 char *save_start = RExC_start; 17935 Size_t constructed_prefix_len = 0; /* This gives the length of the 17936 constructed portion of the 17937 substitute parse. */ 17938 bool first_time = TRUE; /* First multi-char occurrence doesn't get 17939 a "|" */ 17940 I32 reg_flags; 17941 17942 assert(! invert); 17943 /* Only one level of recursion allowed */ 17944 assert(RExC_copy_start_in_constructed == RExC_precomp); 17945 17946 #if 0 /* Have decided not to deal with multi-char folds in inverted classes, 17947 because too confusing */ 17948 if (invert) { 17949 sv_catpvs(substitute_parse, "(?:"); 17950 } 17951 #endif 17952 17953 /* Look at the longest folds first */ 17954 for (cp_count = av_tindex_skip_len_mg(multi_char_matches); 17955 cp_count > 0; 17956 cp_count--) 17957 { 17958 17959 if (av_exists(multi_char_matches, cp_count)) { 17960 AV** this_array_ptr; 17961 SV* this_sequence; 17962 17963 this_array_ptr = (AV**) av_fetch(multi_char_matches, 17964 cp_count, FALSE); 17965 while ((this_sequence = av_pop(*this_array_ptr)) != 17966 &PL_sv_undef) 17967 { 17968 if (! first_time) { 17969 sv_catpvs(substitute_parse, "|"); 17970 } 17971 first_time = FALSE; 17972 17973 sv_catpv(substitute_parse, SvPVX(this_sequence)); 17974 } 17975 } 17976 } 17977 17978 /* If the character class contains anything else besides these 17979 * multi-character folds, have to include it in recursive parsing */ 17980 if (element_count) { 17981 sv_catpvs(substitute_parse, "|["); 17982 constructed_prefix_len = SvCUR(substitute_parse); 17983 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); 17984 17985 /* Put in a closing ']' only if not going off the end, as otherwise 17986 * we are adding something that really isn't there */ 17987 if (RExC_parse < RExC_end) { 17988 sv_catpvs(substitute_parse, "]"); 17989 } 17990 } 17991 17992 sv_catpvs(substitute_parse, ")"); 17993 #if 0 17994 if (invert) { 17995 /* This is a way to get the parse to skip forward a whole named 17996 * sequence instead of matching the 2nd character when it fails the 17997 * first */ 17998 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)"); 17999 } 18000 #endif 18001 18002 /* Set up the data structure so that any errors will be properly 18003 * reported. See the comments at the definition of 18004 * REPORT_LOCATION_ARGS for details */ 18005 RExC_copy_start_in_input = (char *) orig_parse; 18006 RExC_start = RExC_parse = SvPV(substitute_parse, len); 18007 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len; 18008 RExC_end = RExC_parse + len; 18009 RExC_in_multi_char_class = 1; 18010 18011 ret = reg(pRExC_state, 1, ®_flags, depth+1); 18012 18013 *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8); 18014 18015 /* And restore so can parse the rest of the pattern */ 18016 RExC_parse = save_parse; 18017 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start; 18018 RExC_end = save_end; 18019 RExC_in_multi_char_class = 0; 18020 SvREFCNT_dec_NN(multi_char_matches); 18021 return ret; 18022 } 18023 18024 /* If folding, we calculate all characters that could fold to or from the 18025 * ones already on the list */ 18026 if (cp_foldable_list) { 18027 if (FOLD) { 18028 UV start, end; /* End points of code point ranges */ 18029 18030 SV* fold_intersection = NULL; 18031 SV** use_list; 18032 18033 /* Our calculated list will be for Unicode rules. For locale 18034 * matching, we have to keep a separate list that is consulted at 18035 * runtime only when the locale indicates Unicode rules (and we 18036 * don't include potential matches in the ASCII/Latin1 range, as 18037 * any code point could fold to any other, based on the run-time 18038 * locale). For non-locale, we just use the general list */ 18039 if (LOC) { 18040 use_list = &only_utf8_locale_list; 18041 } 18042 else { 18043 use_list = &cp_list; 18044 } 18045 18046 /* Only the characters in this class that participate in folds need 18047 * be checked. Get the intersection of this class and all the 18048 * possible characters that are foldable. This can quickly narrow 18049 * down a large class */ 18050 _invlist_intersection(PL_in_some_fold, cp_foldable_list, 18051 &fold_intersection); 18052 18053 /* Now look at the foldable characters in this class individually */ 18054 invlist_iterinit(fold_intersection); 18055 while (invlist_iternext(fold_intersection, &start, &end)) { 18056 UV j; 18057 UV folded; 18058 18059 /* Look at every character in the range */ 18060 for (j = start; j <= end; j++) { 18061 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; 18062 STRLEN foldlen; 18063 unsigned int k; 18064 Size_t folds_count; 18065 unsigned int first_fold; 18066 const unsigned int * remaining_folds; 18067 18068 if (j < 256) { 18069 18070 /* Under /l, we don't know what code points below 256 18071 * fold to, except we do know the MICRO SIGN folds to 18072 * an above-255 character if the locale is UTF-8, so we 18073 * add it to the special list (in *use_list) Otherwise 18074 * we know now what things can match, though some folds 18075 * are valid under /d only if the target is UTF-8. 18076 * Those go in a separate list */ 18077 if ( IS_IN_SOME_FOLD_L1(j) 18078 && ! (LOC && j != MICRO_SIGN)) 18079 { 18080 18081 /* ASCII is always matched; non-ASCII is matched 18082 * only under Unicode rules (which could happen 18083 * under /l if the locale is a UTF-8 one */ 18084 if (isASCII(j) || ! DEPENDS_SEMANTICS) { 18085 *use_list = add_cp_to_invlist(*use_list, 18086 PL_fold_latin1[j]); 18087 } 18088 else if (j != PL_fold_latin1[j]) { 18089 upper_latin1_only_utf8_matches 18090 = add_cp_to_invlist( 18091 upper_latin1_only_utf8_matches, 18092 PL_fold_latin1[j]); 18093 } 18094 } 18095 18096 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) 18097 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) 18098 { 18099 add_above_Latin1_folds(pRExC_state, 18100 (U8) j, 18101 use_list); 18102 } 18103 continue; 18104 } 18105 18106 /* Here is an above Latin1 character. We don't have the 18107 * rules hard-coded for it. First, get its fold. This is 18108 * the simple fold, as the multi-character folds have been 18109 * handled earlier and separated out */ 18110 folded = _to_uni_fold_flags(j, foldbuf, &foldlen, 18111 (ASCII_FOLD_RESTRICTED) 18112 ? FOLD_FLAGS_NOMIX_ASCII 18113 : 0); 18114 18115 /* Single character fold of above Latin1. Add everything 18116 * in its fold closure to the list that this node should 18117 * match. */ 18118 folds_count = _inverse_folds(folded, &first_fold, 18119 &remaining_folds); 18120 for (k = 0; k <= folds_count; k++) { 18121 UV c = (k == 0) /* First time through use itself */ 18122 ? folded 18123 : (k == 1) /* 2nd time use, the first fold */ 18124 ? first_fold 18125 18126 /* Then the remaining ones */ 18127 : remaining_folds[k-2]; 18128 18129 /* /aa doesn't allow folds between ASCII and non- */ 18130 if (( ASCII_FOLD_RESTRICTED 18131 && (isASCII(c) != isASCII(j)))) 18132 { 18133 continue; 18134 } 18135 18136 /* Folds under /l which cross the 255/256 boundary are 18137 * added to a separate list. (These are valid only 18138 * when the locale is UTF-8.) */ 18139 if (c < 256 && LOC) { 18140 *use_list = add_cp_to_invlist(*use_list, c); 18141 continue; 18142 } 18143 18144 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) 18145 { 18146 cp_list = add_cp_to_invlist(cp_list, c); 18147 } 18148 else { 18149 /* Similarly folds involving non-ascii Latin1 18150 * characters under /d are added to their list */ 18151 upper_latin1_only_utf8_matches 18152 = add_cp_to_invlist( 18153 upper_latin1_only_utf8_matches, 18154 c); 18155 } 18156 } 18157 } 18158 } 18159 SvREFCNT_dec_NN(fold_intersection); 18160 } 18161 18162 /* Now that we have finished adding all the folds, there is no reason 18163 * to keep the foldable list separate */ 18164 _invlist_union(cp_list, cp_foldable_list, &cp_list); 18165 SvREFCNT_dec_NN(cp_foldable_list); 18166 } 18167 18168 /* And combine the result (if any) with any inversion lists from posix 18169 * classes. The lists are kept separate up to now because we don't want to 18170 * fold the classes */ 18171 if (simple_posixes) { /* These are the classes known to be unaffected by 18172 /a, /aa, and /d */ 18173 if (cp_list) { 18174 _invlist_union(cp_list, simple_posixes, &cp_list); 18175 SvREFCNT_dec_NN(simple_posixes); 18176 } 18177 else { 18178 cp_list = simple_posixes; 18179 } 18180 } 18181 if (posixes || nposixes) { 18182 if (! DEPENDS_SEMANTICS) { 18183 18184 /* For everything but /d, we can just add the current 'posixes' and 18185 * 'nposixes' to the main list */ 18186 if (posixes) { 18187 if (cp_list) { 18188 _invlist_union(cp_list, posixes, &cp_list); 18189 SvREFCNT_dec_NN(posixes); 18190 } 18191 else { 18192 cp_list = posixes; 18193 } 18194 } 18195 if (nposixes) { 18196 if (cp_list) { 18197 _invlist_union(cp_list, nposixes, &cp_list); 18198 SvREFCNT_dec_NN(nposixes); 18199 } 18200 else { 18201 cp_list = nposixes; 18202 } 18203 } 18204 } 18205 else { 18206 /* Under /d, things like \w match upper Latin1 characters only if 18207 * the target string is in UTF-8. But things like \W match all the 18208 * upper Latin1 characters if the target string is not in UTF-8. 18209 * 18210 * Handle the case with something like \W separately */ 18211 if (nposixes) { 18212 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL); 18213 18214 /* A complemented posix class matches all upper Latin1 18215 * characters if not in UTF-8. And it matches just certain 18216 * ones when in UTF-8. That means those certain ones are 18217 * matched regardless, so can just be added to the 18218 * unconditional list */ 18219 if (cp_list) { 18220 _invlist_union(cp_list, nposixes, &cp_list); 18221 SvREFCNT_dec_NN(nposixes); 18222 nposixes = NULL; 18223 } 18224 else { 18225 cp_list = nposixes; 18226 } 18227 18228 /* Likewise for 'posixes' */ 18229 _invlist_union(posixes, cp_list, &cp_list); 18230 SvREFCNT_dec(posixes); 18231 18232 /* Likewise for anything else in the range that matched only 18233 * under UTF-8 */ 18234 if (upper_latin1_only_utf8_matches) { 18235 _invlist_union(cp_list, 18236 upper_latin1_only_utf8_matches, 18237 &cp_list); 18238 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches); 18239 upper_latin1_only_utf8_matches = NULL; 18240 } 18241 18242 /* If we don't match all the upper Latin1 characters regardless 18243 * of UTF-8ness, we have to set a flag to match the rest when 18244 * not in UTF-8 */ 18245 _invlist_subtract(only_non_utf8_list, cp_list, 18246 &only_non_utf8_list); 18247 if (_invlist_len(only_non_utf8_list) != 0) { 18248 anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; 18249 } 18250 SvREFCNT_dec_NN(only_non_utf8_list); 18251 } 18252 else { 18253 /* Here there were no complemented posix classes. That means 18254 * the upper Latin1 characters in 'posixes' match only when the 18255 * target string is in UTF-8. So we have to add them to the 18256 * list of those types of code points, while adding the 18257 * remainder to the unconditional list. 18258 * 18259 * First calculate what they are */ 18260 SV* nonascii_but_latin1_properties = NULL; 18261 _invlist_intersection(posixes, PL_UpperLatin1, 18262 &nonascii_but_latin1_properties); 18263 18264 /* And add them to the final list of such characters. */ 18265 _invlist_union(upper_latin1_only_utf8_matches, 18266 nonascii_but_latin1_properties, 18267 &upper_latin1_only_utf8_matches); 18268 18269 /* Remove them from what now becomes the unconditional list */ 18270 _invlist_subtract(posixes, nonascii_but_latin1_properties, 18271 &posixes); 18272 18273 /* And add those unconditional ones to the final list */ 18274 if (cp_list) { 18275 _invlist_union(cp_list, posixes, &cp_list); 18276 SvREFCNT_dec_NN(posixes); 18277 posixes = NULL; 18278 } 18279 else { 18280 cp_list = posixes; 18281 } 18282 18283 SvREFCNT_dec(nonascii_but_latin1_properties); 18284 18285 /* Get rid of any characters from the conditional list that we 18286 * now know are matched unconditionally, which may make that 18287 * list empty */ 18288 _invlist_subtract(upper_latin1_only_utf8_matches, 18289 cp_list, 18290 &upper_latin1_only_utf8_matches); 18291 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) { 18292 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches); 18293 upper_latin1_only_utf8_matches = NULL; 18294 } 18295 } 18296 } 18297 } 18298 18299 /* And combine the result (if any) with any inversion list from properties. 18300 * The lists are kept separate up to now so that we can distinguish the two 18301 * in regards to matching above-Unicode. A run-time warning is generated 18302 * if a Unicode property is matched against a non-Unicode code point. But, 18303 * we allow user-defined properties to match anything, without any warning, 18304 * and we also suppress the warning if there is a portion of the character 18305 * class that isn't a Unicode property, and which matches above Unicode, \W 18306 * or [\x{110000}] for example. 18307 * (Note that in this case, unlike the Posix one above, there is no 18308 * <upper_latin1_only_utf8_matches>, because having a Unicode property 18309 * forces Unicode semantics */ 18310 if (properties) { 18311 if (cp_list) { 18312 18313 /* If it matters to the final outcome, see if a non-property 18314 * component of the class matches above Unicode. If so, the 18315 * warning gets suppressed. This is true even if just a single 18316 * such code point is specified, as, though not strictly correct if 18317 * another such code point is matched against, the fact that they 18318 * are using above-Unicode code points indicates they should know 18319 * the issues involved */ 18320 if (warn_super) { 18321 warn_super = ! (invert 18322 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); 18323 } 18324 18325 _invlist_union(properties, cp_list, &cp_list); 18326 SvREFCNT_dec_NN(properties); 18327 } 18328 else { 18329 cp_list = properties; 18330 } 18331 18332 if (warn_super) { 18333 anyof_flags 18334 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; 18335 18336 /* Because an ANYOF node is the only one that warns, this node 18337 * can't be optimized into something else */ 18338 optimizable = FALSE; 18339 } 18340 } 18341 18342 /* Here, we have calculated what code points should be in the character 18343 * class. 18344 * 18345 * Now we can see about various optimizations. Fold calculation (which we 18346 * did above) needs to take place before inversion. Otherwise /[^k]/i 18347 * would invert to include K, which under /i would match k, which it 18348 * shouldn't. Therefore we can't invert folded locale now, as it won't be 18349 * folded until runtime */ 18350 18351 /* If we didn't do folding, it's because some information isn't available 18352 * until runtime; set the run-time fold flag for these We know to set the 18353 * flag if we have a non-NULL list for UTF-8 locales, or the class matches 18354 * at least one 0-255 range code point */ 18355 if (LOC && FOLD) { 18356 18357 /* Some things on the list might be unconditionally included because of 18358 * other components. Remove them, and clean up the list if it goes to 18359 * 0 elements */ 18360 if (only_utf8_locale_list && cp_list) { 18361 _invlist_subtract(only_utf8_locale_list, cp_list, 18362 &only_utf8_locale_list); 18363 18364 if (_invlist_len(only_utf8_locale_list) == 0) { 18365 SvREFCNT_dec_NN(only_utf8_locale_list); 18366 only_utf8_locale_list = NULL; 18367 } 18368 } 18369 if ( only_utf8_locale_list 18370 || (cp_list && ( _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) 18371 || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I)))) 18372 { 18373 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; 18374 anyof_flags 18375 |= ANYOFL_FOLD 18376 | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; 18377 } 18378 else if (cp_list) { /* Look to see if a 0-255 code point is in list */ 18379 UV start, end; 18380 invlist_iterinit(cp_list); 18381 if (invlist_iternext(cp_list, &start, &end) && start < 256) { 18382 anyof_flags |= ANYOFL_FOLD; 18383 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; 18384 } 18385 invlist_iterfinish(cp_list); 18386 } 18387 } 18388 else if ( DEPENDS_SEMANTICS 18389 && ( upper_latin1_only_utf8_matches 18390 || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))) 18391 { 18392 RExC_seen_d_op = TRUE; 18393 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY; 18394 } 18395 18396 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at 18397 * compile time. */ 18398 if ( cp_list 18399 && invert 18400 && ! has_runtime_dependency) 18401 { 18402 _invlist_invert(cp_list); 18403 18404 /* Clear the invert flag since have just done it here */ 18405 invert = FALSE; 18406 } 18407 18408 if (ret_invlist) { 18409 *ret_invlist = cp_list; 18410 18411 return RExC_emit; 18412 } 18413 18414 /* All possible optimizations below still have these characteristics. 18415 * (Multi-char folds aren't SIMPLE, but they don't get this far in this 18416 * routine) */ 18417 *flagp |= HASWIDTH|SIMPLE; 18418 18419 if (anyof_flags & ANYOF_LOCALE_FLAGS) { 18420 RExC_contains_locale = 1; 18421 } 18422 18423 /* Some character classes are equivalent to other nodes. Such nodes take 18424 * up less room, and some nodes require fewer operations to execute, than 18425 * ANYOF nodes. EXACTish nodes may be joinable with adjacent nodes to 18426 * improve efficiency. */ 18427 18428 if (optimizable) { 18429 PERL_UINT_FAST8_T i; 18430 Size_t partial_cp_count = 0; 18431 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */ 18432 UV end[MAX_FOLD_FROMS+1] = { 0 }; 18433 18434 if (cp_list) { /* Count the code points in enough ranges that we would 18435 see all the ones possible in any fold in this version 18436 of Unicode */ 18437 18438 invlist_iterinit(cp_list); 18439 for (i = 0; i <= MAX_FOLD_FROMS; i++) { 18440 if (! invlist_iternext(cp_list, &start[i], &end[i])) { 18441 break; 18442 } 18443 partial_cp_count += end[i] - start[i] + 1; 18444 } 18445 18446 invlist_iterfinish(cp_list); 18447 } 18448 18449 /* If we know at compile time that this matches every possible code 18450 * point, any run-time dependencies don't matter */ 18451 if (start[0] == 0 && end[0] == UV_MAX) { 18452 if (invert) { 18453 ret = reganode(pRExC_state, OPFAIL, 0); 18454 } 18455 else { 18456 ret = reg_node(pRExC_state, SANY); 18457 MARK_NAUGHTY(1); 18458 } 18459 goto not_anyof; 18460 } 18461 18462 /* Similarly, for /l posix classes, if both a class and its 18463 * complement match, any run-time dependencies don't matter */ 18464 if (posixl) { 18465 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; 18466 namedclass += 2) 18467 { 18468 if ( POSIXL_TEST(posixl, namedclass) /* class */ 18469 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */ 18470 { 18471 if (invert) { 18472 ret = reganode(pRExC_state, OPFAIL, 0); 18473 } 18474 else { 18475 ret = reg_node(pRExC_state, SANY); 18476 MARK_NAUGHTY(1); 18477 } 18478 goto not_anyof; 18479 } 18480 } 18481 /* For well-behaved locales, some classes are subsets of others, 18482 * so complementing the subset and including the non-complemented 18483 * superset should match everything, like [\D[:alnum:]], and 18484 * [[:^alpha:][:alnum:]], but some implementations of locales are 18485 * buggy, and khw thinks its a bad idea to have optimization change 18486 * behavior, even if it avoids an OS bug in a given case */ 18487 18488 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n) 18489 18490 /* If is a single posix /l class, can optimize to just that op. 18491 * Such a node will not match anything in the Latin1 range, as that 18492 * is not determinable until runtime, but will match whatever the 18493 * class does outside that range. (Note that some classes won't 18494 * match anything outside the range, like [:ascii:]) */ 18495 if ( isSINGLE_BIT_SET(posixl) 18496 && (partial_cp_count == 0 || start[0] > 255)) 18497 { 18498 U8 classnum; 18499 SV * class_above_latin1 = NULL; 18500 bool already_inverted; 18501 bool are_equivalent; 18502 18503 /* Compute which bit is set, which is the same thing as, e.g., 18504 * ANYOF_CNTRL. From 18505 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn 18506 * */ 18507 static const int MultiplyDeBruijnBitPosition2[32] = 18508 { 18509 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 18510 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 18511 }; 18512 18513 namedclass = MultiplyDeBruijnBitPosition2[(posixl 18514 * 0x077CB531U) >> 27]; 18515 classnum = namedclass_to_classnum(namedclass); 18516 18517 /* The named classes are such that the inverted number is one 18518 * larger than the non-inverted one */ 18519 already_inverted = namedclass 18520 - classnum_to_namedclass(classnum); 18521 18522 /* Create an inversion list of the official property, inverted 18523 * if the constructed node list is inverted, and restricted to 18524 * only the above latin1 code points, which are the only ones 18525 * known at compile time */ 18526 _invlist_intersection_maybe_complement_2nd( 18527 PL_AboveLatin1, 18528 PL_XPosix_ptrs[classnum], 18529 already_inverted, 18530 &class_above_latin1); 18531 are_equivalent = _invlistEQ(class_above_latin1, cp_list, 18532 FALSE); 18533 SvREFCNT_dec_NN(class_above_latin1); 18534 18535 if (are_equivalent) { 18536 18537 /* Resolve the run-time inversion flag with this possibly 18538 * inverted class */ 18539 invert = invert ^ already_inverted; 18540 18541 ret = reg_node(pRExC_state, 18542 POSIXL + invert * (NPOSIXL - POSIXL)); 18543 FLAGS(REGNODE_p(ret)) = classnum; 18544 goto not_anyof; 18545 } 18546 } 18547 } 18548 18549 /* khw can't think of any other possible transformation involving 18550 * these. */ 18551 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) { 18552 goto is_anyof; 18553 } 18554 18555 if (! has_runtime_dependency) { 18556 18557 /* If the list is empty, nothing matches. This happens, for 18558 * example, when a Unicode property that doesn't match anything is 18559 * the only element in the character class (perluniprops.pod notes 18560 * such properties). */ 18561 if (partial_cp_count == 0) { 18562 if (invert) { 18563 ret = reg_node(pRExC_state, SANY); 18564 } 18565 else { 18566 ret = reganode(pRExC_state, OPFAIL, 0); 18567 } 18568 18569 goto not_anyof; 18570 } 18571 18572 /* If matches everything but \n */ 18573 if ( start[0] == 0 && end[0] == '\n' - 1 18574 && start[1] == '\n' + 1 && end[1] == UV_MAX) 18575 { 18576 assert (! invert); 18577 ret = reg_node(pRExC_state, REG_ANY); 18578 MARK_NAUGHTY(1); 18579 goto not_anyof; 18580 } 18581 } 18582 18583 /* Next see if can optimize classes that contain just a few code points 18584 * into an EXACTish node. The reason to do this is to let the 18585 * optimizer join this node with adjacent EXACTish ones. 18586 * 18587 * An EXACTFish node can be generated even if not under /i, and vice 18588 * versa. But care must be taken. An EXACTFish node has to be such 18589 * that it only matches precisely the code points in the class, but we 18590 * want to generate the least restrictive one that does that, to 18591 * increase the odds of being able to join with an adjacent node. For 18592 * example, if the class contains [kK], we have to make it an EXACTFAA 18593 * node to prevent the KELVIN SIGN from matching. Whether we are under 18594 * /i or not is irrelevant in this case. Less obvious is the pattern 18595 * qr/[\x{02BC}]n/i. U+02BC is MODIFIER LETTER APOSTROPHE. That is 18596 * supposed to match the single character U+0149 LATIN SMALL LETTER N 18597 * PRECEDED BY APOSTROPHE. And so even though there is no simple fold 18598 * that includes \X{02BC}, there is a multi-char fold that does, and so 18599 * the node generated for it must be an EXACTFish one. On the other 18600 * hand qr/:/i should generate a plain EXACT node since the colon 18601 * participates in no fold whatsoever, and having it EXACT tells the 18602 * optimizer the target string cannot match unless it has a colon in 18603 * it. 18604 * 18605 * We don't typically generate an EXACTish node if doing so would 18606 * require changing the pattern to UTF-8, as that affects /d and 18607 * otherwise is slower. However, under /i, not changing to UTF-8 can 18608 * miss some potential multi-character folds. We calculate the 18609 * EXACTish node, and then decide if something would be missed if we 18610 * don't upgrade */ 18611 if ( ! posixl 18612 && ! invert 18613 18614 /* Only try if there are no more code points in the class than 18615 * in the max possible fold */ 18616 && partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1 18617 18618 && (start[0] < 256 || UTF || FOLD)) 18619 { 18620 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) 18621 { 18622 /* We can always make a single code point class into an 18623 * EXACTish node. */ 18624 18625 if (LOC) { 18626 18627 /* Here is /l: Use EXACTL, except /li indicates EXACTFL, 18628 * as that means there is a fold not known until runtime so 18629 * shows as only a single code point here. */ 18630 op = (FOLD) ? EXACTFL : EXACTL; 18631 } 18632 else if (! FOLD) { /* Not /l and not /i */ 18633 op = (start[0] < 256) ? EXACT : EXACT_ONLY8; 18634 } 18635 else if (start[0] < 256) { /* /i, not /l, and the code point is 18636 small */ 18637 18638 /* Under /i, it gets a little tricky. A code point that 18639 * doesn't participate in a fold should be an EXACT node. 18640 * We know this one isn't the result of a simple fold, or 18641 * there'd be more than one code point in the list, but it 18642 * could be part of a multi- character fold. In that case 18643 * we better not create an EXACT node, as we would wrongly 18644 * be telling the optimizer that this code point must be in 18645 * the target string, and that is wrong. This is because 18646 * if the sequence around this code point forms a 18647 * multi-char fold, what needs to be in the string could be 18648 * the code point that folds to the sequence. 18649 * 18650 * This handles the case of below-255 code points, as we 18651 * have an easy look up for those. The next clause handles 18652 * the above-256 one */ 18653 op = IS_IN_SOME_FOLD_L1(start[0]) 18654 ? EXACTFU 18655 : EXACT; 18656 } 18657 else { /* /i, larger code point. Since we are under /i, and 18658 have just this code point, we know that it can't 18659 fold to something else, so PL_InMultiCharFold 18660 applies to it */ 18661 op = _invlist_contains_cp(PL_InMultiCharFold, 18662 start[0]) 18663 ? EXACTFU_ONLY8 18664 : EXACT_ONLY8; 18665 } 18666 18667 value = start[0]; 18668 } 18669 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY) 18670 && _invlist_contains_cp(PL_in_some_fold, start[0])) 18671 { 18672 /* Here, the only runtime dependency, if any, is from /d, and 18673 * the class matches more than one code point, and the lowest 18674 * code point participates in some fold. It might be that the 18675 * other code points are /i equivalent to this one, and hence 18676 * they would representable by an EXACTFish node. Above, we 18677 * eliminated classes that contain too many code points to be 18678 * EXACTFish, with the test for MAX_FOLD_FROMS 18679 * 18680 * First, special case the ASCII fold pairs, like 'B' and 'b'. 18681 * We do this because we have EXACTFAA at our disposal for the 18682 * ASCII range */ 18683 if (partial_cp_count == 2 && isASCII(start[0])) { 18684 18685 /* The only ASCII characters that participate in folds are 18686 * alphabetics */ 18687 assert(isALPHA(start[0])); 18688 if ( end[0] == start[0] /* First range is a single 18689 character, so 2nd exists */ 18690 && isALPHA_FOLD_EQ(start[0], start[1])) 18691 { 18692 18693 /* Here, is part of an ASCII fold pair */ 18694 18695 if ( ASCII_FOLD_RESTRICTED 18696 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0])) 18697 { 18698 /* If the second clause just above was true, it 18699 * means we can't be under /i, or else the list 18700 * would have included more than this fold pair. 18701 * Therefore we have to exclude the possibility of 18702 * whatever else it is that folds to these, by 18703 * using EXACTFAA */ 18704 op = EXACTFAA; 18705 } 18706 else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) { 18707 18708 /* Here, there's no simple fold that start[0] is part 18709 * of, but there is a multi-character one. If we 18710 * are not under /i, we want to exclude that 18711 * possibility; if under /i, we want to include it 18712 * */ 18713 op = (FOLD) ? EXACTFU : EXACTFAA; 18714 } 18715 else { 18716 18717 /* Here, the only possible fold start[0] particpates in 18718 * is with start[1]. /i or not isn't relevant */ 18719 op = EXACTFU; 18720 } 18721 18722 value = toFOLD(start[0]); 18723 } 18724 } 18725 else if ( ! upper_latin1_only_utf8_matches 18726 || ( _invlist_len(upper_latin1_only_utf8_matches) 18727 == 2 18728 && PL_fold_latin1[ 18729 invlist_highest(upper_latin1_only_utf8_matches)] 18730 == start[0])) 18731 { 18732 /* Here, the smallest character is non-ascii or there are 18733 * more than 2 code points matched by this node. Also, we 18734 * either don't have /d UTF-8 dependent matches, or if we 18735 * do, they look like they could be a single character that 18736 * is the fold of the lowest one in the always-match list. 18737 * This test quickly excludes most of the false positives 18738 * when there are /d UTF-8 depdendent matches. These are 18739 * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN 18740 * SMALL LETTER A WITH GRAVE iff the target string is 18741 * UTF-8. (We don't have to worry above about exceeding 18742 * the array bounds of PL_fold_latin1[] because any code 18743 * point in 'upper_latin1_only_utf8_matches' is below 256.) 18744 * 18745 * EXACTFAA would apply only to pairs (hence exactly 2 code 18746 * points) in the ASCII range, so we can't use it here to 18747 * artificially restrict the fold domain, so we check if 18748 * the class does or does not match some EXACTFish node. 18749 * Further, if we aren't under /i, and and the folded-to 18750 * character is part of a multi-character fold, we can't do 18751 * this optimization, as the sequence around it could be 18752 * that multi-character fold, and we don't here know the 18753 * context, so we have to assume it is that multi-char 18754 * fold, to prevent potential bugs. 18755 * 18756 * To do the general case, we first find the fold of the 18757 * lowest code point (which may be higher than the lowest 18758 * one), then find everything that folds to it. (The data 18759 * structure we have only maps from the folded code points, 18760 * so we have to do the earlier step.) */ 18761 18762 Size_t foldlen; 18763 U8 foldbuf[UTF8_MAXBYTES_CASE]; 18764 UV folded = _to_uni_fold_flags(start[0], 18765 foldbuf, &foldlen, 0); 18766 unsigned int first_fold; 18767 const unsigned int * remaining_folds; 18768 Size_t folds_to_this_cp_count = _inverse_folds( 18769 folded, 18770 &first_fold, 18771 &remaining_folds); 18772 Size_t folds_count = folds_to_this_cp_count + 1; 18773 SV * fold_list = _new_invlist(folds_count); 18774 unsigned int i; 18775 18776 /* If there are UTF-8 dependent matches, create a temporary 18777 * list of what this node matches, including them. */ 18778 SV * all_cp_list = NULL; 18779 SV ** use_this_list = &cp_list; 18780 18781 if (upper_latin1_only_utf8_matches) { 18782 all_cp_list = _new_invlist(0); 18783 use_this_list = &all_cp_list; 18784 _invlist_union(cp_list, 18785 upper_latin1_only_utf8_matches, 18786 use_this_list); 18787 } 18788 18789 /* Having gotten everything that participates in the fold 18790 * containing the lowest code point, we turn that into an 18791 * inversion list, making sure everything is included. */ 18792 fold_list = add_cp_to_invlist(fold_list, start[0]); 18793 fold_list = add_cp_to_invlist(fold_list, folded); 18794 if (folds_to_this_cp_count > 0) { 18795 fold_list = add_cp_to_invlist(fold_list, first_fold); 18796 for (i = 0; i + 1 < folds_to_this_cp_count; i++) { 18797 fold_list = add_cp_to_invlist(fold_list, 18798 remaining_folds[i]); 18799 } 18800 } 18801 18802 /* If the fold list is identical to what's in this ANYOF 18803 * node, the node can be represented by an EXACTFish one 18804 * instead */ 18805 if (_invlistEQ(*use_this_list, fold_list, 18806 0 /* Don't complement */ ) 18807 ) { 18808 18809 /* But, we have to be careful, as mentioned above. 18810 * Just the right sequence of characters could match 18811 * this if it is part of a multi-character fold. That 18812 * IS what we want if we are under /i. But it ISN'T 18813 * what we want if not under /i, as it could match when 18814 * it shouldn't. So, when we aren't under /i and this 18815 * character participates in a multi-char fold, we 18816 * don't optimize into an EXACTFish node. So, for each 18817 * case below we have to check if we are folding 18818 * and if not, if it is not part of a multi-char fold. 18819 * */ 18820 if (start[0] > 255) { /* Highish code point */ 18821 if (FOLD || ! _invlist_contains_cp( 18822 PL_InMultiCharFold, folded)) 18823 { 18824 op = (LOC) 18825 ? EXACTFLU8 18826 : (ASCII_FOLD_RESTRICTED) 18827 ? EXACTFAA 18828 : EXACTFU_ONLY8; 18829 value = folded; 18830 } 18831 } /* Below, the lowest code point < 256 */ 18832 else if ( FOLD 18833 && folded == 's' 18834 && DEPENDS_SEMANTICS) 18835 { /* An EXACTF node containing a single character 18836 's', can be an EXACTFU if it doesn't get 18837 joined with an adjacent 's' */ 18838 op = EXACTFU_S_EDGE; 18839 value = folded; 18840 } 18841 else if ( FOLD 18842 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0])) 18843 { 18844 if (upper_latin1_only_utf8_matches) { 18845 op = EXACTF; 18846 18847 /* We can't use the fold, as that only matches 18848 * under UTF-8 */ 18849 value = start[0]; 18850 } 18851 else if ( UNLIKELY(start[0] == MICRO_SIGN) 18852 && ! UTF) 18853 { /* EXACTFUP is a special node for this 18854 character */ 18855 op = (ASCII_FOLD_RESTRICTED) 18856 ? EXACTFAA 18857 : EXACTFUP; 18858 value = MICRO_SIGN; 18859 } 18860 else if ( ASCII_FOLD_RESTRICTED 18861 && ! isASCII(start[0])) 18862 { /* For ASCII under /iaa, we can use EXACTFU 18863 below */ 18864 op = EXACTFAA; 18865 value = folded; 18866 } 18867 else { 18868 op = EXACTFU; 18869 value = folded; 18870 } 18871 } 18872 } 18873 18874 SvREFCNT_dec_NN(fold_list); 18875 SvREFCNT_dec(all_cp_list); 18876 } 18877 } 18878 18879 if (op != END) { 18880 18881 /* Here, we have calculated what EXACTish node we would use. 18882 * But we don't use it if it would require converting the 18883 * pattern to UTF-8, unless not using it could cause us to miss 18884 * some folds (hence be buggy) */ 18885 18886 if (! UTF && value > 255) { 18887 SV * in_multis = NULL; 18888 18889 assert(FOLD); 18890 18891 /* If there is no code point that is part of a multi-char 18892 * fold, then there aren't any matches, so we don't do this 18893 * optimization. Otherwise, it could match depending on 18894 * the context around us, so we do upgrade */ 18895 _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis); 18896 if (UNLIKELY(_invlist_len(in_multis) != 0)) { 18897 REQUIRE_UTF8(flagp); 18898 } 18899 else { 18900 op = END; 18901 } 18902 } 18903 18904 if (op != END) { 18905 U8 len = (UTF) ? UVCHR_SKIP(value) : 1; 18906 18907 ret = regnode_guts(pRExC_state, op, len, "exact"); 18908 FILL_NODE(ret, op); 18909 RExC_emit += 1 + STR_SZ(len); 18910 STR_LEN(REGNODE_p(ret)) = len; 18911 if (len == 1) { 18912 *STRING(REGNODE_p(ret)) = (U8) value; 18913 } 18914 else { 18915 uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value); 18916 } 18917 goto not_anyof; 18918 } 18919 } 18920 } 18921 18922 if (! has_runtime_dependency) { 18923 18924 /* See if this can be turned into an ANYOFM node. Think about the 18925 * bit patterns in two different bytes. In some positions, the 18926 * bits in each will be 1; and in other positions both will be 0; 18927 * and in some positions the bit will be 1 in one byte, and 0 in 18928 * the other. Let 'n' be the number of positions where the bits 18929 * differ. We create a mask which has exactly 'n' 0 bits, each in 18930 * a position where the two bytes differ. Now take the set of all 18931 * bytes that when ANDed with the mask yield the same result. That 18932 * set has 2**n elements, and is representable by just two 8 bit 18933 * numbers: the result and the mask. Importantly, matching the set 18934 * can be vectorized by creating a word full of the result bytes, 18935 * and a word full of the mask bytes, yielding a significant speed 18936 * up. Here, see if this node matches such a set. As a concrete 18937 * example consider [01], and the byte representing '0' which is 18938 * 0x30 on ASCII machines. It has the bits 0011 0000. Take the 18939 * mask 1111 1110. If we AND 0x31 and 0x30 with that mask we get 18940 * 0x30. Any other bytes ANDed yield something else. So [01], 18941 * which is a common usage, is optimizable into ANYOFM, and can 18942 * benefit from the speed up. We can only do this on UTF-8 18943 * invariant bytes, because they have the same bit patterns under 18944 * UTF-8 as not. */ 18945 PERL_UINT_FAST8_T inverted = 0; 18946 #ifdef EBCDIC 18947 const PERL_UINT_FAST8_T max_permissible = 0xFF; 18948 #else 18949 const PERL_UINT_FAST8_T max_permissible = 0x7F; 18950 #endif 18951 /* If doesn't fit the criteria for ANYOFM, invert and try again. 18952 * If that works we will instead later generate an NANYOFM, and 18953 * invert back when through */ 18954 if (invlist_highest(cp_list) > max_permissible) { 18955 _invlist_invert(cp_list); 18956 inverted = 1; 18957 } 18958 18959 if (invlist_highest(cp_list) <= max_permissible) { 18960 UV this_start, this_end; 18961 UV lowest_cp = UV_MAX; /* inited to suppress compiler warn */ 18962 U8 bits_differing = 0; 18963 Size_t full_cp_count = 0; 18964 bool first_time = TRUE; 18965 18966 /* Go through the bytes and find the bit positions that differ 18967 * */ 18968 invlist_iterinit(cp_list); 18969 while (invlist_iternext(cp_list, &this_start, &this_end)) { 18970 unsigned int i = this_start; 18971 18972 if (first_time) { 18973 if (! UVCHR_IS_INVARIANT(i)) { 18974 goto done_anyofm; 18975 } 18976 18977 first_time = FALSE; 18978 lowest_cp = this_start; 18979 18980 /* We have set up the code point to compare with. 18981 * Don't compare it with itself */ 18982 i++; 18983 } 18984 18985 /* Find the bit positions that differ from the lowest code 18986 * point in the node. Keep track of all such positions by 18987 * OR'ing */ 18988 for (; i <= this_end; i++) { 18989 if (! UVCHR_IS_INVARIANT(i)) { 18990 goto done_anyofm; 18991 } 18992 18993 bits_differing |= i ^ lowest_cp; 18994 } 18995 18996 full_cp_count += this_end - this_start + 1; 18997 } 18998 invlist_iterfinish(cp_list); 18999 19000 /* At the end of the loop, we count how many bits differ from 19001 * the bits in lowest code point, call the count 'd'. If the 19002 * set we found contains 2**d elements, it is the closure of 19003 * all code points that differ only in those bit positions. To 19004 * convince yourself of that, first note that the number in the 19005 * closure must be a power of 2, which we test for. The only 19006 * way we could have that count and it be some differing set, 19007 * is if we got some code points that don't differ from the 19008 * lowest code point in any position, but do differ from each 19009 * other in some other position. That means one code point has 19010 * a 1 in that position, and another has a 0. But that would 19011 * mean that one of them differs from the lowest code point in 19012 * that position, which possibility we've already excluded. */ 19013 if ( (inverted || full_cp_count > 1) 19014 && full_cp_count == 1U << PL_bitcount[bits_differing]) 19015 { 19016 U8 ANYOFM_mask; 19017 19018 op = ANYOFM + inverted;; 19019 19020 /* We need to make the bits that differ be 0's */ 19021 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */ 19022 19023 /* The argument is the lowest code point */ 19024 ret = reganode(pRExC_state, op, lowest_cp); 19025 FLAGS(REGNODE_p(ret)) = ANYOFM_mask; 19026 } 19027 } 19028 done_anyofm: 19029 19030 if (inverted) { 19031 _invlist_invert(cp_list); 19032 } 19033 19034 if (op != END) { 19035 goto not_anyof; 19036 } 19037 } 19038 19039 if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) { 19040 PERL_UINT_FAST8_T type; 19041 SV * intersection = NULL; 19042 SV* d_invlist = NULL; 19043 19044 /* See if this matches any of the POSIX classes. The POSIXA and 19045 * POSIXD ones are about the same speed as ANYOF ops, but take less 19046 * room; the ones that have above-Latin1 code point matches are 19047 * somewhat faster than ANYOF. */ 19048 19049 for (type = POSIXA; type >= POSIXD; type--) { 19050 int posix_class; 19051 19052 if (type == POSIXL) { /* But not /l posix classes */ 19053 continue; 19054 } 19055 19056 for (posix_class = 0; 19057 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC; 19058 posix_class++) 19059 { 19060 SV** our_code_points = &cp_list; 19061 SV** official_code_points; 19062 int try_inverted; 19063 19064 if (type == POSIXA) { 19065 official_code_points = &PL_Posix_ptrs[posix_class]; 19066 } 19067 else { 19068 official_code_points = &PL_XPosix_ptrs[posix_class]; 19069 } 19070 19071 /* Skip non-existent classes of this type. e.g. \v only 19072 * has an entry in PL_XPosix_ptrs */ 19073 if (! *official_code_points) { 19074 continue; 19075 } 19076 19077 /* Try both the regular class, and its inversion */ 19078 for (try_inverted = 0; try_inverted < 2; try_inverted++) { 19079 bool this_inverted = invert ^ try_inverted; 19080 19081 if (type != POSIXD) { 19082 19083 /* This class that isn't /d can't match if we have 19084 * /d dependencies */ 19085 if (has_runtime_dependency 19086 & HAS_D_RUNTIME_DEPENDENCY) 19087 { 19088 continue; 19089 } 19090 } 19091 else /* is /d */ if (! this_inverted) { 19092 19093 /* /d classes don't match anything non-ASCII below 19094 * 256 unconditionally (which cp_list contains) */ 19095 _invlist_intersection(cp_list, PL_UpperLatin1, 19096 &intersection); 19097 if (_invlist_len(intersection) != 0) { 19098 continue; 19099 } 19100 19101 SvREFCNT_dec(d_invlist); 19102 d_invlist = invlist_clone(cp_list, NULL); 19103 19104 /* But under UTF-8 it turns into using /u rules. 19105 * Add the things it matches under these conditions 19106 * so that we check below that these are identical 19107 * to what the tested class should match */ 19108 if (upper_latin1_only_utf8_matches) { 19109 _invlist_union( 19110 d_invlist, 19111 upper_latin1_only_utf8_matches, 19112 &d_invlist); 19113 } 19114 our_code_points = &d_invlist; 19115 } 19116 else { /* POSIXD, inverted. If this doesn't have this 19117 flag set, it isn't /d. */ 19118 if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) 19119 { 19120 continue; 19121 } 19122 our_code_points = &cp_list; 19123 } 19124 19125 /* Here, have weeded out some things. We want to see 19126 * if the list of characters this node contains 19127 * ('*our_code_points') precisely matches those of the 19128 * class we are currently checking against 19129 * ('*official_code_points'). */ 19130 if (_invlistEQ(*our_code_points, 19131 *official_code_points, 19132 try_inverted)) 19133 { 19134 /* Here, they precisely match. Optimize this ANYOF 19135 * node into its equivalent POSIX one of the 19136 * correct type, possibly inverted */ 19137 ret = reg_node(pRExC_state, (try_inverted) 19138 ? type + NPOSIXA 19139 - POSIXA 19140 : type); 19141 FLAGS(REGNODE_p(ret)) = posix_class; 19142 SvREFCNT_dec(d_invlist); 19143 SvREFCNT_dec(intersection); 19144 goto not_anyof; 19145 } 19146 } 19147 } 19148 } 19149 SvREFCNT_dec(d_invlist); 19150 SvREFCNT_dec(intersection); 19151 } 19152 19153 /* If didn't find an optimization and there is no need for a 19154 * bitmap, optimize to indicate that */ 19155 if ( start[0] >= NUM_ANYOF_CODE_POINTS 19156 && ! LOC 19157 && ! upper_latin1_only_utf8_matches 19158 && anyof_flags == 0) 19159 { 19160 UV highest_cp = invlist_highest(cp_list); 19161 19162 /* If the lowest and highest code point in the class have the same 19163 * UTF-8 first byte, then all do, and we can store that byte for 19164 * regexec.c to use so that it can more quickly scan the target 19165 * string for potential matches for this class. We co-opt the the 19166 * flags field for this. Zero means, they don't have the same 19167 * first byte. We do accept here very large code points (for 19168 * future use), but don't bother with this optimization for them, 19169 * as it would cause other complications */ 19170 if (highest_cp > IV_MAX) { 19171 anyof_flags = 0; 19172 } 19173 else { 19174 U8 low_utf8[UTF8_MAXBYTES+1]; 19175 U8 high_utf8[UTF8_MAXBYTES+1]; 19176 19177 (void) uvchr_to_utf8(low_utf8, start[0]); 19178 (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list)); 19179 19180 anyof_flags = (low_utf8[0] == high_utf8[0]) 19181 ? low_utf8[0] 19182 : 0; 19183 } 19184 19185 op = ANYOFH; 19186 } 19187 } /* End of seeing if can optimize it into a different node */ 19188 19189 is_anyof: /* It's going to be an ANYOF node. */ 19190 if (op != ANYOFH) { 19191 op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) 19192 ? ANYOFD 19193 : ((posixl) 19194 ? ANYOFPOSIXL 19195 : ((LOC) 19196 ? ANYOFL 19197 : ANYOF)); 19198 } 19199 19200 ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof"); 19201 FILL_NODE(ret, op); /* We set the argument later */ 19202 RExC_emit += 1 + regarglen[op]; 19203 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags; 19204 19205 /* Here, <cp_list> contains all the code points we can determine at 19206 * compile time that match under all conditions. Go through it, and 19207 * for things that belong in the bitmap, put them there, and delete from 19208 * <cp_list>. While we are at it, see if everything above 255 is in the 19209 * list, and if so, set a flag to speed up execution */ 19210 19211 populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list); 19212 19213 if (posixl) { 19214 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl); 19215 } 19216 19217 if (invert) { 19218 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT; 19219 } 19220 19221 /* Here, the bitmap has been populated with all the Latin1 code points that 19222 * always match. Can now add to the overall list those that match only 19223 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>). 19224 * */ 19225 if (upper_latin1_only_utf8_matches) { 19226 if (cp_list) { 19227 _invlist_union(cp_list, 19228 upper_latin1_only_utf8_matches, 19229 &cp_list); 19230 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches); 19231 } 19232 else { 19233 cp_list = upper_latin1_only_utf8_matches; 19234 } 19235 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP; 19236 } 19237 19238 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list, 19239 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) 19240 ? listsv : NULL, 19241 only_utf8_locale_list); 19242 return ret; 19243 19244 not_anyof: 19245 19246 /* Here, the node is getting optimized into something that's not an ANYOF 19247 * one. Finish up. */ 19248 19249 Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start, 19250 RExC_parse - orig_parse);; 19251 SvREFCNT_dec(cp_list);; 19252 return ret; 19253 } 19254 19255 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION 19256 19257 STATIC void 19258 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, 19259 regnode* const node, 19260 SV* const cp_list, 19261 SV* const runtime_defns, 19262 SV* const only_utf8_locale_list) 19263 { 19264 /* Sets the arg field of an ANYOF-type node 'node', using information about 19265 * the node passed-in. If there is nothing outside the node's bitmap, the 19266 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to 19267 * the count returned by add_data(), having allocated and stored an array, 19268 * av, as follows: 19269 * 19270 * av[0] stores the inversion list defining this class as far as known at 19271 * this time, or PL_sv_undef if nothing definite is now known. 19272 * av[1] stores the inversion list of code points that match only if the 19273 * current locale is UTF-8, or if none, PL_sv_undef if there is an 19274 * av[2], or no entry otherwise. 19275 * av[2] stores the list of user-defined properties whose subroutine 19276 * definitions aren't known at this time, or no entry if none. */ 19277 19278 UV n; 19279 19280 PERL_ARGS_ASSERT_SET_ANYOF_ARG; 19281 19282 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { 19283 assert(! (ANYOF_FLAGS(node) 19284 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)); 19285 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP); 19286 } 19287 else { 19288 AV * const av = newAV(); 19289 SV *rv; 19290 19291 if (cp_list) { 19292 av_store(av, INVLIST_INDEX, cp_list); 19293 } 19294 19295 if (only_utf8_locale_list) { 19296 av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list); 19297 } 19298 19299 if (runtime_defns) { 19300 av_store(av, DEFERRED_USER_DEFINED_INDEX, SvREFCNT_inc(runtime_defns)); 19301 } 19302 19303 rv = newRV_noinc(MUTABLE_SV(av)); 19304 n = add_data(pRExC_state, STR_WITH_LEN("s")); 19305 RExC_rxi->data->data[n] = (void*)rv; 19306 ARG_SET(node, n); 19307 } 19308 } 19309 19310 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) 19311 SV * 19312 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, 19313 const regnode* node, 19314 bool doinit, 19315 SV** listsvp, 19316 SV** only_utf8_locale_ptr, 19317 SV** output_invlist) 19318 19319 { 19320 /* For internal core use only. 19321 * Returns the inversion list for the input 'node' in the regex 'prog'. 19322 * If <doinit> is 'true', will attempt to create the inversion list if not 19323 * already done. 19324 * If <listsvp> is non-null, will return the printable contents of the 19325 * property definition. This can be used to get debugging information 19326 * even before the inversion list exists, by calling this function with 19327 * 'doinit' set to false, in which case the components that will be used 19328 * to eventually create the inversion list are returned (in a printable 19329 * form). 19330 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to 19331 * store an inversion list of code points that should match only if the 19332 * execution-time locale is a UTF-8 one. 19333 * If <output_invlist> is not NULL, it is where this routine is to store an 19334 * inversion list of the code points that would be instead returned in 19335 * <listsvp> if this were NULL. Thus, what gets output in <listsvp> 19336 * when this parameter is used, is just the non-code point data that 19337 * will go into creating the inversion list. This currently should be just 19338 * user-defined properties whose definitions were not known at compile 19339 * time. Using this parameter allows for easier manipulation of the 19340 * inversion list's data by the caller. It is illegal to call this 19341 * function with this parameter set, but not <listsvp> 19342 * 19343 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note 19344 * that, in spite of this function's name, the inversion list it returns 19345 * may include the bitmap data as well */ 19346 19347 SV *si = NULL; /* Input initialization string */ 19348 SV* invlist = NULL; 19349 19350 RXi_GET_DECL(prog, progi); 19351 const struct reg_data * const data = prog ? progi->data : NULL; 19352 19353 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; 19354 assert(! output_invlist || listsvp); 19355 19356 if (data && data->count) { 19357 const U32 n = ARG(node); 19358 19359 if (data->what[n] == 's') { 19360 SV * const rv = MUTABLE_SV(data->data[n]); 19361 AV * const av = MUTABLE_AV(SvRV(rv)); 19362 SV **const ary = AvARRAY(av); 19363 19364 invlist = ary[INVLIST_INDEX]; 19365 19366 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) { 19367 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX]; 19368 } 19369 19370 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) { 19371 si = ary[DEFERRED_USER_DEFINED_INDEX]; 19372 } 19373 19374 if (doinit && (si || invlist)) { 19375 if (si) { 19376 bool user_defined; 19377 SV * msg = newSVpvs_flags("", SVs_TEMP); 19378 19379 SV * prop_definition = handle_user_defined_property( 19380 "", 0, FALSE, /* There is no \p{}, \P{} */ 19381 SvPVX_const(si)[1] - '0', /* /i or not has been 19382 stored here for just 19383 this occasion */ 19384 TRUE, /* run time */ 19385 FALSE, /* This call must find the defn */ 19386 si, /* The property definition */ 19387 &user_defined, 19388 msg, 19389 0 /* base level call */ 19390 ); 19391 19392 if (SvCUR(msg)) { 19393 assert(prop_definition == NULL); 19394 19395 Perl_croak(aTHX_ "%" UTF8f, 19396 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg))); 19397 } 19398 19399 if (invlist) { 19400 _invlist_union(invlist, prop_definition, &invlist); 19401 SvREFCNT_dec_NN(prop_definition); 19402 } 19403 else { 19404 invlist = prop_definition; 19405 } 19406 19407 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX); 19408 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX); 19409 19410 av_store(av, INVLIST_INDEX, invlist); 19411 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX]) 19412 ? ONLY_LOCALE_MATCHES_INDEX: 19413 INVLIST_INDEX); 19414 si = NULL; 19415 } 19416 } 19417 } 19418 } 19419 19420 /* If requested, return a printable version of what this ANYOF node matches 19421 * */ 19422 if (listsvp) { 19423 SV* matches_string = NULL; 19424 19425 /* This function can be called at compile-time, before everything gets 19426 * resolved, in which case we return the currently best available 19427 * information, which is the string that will eventually be used to do 19428 * that resolving, 'si' */ 19429 if (si) { 19430 /* Here, we only have 'si' (and possibly some passed-in data in 19431 * 'invlist', which is handled below) If the caller only wants 19432 * 'si', use that. */ 19433 if (! output_invlist) { 19434 matches_string = newSVsv(si); 19435 } 19436 else { 19437 /* But if the caller wants an inversion list of the node, we 19438 * need to parse 'si' and place as much as possible in the 19439 * desired output inversion list, making 'matches_string' only 19440 * contain the currently unresolvable things */ 19441 const char *si_string = SvPVX(si); 19442 STRLEN remaining = SvCUR(si); 19443 UV prev_cp = 0; 19444 U8 count = 0; 19445 19446 /* Ignore everything before the first new-line */ 19447 while (*si_string != '\n' && remaining > 0) { 19448 si_string++; 19449 remaining--; 19450 } 19451 assert(remaining > 0); 19452 19453 si_string++; 19454 remaining--; 19455 19456 while (remaining > 0) { 19457 19458 /* The data consists of just strings defining user-defined 19459 * property names, but in prior incarnations, and perhaps 19460 * somehow from pluggable regex engines, it could still 19461 * hold hex code point definitions. Each component of a 19462 * range would be separated by a tab, and each range by a 19463 * new-line. If these are found, instead add them to the 19464 * inversion list */ 19465 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT 19466 |PERL_SCAN_SILENT_NON_PORTABLE; 19467 STRLEN len = remaining; 19468 UV cp = grok_hex(si_string, &len, &grok_flags, NULL); 19469 19470 /* If the hex decode routine found something, it should go 19471 * up to the next \n */ 19472 if ( *(si_string + len) == '\n') { 19473 if (count) { /* 2nd code point on line */ 19474 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp); 19475 } 19476 else { 19477 *output_invlist = add_cp_to_invlist(*output_invlist, cp); 19478 } 19479 count = 0; 19480 goto prepare_for_next_iteration; 19481 } 19482 19483 /* If the hex decode was instead for the lower range limit, 19484 * save it, and go parse the upper range limit */ 19485 if (*(si_string + len) == '\t') { 19486 assert(count == 0); 19487 19488 prev_cp = cp; 19489 count = 1; 19490 prepare_for_next_iteration: 19491 si_string += len + 1; 19492 remaining -= len + 1; 19493 continue; 19494 } 19495 19496 /* Here, didn't find a legal hex number. Just add it from 19497 * here to the next \n */ 19498 19499 remaining -= len; 19500 while (*(si_string + len) != '\n' && remaining > 0) { 19501 remaining--; 19502 len++; 19503 } 19504 if (*(si_string + len) == '\n') { 19505 len++; 19506 remaining--; 19507 } 19508 if (matches_string) { 19509 sv_catpvn(matches_string, si_string, len - 1); 19510 } 19511 else { 19512 matches_string = newSVpvn(si_string, len - 1); 19513 } 19514 si_string += len; 19515 sv_catpvs(matches_string, " "); 19516 } /* end of loop through the text */ 19517 19518 assert(matches_string); 19519 if (SvCUR(matches_string)) { /* Get rid of trailing blank */ 19520 SvCUR_set(matches_string, SvCUR(matches_string) - 1); 19521 } 19522 } /* end of has an 'si' */ 19523 } 19524 19525 /* Add the stuff that's already known */ 19526 if (invlist) { 19527 19528 /* Again, if the caller doesn't want the output inversion list, put 19529 * everything in 'matches-string' */ 19530 if (! output_invlist) { 19531 if ( ! matches_string) { 19532 matches_string = newSVpvs("\n"); 19533 } 19534 sv_catsv(matches_string, invlist_contents(invlist, 19535 TRUE /* traditional style */ 19536 )); 19537 } 19538 else if (! *output_invlist) { 19539 *output_invlist = invlist_clone(invlist, NULL); 19540 } 19541 else { 19542 _invlist_union(*output_invlist, invlist, output_invlist); 19543 } 19544 } 19545 19546 *listsvp = matches_string; 19547 } 19548 19549 return invlist; 19550 } 19551 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ 19552 19553 /* reg_skipcomment() 19554 19555 Absorbs an /x style # comment from the input stream, 19556 returning a pointer to the first character beyond the comment, or if the 19557 comment terminates the pattern without anything following it, this returns 19558 one past the final character of the pattern (in other words, RExC_end) and 19559 sets the REG_RUN_ON_COMMENT_SEEN flag. 19560 19561 Note it's the callers responsibility to ensure that we are 19562 actually in /x mode 19563 19564 */ 19565 19566 PERL_STATIC_INLINE char* 19567 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) 19568 { 19569 PERL_ARGS_ASSERT_REG_SKIPCOMMENT; 19570 19571 assert(*p == '#'); 19572 19573 while (p < RExC_end) { 19574 if (*(++p) == '\n') { 19575 return p+1; 19576 } 19577 } 19578 19579 /* we ran off the end of the pattern without ending the comment, so we have 19580 * to add an \n when wrapping */ 19581 RExC_seen |= REG_RUN_ON_COMMENT_SEEN; 19582 return p; 19583 } 19584 19585 STATIC void 19586 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state, 19587 char ** p, 19588 const bool force_to_xmod 19589 ) 19590 { 19591 /* If the text at the current parse position '*p' is a '(?#...)' comment, 19592 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p' 19593 * is /x whitespace, advance '*p' so that on exit it points to the first 19594 * byte past all such white space and comments */ 19595 19596 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED); 19597 19598 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT; 19599 19600 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p)); 19601 19602 for (;;) { 19603 if (RExC_end - (*p) >= 3 19604 && *(*p) == '(' 19605 && *(*p + 1) == '?' 19606 && *(*p + 2) == '#') 19607 { 19608 while (*(*p) != ')') { 19609 if ((*p) == RExC_end) 19610 FAIL("Sequence (?#... not terminated"); 19611 (*p)++; 19612 } 19613 (*p)++; 19614 continue; 19615 } 19616 19617 if (use_xmod) { 19618 const char * save_p = *p; 19619 while ((*p) < RExC_end) { 19620 STRLEN len; 19621 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) { 19622 (*p) += len; 19623 } 19624 else if (*(*p) == '#') { 19625 (*p) = reg_skipcomment(pRExC_state, (*p)); 19626 } 19627 else { 19628 break; 19629 } 19630 } 19631 if (*p != save_p) { 19632 continue; 19633 } 19634 } 19635 19636 break; 19637 } 19638 19639 return; 19640 } 19641 19642 /* nextchar() 19643 19644 Advances the parse position by one byte, unless that byte is the beginning 19645 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In 19646 those two cases, the parse position is advanced beyond all such comments and 19647 white space. 19648 19649 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++. 19650 */ 19651 19652 STATIC void 19653 S_nextchar(pTHX_ RExC_state_t *pRExC_state) 19654 { 19655 PERL_ARGS_ASSERT_NEXTCHAR; 19656 19657 if (RExC_parse < RExC_end) { 19658 assert( ! UTF 19659 || UTF8_IS_INVARIANT(*RExC_parse) 19660 || UTF8_IS_START(*RExC_parse)); 19661 19662 RExC_parse += (UTF) 19663 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) 19664 : 1; 19665 19666 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 19667 FALSE /* Don't force /x */ ); 19668 } 19669 } 19670 19671 STATIC void 19672 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size) 19673 { 19674 /* 'size' is the delta to add or subtract from the current memory allocated 19675 * to the regex engine being constructed */ 19676 19677 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE; 19678 19679 RExC_size += size; 19680 19681 Renewc(RExC_rxi, 19682 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode), 19683 /* +1 for REG_MAGIC */ 19684 char, 19685 regexp_internal); 19686 if ( RExC_rxi == NULL ) 19687 FAIL("Regexp out of space"); 19688 RXi_SET(RExC_rx, RExC_rxi); 19689 19690 RExC_emit_start = RExC_rxi->program; 19691 if (size > 0) { 19692 Zero(REGNODE_p(RExC_emit), size, regnode); 19693 } 19694 19695 #ifdef RE_TRACK_PATTERN_OFFSETS 19696 Renew(RExC_offsets, 2*RExC_size+1, U32); 19697 if (size > 0) { 19698 Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32); 19699 } 19700 RExC_offsets[0] = RExC_size; 19701 #endif 19702 } 19703 19704 STATIC regnode_offset 19705 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name) 19706 { 19707 /* Allocate a regnode for 'op', with 'extra_size' extra space. It aligns 19708 * and increments RExC_size and RExC_emit 19709 * 19710 * It returns the regnode's offset into the regex engine program */ 19711 19712 const regnode_offset ret = RExC_emit; 19713 19714 GET_RE_DEBUG_FLAGS_DECL; 19715 19716 PERL_ARGS_ASSERT_REGNODE_GUTS; 19717 19718 SIZE_ALIGN(RExC_size); 19719 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size); 19720 NODE_ALIGN_FILL(REGNODE_p(ret)); 19721 #ifndef RE_TRACK_PATTERN_OFFSETS 19722 PERL_UNUSED_ARG(name); 19723 PERL_UNUSED_ARG(op); 19724 #else 19725 assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF); 19726 19727 if (RExC_offsets) { /* MJD */ 19728 MJD_OFFSET_DEBUG( 19729 ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n", 19730 name, __LINE__, 19731 PL_reg_name[op], 19732 (UV)(RExC_emit) > RExC_offsets[0] 19733 ? "Overwriting end of array!\n" : "OK", 19734 (UV)(RExC_emit), 19735 (UV)(RExC_parse - RExC_start), 19736 (UV)RExC_offsets[0])); 19737 Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END)); 19738 } 19739 #endif 19740 return(ret); 19741 } 19742 19743 /* 19744 - reg_node - emit a node 19745 */ 19746 STATIC regnode_offset /* Location. */ 19747 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) 19748 { 19749 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node"); 19750 regnode_offset ptr = ret; 19751 19752 PERL_ARGS_ASSERT_REG_NODE; 19753 19754 assert(regarglen[op] == 0); 19755 19756 FILL_ADVANCE_NODE(ptr, op); 19757 RExC_emit = ptr; 19758 return(ret); 19759 } 19760 19761 /* 19762 - reganode - emit a node with an argument 19763 */ 19764 STATIC regnode_offset /* Location. */ 19765 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) 19766 { 19767 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode"); 19768 regnode_offset ptr = ret; 19769 19770 PERL_ARGS_ASSERT_REGANODE; 19771 19772 /* ANYOF are special cased to allow non-length 1 args */ 19773 assert(regarglen[op] == 1); 19774 19775 FILL_ADVANCE_NODE_ARG(ptr, op, arg); 19776 RExC_emit = ptr; 19777 return(ret); 19778 } 19779 19780 STATIC regnode_offset 19781 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2) 19782 { 19783 /* emit a node with U32 and I32 arguments */ 19784 19785 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode"); 19786 regnode_offset ptr = ret; 19787 19788 PERL_ARGS_ASSERT_REG2LANODE; 19789 19790 assert(regarglen[op] == 2); 19791 19792 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2); 19793 RExC_emit = ptr; 19794 return(ret); 19795 } 19796 19797 /* 19798 - reginsert - insert an operator in front of already-emitted operand 19799 * 19800 * That means that on exit 'operand' is the offset of the newly inserted 19801 * operator, and the original operand has been relocated. 19802 * 19803 * IMPORTANT NOTE - it is the *callers* responsibility to correctly 19804 * set up NEXT_OFF() of the inserted node if needed. Something like this: 19805 * 19806 * reginsert(pRExC, OPFAIL, orig_emit, depth+1); 19807 * NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE; 19808 * 19809 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well. 19810 */ 19811 STATIC void 19812 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, 19813 const regnode_offset operand, const U32 depth) 19814 { 19815 regnode *src; 19816 regnode *dst; 19817 regnode *place; 19818 const int offset = regarglen[(U8)op]; 19819 const int size = NODE_STEP_REGNODE + offset; 19820 GET_RE_DEBUG_FLAGS_DECL; 19821 19822 PERL_ARGS_ASSERT_REGINSERT; 19823 PERL_UNUSED_CONTEXT; 19824 PERL_UNUSED_ARG(depth); 19825 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ 19826 DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]); 19827 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started 19828 studying. If this is wrong then we need to adjust RExC_recurse 19829 below like we do with RExC_open_parens/RExC_close_parens. */ 19830 change_engine_size(pRExC_state, (Ptrdiff_t) size); 19831 src = REGNODE_p(RExC_emit); 19832 RExC_emit += size; 19833 dst = REGNODE_p(RExC_emit); 19834 19835 /* If we are in a "count the parentheses" pass, the numbers are unreliable, 19836 * and [perl #133871] shows this can lead to problems, so skip this 19837 * realignment of parens until a later pass when they are reliable */ 19838 if (! IN_PARENS_PASS && RExC_open_parens) { 19839 int paren; 19840 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/ 19841 /* remember that RExC_npar is rex->nparens + 1, 19842 * iow it is 1 more than the number of parens seen in 19843 * the pattern so far. */ 19844 for ( paren=0 ; paren < RExC_npar ; paren++ ) { 19845 /* note, RExC_open_parens[0] is the start of the 19846 * regex, it can't move. RExC_close_parens[0] is the end 19847 * of the regex, it *can* move. */ 19848 if ( paren && RExC_open_parens[paren] >= operand ) { 19849 /*DEBUG_PARSE_FMT("open"," - %d", size);*/ 19850 RExC_open_parens[paren] += size; 19851 } else { 19852 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ 19853 } 19854 if ( RExC_close_parens[paren] >= operand ) { 19855 /*DEBUG_PARSE_FMT("close"," - %d", size);*/ 19856 RExC_close_parens[paren] += size; 19857 } else { 19858 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ 19859 } 19860 } 19861 } 19862 if (RExC_end_op) 19863 RExC_end_op += size; 19864 19865 while (src > REGNODE_p(operand)) { 19866 StructCopy(--src, --dst, regnode); 19867 #ifdef RE_TRACK_PATTERN_OFFSETS 19868 if (RExC_offsets) { /* MJD 20010112 */ 19869 MJD_OFFSET_DEBUG( 19870 ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n", 19871 "reginsert", 19872 __LINE__, 19873 PL_reg_name[op], 19874 (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0] 19875 ? "Overwriting end of array!\n" : "OK", 19876 (UV)REGNODE_OFFSET(src), 19877 (UV)REGNODE_OFFSET(dst), 19878 (UV)RExC_offsets[0])); 19879 Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src)); 19880 Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src)); 19881 } 19882 #endif 19883 } 19884 19885 place = REGNODE_p(operand); /* Op node, where operand used to be. */ 19886 #ifdef RE_TRACK_PATTERN_OFFSETS 19887 if (RExC_offsets) { /* MJD */ 19888 MJD_OFFSET_DEBUG( 19889 ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n", 19890 "reginsert", 19891 __LINE__, 19892 PL_reg_name[op], 19893 (UV)REGNODE_OFFSET(place) > RExC_offsets[0] 19894 ? "Overwriting end of array!\n" : "OK", 19895 (UV)REGNODE_OFFSET(place), 19896 (UV)(RExC_parse - RExC_start), 19897 (UV)RExC_offsets[0])); 19898 Set_Node_Offset(place, RExC_parse); 19899 Set_Node_Length(place, 1); 19900 } 19901 #endif 19902 src = NEXTOPER(place); 19903 FLAGS(place) = 0; 19904 FILL_NODE(operand, op); 19905 19906 /* Zero out any arguments in the new node */ 19907 Zero(src, offset, regnode); 19908 } 19909 19910 /* 19911 - regtail - set the next-pointer at the end of a node chain of p to val. If 19912 that value won't fit in the space available, instead returns FALSE. 19913 (Except asserts if we can't fit in the largest space the regex 19914 engine is designed for.) 19915 - SEE ALSO: regtail_study 19916 */ 19917 STATIC bool 19918 S_regtail(pTHX_ RExC_state_t * pRExC_state, 19919 const regnode_offset p, 19920 const regnode_offset val, 19921 const U32 depth) 19922 { 19923 regnode_offset scan; 19924 GET_RE_DEBUG_FLAGS_DECL; 19925 19926 PERL_ARGS_ASSERT_REGTAIL; 19927 #ifndef DEBUGGING 19928 PERL_UNUSED_ARG(depth); 19929 #endif 19930 19931 /* Find last node. */ 19932 scan = (regnode_offset) p; 19933 for (;;) { 19934 regnode * const temp = regnext(REGNODE_p(scan)); 19935 DEBUG_PARSE_r({ 19936 DEBUG_PARSE_MSG((scan==p ? "tail" : "")); 19937 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state); 19938 Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n", 19939 SvPV_nolen_const(RExC_mysv), scan, 19940 (temp == NULL ? "->" : ""), 19941 (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "") 19942 ); 19943 }); 19944 if (temp == NULL) 19945 break; 19946 scan = REGNODE_OFFSET(temp); 19947 } 19948 19949 if (reg_off_by_arg[OP(REGNODE_p(scan))]) { 19950 assert((UV) (val - scan) <= U32_MAX); 19951 ARG_SET(REGNODE_p(scan), val - scan); 19952 } 19953 else { 19954 if (val - scan > U16_MAX) { 19955 /* Populate this with something that won't loop and will likely 19956 * lead to a crash if the caller ignores the failure return, and 19957 * execution continues */ 19958 NEXT_OFF(REGNODE_p(scan)) = U16_MAX; 19959 return FALSE; 19960 } 19961 NEXT_OFF(REGNODE_p(scan)) = val - scan; 19962 } 19963 19964 return TRUE; 19965 } 19966 19967 #ifdef DEBUGGING 19968 /* 19969 - regtail_study - set the next-pointer at the end of a node chain of p to val. 19970 - Look for optimizable sequences at the same time. 19971 - currently only looks for EXACT chains. 19972 19973 This is experimental code. The idea is to use this routine to perform 19974 in place optimizations on branches and groups as they are constructed, 19975 with the long term intention of removing optimization from study_chunk so 19976 that it is purely analytical. 19977 19978 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used 19979 to control which is which. 19980 19981 This used to return a value that was ignored. It was a problem that it is 19982 #ifdef'd to be another function that didn't return a value. khw has changed it 19983 so both currently return a pass/fail return. 19984 19985 */ 19986 /* TODO: All four parms should be const */ 19987 19988 STATIC bool 19989 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, 19990 const regnode_offset val, U32 depth) 19991 { 19992 regnode_offset scan; 19993 U8 exact = PSEUDO; 19994 #ifdef EXPERIMENTAL_INPLACESCAN 19995 I32 min = 0; 19996 #endif 19997 GET_RE_DEBUG_FLAGS_DECL; 19998 19999 PERL_ARGS_ASSERT_REGTAIL_STUDY; 20000 20001 20002 /* Find last node. */ 20003 20004 scan = p; 20005 for (;;) { 20006 regnode * const temp = regnext(REGNODE_p(scan)); 20007 #ifdef EXPERIMENTAL_INPLACESCAN 20008 if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) { 20009 bool unfolded_multi_char; /* Unexamined in this routine */ 20010 if (join_exact(pRExC_state, scan, &min, 20011 &unfolded_multi_char, 1, REGNODE_p(val), depth+1)) 20012 return TRUE; /* Was return EXACT */ 20013 } 20014 #endif 20015 if ( exact ) { 20016 switch (OP(REGNODE_p(scan))) { 20017 case EXACT: 20018 case EXACT_ONLY8: 20019 case EXACTL: 20020 case EXACTF: 20021 case EXACTFU_S_EDGE: 20022 case EXACTFAA_NO_TRIE: 20023 case EXACTFAA: 20024 case EXACTFU: 20025 case EXACTFU_ONLY8: 20026 case EXACTFLU8: 20027 case EXACTFUP: 20028 case EXACTFL: 20029 if( exact == PSEUDO ) 20030 exact= OP(REGNODE_p(scan)); 20031 else if ( exact != OP(REGNODE_p(scan)) ) 20032 exact= 0; 20033 case NOTHING: 20034 break; 20035 default: 20036 exact= 0; 20037 } 20038 } 20039 DEBUG_PARSE_r({ 20040 DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); 20041 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state); 20042 Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n", 20043 SvPV_nolen_const(RExC_mysv), 20044 scan, 20045 PL_reg_name[exact]); 20046 }); 20047 if (temp == NULL) 20048 break; 20049 scan = REGNODE_OFFSET(temp); 20050 } 20051 DEBUG_PARSE_r({ 20052 DEBUG_PARSE_MSG(""); 20053 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state); 20054 Perl_re_printf( aTHX_ 20055 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n", 20056 SvPV_nolen_const(RExC_mysv), 20057 (IV)val, 20058 (IV)(val - scan) 20059 ); 20060 }); 20061 if (reg_off_by_arg[OP(REGNODE_p(scan))]) { 20062 assert((UV) (val - scan) <= U32_MAX); 20063 ARG_SET(REGNODE_p(scan), val - scan); 20064 } 20065 else { 20066 if (val - scan > U16_MAX) { 20067 /* Populate this with something that won't loop and will likely 20068 * lead to a crash if the caller ignores the failure return, and 20069 * execution continues */ 20070 NEXT_OFF(REGNODE_p(scan)) = U16_MAX; 20071 return FALSE; 20072 } 20073 NEXT_OFF(REGNODE_p(scan)) = val - scan; 20074 } 20075 20076 return TRUE; /* Was 'return exact' */ 20077 } 20078 #endif 20079 20080 STATIC SV* 20081 S_get_ANYOFM_contents(pTHX_ const regnode * n) { 20082 20083 /* Returns an inversion list of all the code points matched by the 20084 * ANYOFM/NANYOFM node 'n' */ 20085 20086 SV * cp_list = _new_invlist(-1); 20087 const U8 lowest = (U8) ARG(n); 20088 unsigned int i; 20089 U8 count = 0; 20090 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)]; 20091 20092 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS; 20093 20094 /* Starting with the lowest code point, any code point that ANDed with the 20095 * mask yields the lowest code point is in the set */ 20096 for (i = lowest; i <= 0xFF; i++) { 20097 if ((i & FLAGS(n)) == ARG(n)) { 20098 cp_list = add_cp_to_invlist(cp_list, i); 20099 count++; 20100 20101 /* We know how many code points (a power of two) that are in the 20102 * set. No use looking once we've got that number */ 20103 if (count >= needed) break; 20104 } 20105 } 20106 20107 if (OP(n) == NANYOFM) { 20108 _invlist_invert(cp_list); 20109 } 20110 return cp_list; 20111 } 20112 20113 /* 20114 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form 20115 */ 20116 #ifdef DEBUGGING 20117 20118 static void 20119 S_regdump_intflags(pTHX_ const char *lead, const U32 flags) 20120 { 20121 int bit; 20122 int set=0; 20123 20124 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); 20125 20126 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) { 20127 if (flags & (1<<bit)) { 20128 if (!set++ && lead) 20129 Perl_re_printf( aTHX_ "%s", lead); 20130 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]); 20131 } 20132 } 20133 if (lead) { 20134 if (set) 20135 Perl_re_printf( aTHX_ "\n"); 20136 else 20137 Perl_re_printf( aTHX_ "%s[none-set]\n", lead); 20138 } 20139 } 20140 20141 static void 20142 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) 20143 { 20144 int bit; 20145 int set=0; 20146 regex_charset cs; 20147 20148 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8); 20149 20150 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) { 20151 if (flags & (1<<bit)) { 20152 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */ 20153 continue; 20154 } 20155 if (!set++ && lead) 20156 Perl_re_printf( aTHX_ "%s", lead); 20157 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]); 20158 } 20159 } 20160 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) { 20161 if (!set++ && lead) { 20162 Perl_re_printf( aTHX_ "%s", lead); 20163 } 20164 switch (cs) { 20165 case REGEX_UNICODE_CHARSET: 20166 Perl_re_printf( aTHX_ "UNICODE"); 20167 break; 20168 case REGEX_LOCALE_CHARSET: 20169 Perl_re_printf( aTHX_ "LOCALE"); 20170 break; 20171 case REGEX_ASCII_RESTRICTED_CHARSET: 20172 Perl_re_printf( aTHX_ "ASCII-RESTRICTED"); 20173 break; 20174 case REGEX_ASCII_MORE_RESTRICTED_CHARSET: 20175 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED"); 20176 break; 20177 default: 20178 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET"); 20179 break; 20180 } 20181 } 20182 if (lead) { 20183 if (set) 20184 Perl_re_printf( aTHX_ "\n"); 20185 else 20186 Perl_re_printf( aTHX_ "%s[none-set]\n", lead); 20187 } 20188 } 20189 #endif 20190 20191 void 20192 Perl_regdump(pTHX_ const regexp *r) 20193 { 20194 #ifdef DEBUGGING 20195 int i; 20196 SV * const sv = sv_newmortal(); 20197 SV *dsv= sv_newmortal(); 20198 RXi_GET_DECL(r, ri); 20199 GET_RE_DEBUG_FLAGS_DECL; 20200 20201 PERL_ARGS_ASSERT_REGDUMP; 20202 20203 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); 20204 20205 /* Header fields of interest. */ 20206 for (i = 0; i < 2; i++) { 20207 if (r->substrs->data[i].substr) { 20208 RE_PV_QUOTED_DECL(s, 0, dsv, 20209 SvPVX_const(r->substrs->data[i].substr), 20210 RE_SV_DUMPLEN(r->substrs->data[i].substr), 20211 PL_dump_re_max_len); 20212 Perl_re_printf( aTHX_ 20213 "%s %s%s at %" IVdf "..%" UVuf " ", 20214 i ? "floating" : "anchored", 20215 s, 20216 RE_SV_TAIL(r->substrs->data[i].substr), 20217 (IV)r->substrs->data[i].min_offset, 20218 (UV)r->substrs->data[i].max_offset); 20219 } 20220 else if (r->substrs->data[i].utf8_substr) { 20221 RE_PV_QUOTED_DECL(s, 1, dsv, 20222 SvPVX_const(r->substrs->data[i].utf8_substr), 20223 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr), 20224 30); 20225 Perl_re_printf( aTHX_ 20226 "%s utf8 %s%s at %" IVdf "..%" UVuf " ", 20227 i ? "floating" : "anchored", 20228 s, 20229 RE_SV_TAIL(r->substrs->data[i].utf8_substr), 20230 (IV)r->substrs->data[i].min_offset, 20231 (UV)r->substrs->data[i].max_offset); 20232 } 20233 } 20234 20235 if (r->check_substr || r->check_utf8) 20236 Perl_re_printf( aTHX_ 20237 (const char *) 20238 ( r->check_substr == r->substrs->data[1].substr 20239 && r->check_utf8 == r->substrs->data[1].utf8_substr 20240 ? "(checking floating" : "(checking anchored")); 20241 if (r->intflags & PREGf_NOSCAN) 20242 Perl_re_printf( aTHX_ " noscan"); 20243 if (r->extflags & RXf_CHECK_ALL) 20244 Perl_re_printf( aTHX_ " isall"); 20245 if (r->check_substr || r->check_utf8) 20246 Perl_re_printf( aTHX_ ") "); 20247 20248 if (ri->regstclass) { 20249 regprop(r, sv, ri->regstclass, NULL, NULL); 20250 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv)); 20251 } 20252 if (r->intflags & PREGf_ANCH) { 20253 Perl_re_printf( aTHX_ "anchored"); 20254 if (r->intflags & PREGf_ANCH_MBOL) 20255 Perl_re_printf( aTHX_ "(MBOL)"); 20256 if (r->intflags & PREGf_ANCH_SBOL) 20257 Perl_re_printf( aTHX_ "(SBOL)"); 20258 if (r->intflags & PREGf_ANCH_GPOS) 20259 Perl_re_printf( aTHX_ "(GPOS)"); 20260 Perl_re_printf( aTHX_ " "); 20261 } 20262 if (r->intflags & PREGf_GPOS_SEEN) 20263 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs); 20264 if (r->intflags & PREGf_SKIP) 20265 Perl_re_printf( aTHX_ "plus "); 20266 if (r->intflags & PREGf_IMPLICIT) 20267 Perl_re_printf( aTHX_ "implicit "); 20268 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen); 20269 if (r->extflags & RXf_EVAL_SEEN) 20270 Perl_re_printf( aTHX_ "with eval "); 20271 Perl_re_printf( aTHX_ "\n"); 20272 DEBUG_FLAGS_r({ 20273 regdump_extflags("r->extflags: ", r->extflags); 20274 regdump_intflags("r->intflags: ", r->intflags); 20275 }); 20276 #else 20277 PERL_ARGS_ASSERT_REGDUMP; 20278 PERL_UNUSED_CONTEXT; 20279 PERL_UNUSED_ARG(r); 20280 #endif /* DEBUGGING */ 20281 } 20282 20283 /* Should be synchronized with ANYOF_ #defines in regcomp.h */ 20284 #ifdef DEBUGGING 20285 20286 # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \ 20287 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \ 20288 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \ 20289 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \ 20290 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \ 20291 || _CC_VERTSPACE != 15 20292 # error Need to adjust order of anyofs[] 20293 # endif 20294 static const char * const anyofs[] = { 20295 "\\w", 20296 "\\W", 20297 "\\d", 20298 "\\D", 20299 "[:alpha:]", 20300 "[:^alpha:]", 20301 "[:lower:]", 20302 "[:^lower:]", 20303 "[:upper:]", 20304 "[:^upper:]", 20305 "[:punct:]", 20306 "[:^punct:]", 20307 "[:print:]", 20308 "[:^print:]", 20309 "[:alnum:]", 20310 "[:^alnum:]", 20311 "[:graph:]", 20312 "[:^graph:]", 20313 "[:cased:]", 20314 "[:^cased:]", 20315 "\\s", 20316 "\\S", 20317 "[:blank:]", 20318 "[:^blank:]", 20319 "[:xdigit:]", 20320 "[:^xdigit:]", 20321 "[:cntrl:]", 20322 "[:^cntrl:]", 20323 "[:ascii:]", 20324 "[:^ascii:]", 20325 "\\v", 20326 "\\V" 20327 }; 20328 #endif 20329 20330 /* 20331 - regprop - printable representation of opcode, with run time support 20332 */ 20333 20334 void 20335 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) 20336 { 20337 #ifdef DEBUGGING 20338 dVAR; 20339 int k; 20340 RXi_GET_DECL(prog, progi); 20341 GET_RE_DEBUG_FLAGS_DECL; 20342 20343 PERL_ARGS_ASSERT_REGPROP; 20344 20345 SvPVCLEAR(sv); 20346 20347 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ 20348 /* It would be nice to FAIL() here, but this may be called from 20349 regexec.c, and it would be hard to supply pRExC_state. */ 20350 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", 20351 (int)OP(o), (int)REGNODE_MAX); 20352 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ 20353 20354 k = PL_regkind[OP(o)]; 20355 20356 if (k == EXACT) { 20357 sv_catpvs(sv, " "); 20358 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) 20359 * is a crude hack but it may be the best for now since 20360 * we have no flag "this EXACTish node was UTF-8" 20361 * --jhi */ 20362 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len, 20363 PL_colors[0], PL_colors[1], 20364 PERL_PV_ESCAPE_UNI_DETECT | 20365 PERL_PV_ESCAPE_NONASCII | 20366 PERL_PV_PRETTY_ELLIPSES | 20367 PERL_PV_PRETTY_LTGT | 20368 PERL_PV_PRETTY_NOCLEAR 20369 ); 20370 } else if (k == TRIE) { 20371 /* print the details of the trie in dumpuntil instead, as 20372 * progi->data isn't available here */ 20373 const char op = OP(o); 20374 const U32 n = ARG(o); 20375 const reg_ac_data * const ac = IS_TRIE_AC(op) ? 20376 (reg_ac_data *)progi->data->data[n] : 20377 NULL; 20378 const reg_trie_data * const trie 20379 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; 20380 20381 Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]); 20382 DEBUG_TRIE_COMPILE_r({ 20383 if (trie->jump) 20384 sv_catpvs(sv, "(JUMP)"); 20385 Perl_sv_catpvf(aTHX_ sv, 20386 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">", 20387 (UV)trie->startstate, 20388 (IV)trie->statecount-1, /* -1 because of the unused 0 element */ 20389 (UV)trie->wordcount, 20390 (UV)trie->minlen, 20391 (UV)trie->maxlen, 20392 (UV)TRIE_CHARCOUNT(trie), 20393 (UV)trie->uniquecharcount 20394 ); 20395 }); 20396 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { 20397 sv_catpvs(sv, "["); 20398 (void) put_charclass_bitmap_innards(sv, 20399 ((IS_ANYOF_TRIE(op)) 20400 ? ANYOF_BITMAP(o) 20401 : TRIE_BITMAP(trie)), 20402 NULL, 20403 NULL, 20404 NULL, 20405 FALSE 20406 ); 20407 sv_catpvs(sv, "]"); 20408 } 20409 } else if (k == CURLY) { 20410 U32 lo = ARG1(o), hi = ARG2(o); 20411 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) 20412 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ 20413 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo); 20414 if (hi == REG_INFTY) 20415 sv_catpvs(sv, "INFTY"); 20416 else 20417 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi); 20418 sv_catpvs(sv, "}"); 20419 } 20420 else if (k == WHILEM && o->flags) /* Ordinal/of */ 20421 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); 20422 else if (k == REF || k == OPEN || k == CLOSE 20423 || k == GROUPP || OP(o)==ACCEPT) 20424 { 20425 AV *name_list= NULL; 20426 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o); 20427 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */ 20428 if ( RXp_PAREN_NAMES(prog) ) { 20429 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 20430 } else if ( pRExC_state ) { 20431 name_list= RExC_paren_name_list; 20432 } 20433 if (name_list) { 20434 if ( k != REF || (OP(o) < NREF)) { 20435 SV **name= av_fetch(name_list, parno, 0 ); 20436 if (name) 20437 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); 20438 } 20439 else { 20440 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]); 20441 I32 *nums=(I32*)SvPVX(sv_dat); 20442 SV **name= av_fetch(name_list, nums[0], 0 ); 20443 I32 n; 20444 if (name) { 20445 for ( n=0; n<SvIVX(sv_dat); n++ ) { 20446 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf, 20447 (n ? "," : ""), (IV)nums[n]); 20448 } 20449 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); 20450 } 20451 } 20452 } 20453 if ( k == REF && reginfo) { 20454 U32 n = ARG(o); /* which paren pair */ 20455 I32 ln = prog->offs[n].start; 20456 if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1) 20457 Perl_sv_catpvf(aTHX_ sv, ": FAIL"); 20458 else if (ln == prog->offs[n].end) 20459 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); 20460 else { 20461 const char *s = reginfo->strbeg + ln; 20462 Perl_sv_catpvf(aTHX_ sv, ": "); 20463 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, 20464 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); 20465 } 20466 } 20467 } else if (k == GOSUB) { 20468 AV *name_list= NULL; 20469 if ( RXp_PAREN_NAMES(prog) ) { 20470 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 20471 } else if ( pRExC_state ) { 20472 name_list= RExC_paren_name_list; 20473 } 20474 20475 /* Paren and offset */ 20476 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o), 20477 (int)((o + (int)ARG2L(o)) - progi->program) ); 20478 if (name_list) { 20479 SV **name= av_fetch(name_list, ARG(o), 0 ); 20480 if (name) 20481 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); 20482 } 20483 } 20484 else if (k == LOGICAL) 20485 /* 2: embedded, otherwise 1 */ 20486 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); 20487 else if (k == ANYOF) { 20488 const U8 flags = (OP(o) == ANYOFH) ? 0 : ANYOF_FLAGS(o); 20489 bool do_sep = FALSE; /* Do we need to separate various components of 20490 the output? */ 20491 /* Set if there is still an unresolved user-defined property */ 20492 SV *unresolved = NULL; 20493 20494 /* Things that are ignored except when the runtime locale is UTF-8 */ 20495 SV *only_utf8_locale_invlist = NULL; 20496 20497 /* Code points that don't fit in the bitmap */ 20498 SV *nonbitmap_invlist = NULL; 20499 20500 /* And things that aren't in the bitmap, but are small enough to be */ 20501 SV* bitmap_range_not_in_bitmap = NULL; 20502 20503 const bool inverted = flags & ANYOF_INVERT; 20504 20505 if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) { 20506 if (ANYOFL_UTF8_LOCALE_REQD(flags)) { 20507 sv_catpvs(sv, "{utf8-locale-reqd}"); 20508 } 20509 if (flags & ANYOFL_FOLD) { 20510 sv_catpvs(sv, "{i}"); 20511 } 20512 } 20513 20514 /* If there is stuff outside the bitmap, get it */ 20515 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { 20516 (void) _get_regclass_nonbitmap_data(prog, o, FALSE, 20517 &unresolved, 20518 &only_utf8_locale_invlist, 20519 &nonbitmap_invlist); 20520 /* The non-bitmap data may contain stuff that could fit in the 20521 * bitmap. This could come from a user-defined property being 20522 * finally resolved when this call was done; or much more likely 20523 * because there are matches that require UTF-8 to be valid, and so 20524 * aren't in the bitmap. This is teased apart later */ 20525 _invlist_intersection(nonbitmap_invlist, 20526 PL_InBitmap, 20527 &bitmap_range_not_in_bitmap); 20528 /* Leave just the things that don't fit into the bitmap */ 20529 _invlist_subtract(nonbitmap_invlist, 20530 PL_InBitmap, 20531 &nonbitmap_invlist); 20532 } 20533 20534 /* Obey this flag to add all above-the-bitmap code points */ 20535 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { 20536 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, 20537 NUM_ANYOF_CODE_POINTS, 20538 UV_MAX); 20539 } 20540 20541 /* Ready to start outputting. First, the initial left bracket */ 20542 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 20543 20544 if (OP(o) != ANYOFH) { 20545 /* Then all the things that could fit in the bitmap */ 20546 do_sep = put_charclass_bitmap_innards(sv, 20547 ANYOF_BITMAP(o), 20548 bitmap_range_not_in_bitmap, 20549 only_utf8_locale_invlist, 20550 o, 20551 20552 /* Can't try inverting for a 20553 * better display if there 20554 * are things that haven't 20555 * been resolved */ 20556 unresolved != NULL); 20557 SvREFCNT_dec(bitmap_range_not_in_bitmap); 20558 20559 /* If there are user-defined properties which haven't been defined 20560 * yet, output them. If the result is not to be inverted, it is 20561 * clearest to output them in a separate [] from the bitmap range 20562 * stuff. If the result is to be complemented, we have to show 20563 * everything in one [], as the inversion applies to the whole 20564 * thing. Use {braces} to separate them from anything in the 20565 * bitmap and anything above the bitmap. */ 20566 if (unresolved) { 20567 if (inverted) { 20568 if (! do_sep) { /* If didn't output anything in the bitmap 20569 */ 20570 sv_catpvs(sv, "^"); 20571 } 20572 sv_catpvs(sv, "{"); 20573 } 20574 else if (do_sep) { 20575 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], 20576 PL_colors[0]); 20577 } 20578 sv_catsv(sv, unresolved); 20579 if (inverted) { 20580 sv_catpvs(sv, "}"); 20581 } 20582 do_sep = ! inverted; 20583 } 20584 } 20585 20586 /* And, finally, add the above-the-bitmap stuff */ 20587 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) { 20588 SV* contents; 20589 20590 /* See if truncation size is overridden */ 20591 const STRLEN dump_len = (PL_dump_re_max_len > 256) 20592 ? PL_dump_re_max_len 20593 : 256; 20594 20595 /* This is output in a separate [] */ 20596 if (do_sep) { 20597 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]); 20598 } 20599 20600 /* And, for easy of understanding, it is shown in the 20601 * uncomplemented form if possible. The one exception being if 20602 * there are unresolved items, where the inversion has to be 20603 * delayed until runtime */ 20604 if (inverted && ! unresolved) { 20605 _invlist_invert(nonbitmap_invlist); 20606 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist); 20607 } 20608 20609 contents = invlist_contents(nonbitmap_invlist, 20610 FALSE /* output suitable for catsv */ 20611 ); 20612 20613 /* If the output is shorter than the permissible maximum, just do it. */ 20614 if (SvCUR(contents) <= dump_len) { 20615 sv_catsv(sv, contents); 20616 } 20617 else { 20618 const char * contents_string = SvPVX(contents); 20619 STRLEN i = dump_len; 20620 20621 /* Otherwise, start at the permissible max and work back to the 20622 * first break possibility */ 20623 while (i > 0 && contents_string[i] != ' ') { 20624 i--; 20625 } 20626 if (i == 0) { /* Fail-safe. Use the max if we couldn't 20627 find a legal break */ 20628 i = dump_len; 20629 } 20630 20631 sv_catpvn(sv, contents_string, i); 20632 sv_catpvs(sv, "..."); 20633 } 20634 20635 SvREFCNT_dec_NN(contents); 20636 SvREFCNT_dec_NN(nonbitmap_invlist); 20637 } 20638 20639 /* And finally the matching, closing ']' */ 20640 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 20641 20642 if (OP(o) == ANYOFH && FLAGS(o) != 0) { 20643 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o)); 20644 } 20645 20646 20647 SvREFCNT_dec(unresolved); 20648 } 20649 else if (k == ANYOFM) { 20650 SV * cp_list = get_ANYOFM_contents(o); 20651 20652 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 20653 if (OP(o) == NANYOFM) { 20654 _invlist_invert(cp_list); 20655 } 20656 20657 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE); 20658 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 20659 20660 SvREFCNT_dec(cp_list); 20661 } 20662 else if (k == POSIXD || k == NPOSIXD) { 20663 U8 index = FLAGS(o) * 2; 20664 if (index < C_ARRAY_LENGTH(anyofs)) { 20665 if (*anyofs[index] != '[') { 20666 sv_catpvs(sv, "["); 20667 } 20668 sv_catpv(sv, anyofs[index]); 20669 if (*anyofs[index] != '[') { 20670 sv_catpvs(sv, "]"); 20671 } 20672 } 20673 else { 20674 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); 20675 } 20676 } 20677 else if (k == BOUND || k == NBOUND) { 20678 /* Must be synced with order of 'bound_type' in regcomp.h */ 20679 const char * const bounds[] = { 20680 "", /* Traditional */ 20681 "{gcb}", 20682 "{lb}", 20683 "{sb}", 20684 "{wb}" 20685 }; 20686 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds)); 20687 sv_catpv(sv, bounds[FLAGS(o)]); 20688 } 20689 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) { 20690 Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags)); 20691 if (o->next_off) { 20692 Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off); 20693 } 20694 Perl_sv_catpvf(aTHX_ sv, "]"); 20695 } 20696 else if (OP(o) == SBOL) 20697 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^"); 20698 20699 /* add on the verb argument if there is one */ 20700 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) { 20701 if ( ARG(o) ) 20702 Perl_sv_catpvf(aTHX_ sv, ":%" SVf, 20703 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); 20704 else 20705 sv_catpvs(sv, ":NULL"); 20706 } 20707 #else 20708 PERL_UNUSED_CONTEXT; 20709 PERL_UNUSED_ARG(sv); 20710 PERL_UNUSED_ARG(o); 20711 PERL_UNUSED_ARG(prog); 20712 PERL_UNUSED_ARG(reginfo); 20713 PERL_UNUSED_ARG(pRExC_state); 20714 #endif /* DEBUGGING */ 20715 } 20716 20717 20718 20719 SV * 20720 Perl_re_intuit_string(pTHX_ REGEXP * const r) 20721 { /* Assume that RE_INTUIT is set */ 20722 struct regexp *const prog = ReANY(r); 20723 GET_RE_DEBUG_FLAGS_DECL; 20724 20725 PERL_ARGS_ASSERT_RE_INTUIT_STRING; 20726 PERL_UNUSED_CONTEXT; 20727 20728 DEBUG_COMPILE_r( 20729 { 20730 const char * const s = SvPV_nolen_const(RX_UTF8(r) 20731 ? prog->check_utf8 : prog->check_substr); 20732 20733 if (!PL_colorset) reginitcolors(); 20734 Perl_re_printf( aTHX_ 20735 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", 20736 PL_colors[4], 20737 RX_UTF8(r) ? "utf8 " : "", 20738 PL_colors[5], PL_colors[0], 20739 s, 20740 PL_colors[1], 20741 (strlen(s) > PL_dump_re_max_len ? "..." : "")); 20742 } ); 20743 20744 /* use UTF8 check substring if regexp pattern itself is in UTF8 */ 20745 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr; 20746 } 20747 20748 /* 20749 pregfree() 20750 20751 handles refcounting and freeing the perl core regexp structure. When 20752 it is necessary to actually free the structure the first thing it 20753 does is call the 'free' method of the regexp_engine associated to 20754 the regexp, allowing the handling of the void *pprivate; member 20755 first. (This routine is not overridable by extensions, which is why 20756 the extensions free is called first.) 20757 20758 See regdupe and regdupe_internal if you change anything here. 20759 */ 20760 #ifndef PERL_IN_XSUB_RE 20761 void 20762 Perl_pregfree(pTHX_ REGEXP *r) 20763 { 20764 SvREFCNT_dec(r); 20765 } 20766 20767 void 20768 Perl_pregfree2(pTHX_ REGEXP *rx) 20769 { 20770 struct regexp *const r = ReANY(rx); 20771 GET_RE_DEBUG_FLAGS_DECL; 20772 20773 PERL_ARGS_ASSERT_PREGFREE2; 20774 20775 if (! r) 20776 return; 20777 20778 if (r->mother_re) { 20779 ReREFCNT_dec(r->mother_re); 20780 } else { 20781 CALLREGFREE_PVT(rx); /* free the private data */ 20782 SvREFCNT_dec(RXp_PAREN_NAMES(r)); 20783 } 20784 if (r->substrs) { 20785 int i; 20786 for (i = 0; i < 2; i++) { 20787 SvREFCNT_dec(r->substrs->data[i].substr); 20788 SvREFCNT_dec(r->substrs->data[i].utf8_substr); 20789 } 20790 Safefree(r->substrs); 20791 } 20792 RX_MATCH_COPY_FREE(rx); 20793 #ifdef PERL_ANY_COW 20794 SvREFCNT_dec(r->saved_copy); 20795 #endif 20796 Safefree(r->offs); 20797 SvREFCNT_dec(r->qr_anoncv); 20798 if (r->recurse_locinput) 20799 Safefree(r->recurse_locinput); 20800 } 20801 20802 20803 /* reg_temp_copy() 20804 20805 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV, 20806 except that dsv will be created if NULL. 20807 20808 This function is used in two main ways. First to implement 20809 $r = qr/....; $s = $$r; 20810 20811 Secondly, it is used as a hacky workaround to the structural issue of 20812 match results 20813 being stored in the regexp structure which is in turn stored in 20814 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern 20815 could be PL_curpm in multiple contexts, and could require multiple 20816 result sets being associated with the pattern simultaneously, such 20817 as when doing a recursive match with (??{$qr}) 20818 20819 The solution is to make a lightweight copy of the regexp structure 20820 when a qr// is returned from the code executed by (??{$qr}) this 20821 lightweight copy doesn't actually own any of its data except for 20822 the starp/end and the actual regexp structure itself. 20823 20824 */ 20825 20826 20827 REGEXP * 20828 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv) 20829 { 20830 struct regexp *drx; 20831 struct regexp *const srx = ReANY(ssv); 20832 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV; 20833 20834 PERL_ARGS_ASSERT_REG_TEMP_COPY; 20835 20836 if (!dsv) 20837 dsv = (REGEXP*) newSV_type(SVt_REGEXP); 20838 else { 20839 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV)); 20840 20841 /* our only valid caller, sv_setsv_flags(), should have done 20842 * a SV_CHECK_THINKFIRST_COW_DROP() by now */ 20843 assert(!SvOOK(dsv)); 20844 assert(!SvIsCOW(dsv)); 20845 assert(!SvROK(dsv)); 20846 20847 if (SvPVX_const(dsv)) { 20848 if (SvLEN(dsv)) 20849 Safefree(SvPVX(dsv)); 20850 SvPVX(dsv) = NULL; 20851 } 20852 SvLEN_set(dsv, 0); 20853 SvCUR_set(dsv, 0); 20854 SvOK_off((SV *)dsv); 20855 20856 if (islv) { 20857 /* For PVLVs, the head (sv_any) points to an XPVLV, while 20858 * the LV's xpvlenu_rx will point to a regexp body, which 20859 * we allocate here */ 20860 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP); 20861 assert(!SvPVX(dsv)); 20862 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any; 20863 temp->sv_any = NULL; 20864 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL; 20865 SvREFCNT_dec_NN(temp); 20866 /* SvCUR still resides in the xpvlv struct, so the regexp copy- 20867 ing below will not set it. */ 20868 SvCUR_set(dsv, SvCUR(ssv)); 20869 } 20870 } 20871 /* This ensures that SvTHINKFIRST(sv) is true, and hence that 20872 sv_force_normal(sv) is called. */ 20873 SvFAKE_on(dsv); 20874 drx = ReANY(dsv); 20875 20876 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8); 20877 SvPV_set(dsv, RX_WRAPPED(ssv)); 20878 /* We share the same string buffer as the original regexp, on which we 20879 hold a reference count, incremented when mother_re is set below. 20880 The string pointer is copied here, being part of the regexp struct. 20881 */ 20882 memcpy(&(drx->xpv_cur), &(srx->xpv_cur), 20883 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); 20884 if (!islv) 20885 SvLEN_set(dsv, 0); 20886 if (srx->offs) { 20887 const I32 npar = srx->nparens+1; 20888 Newx(drx->offs, npar, regexp_paren_pair); 20889 Copy(srx->offs, drx->offs, npar, regexp_paren_pair); 20890 } 20891 if (srx->substrs) { 20892 int i; 20893 Newx(drx->substrs, 1, struct reg_substr_data); 20894 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data); 20895 20896 for (i = 0; i < 2; i++) { 20897 SvREFCNT_inc_void(drx->substrs->data[i].substr); 20898 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr); 20899 } 20900 20901 /* check_substr and check_utf8, if non-NULL, point to either their 20902 anchored or float namesakes, and don't hold a second reference. */ 20903 } 20904 RX_MATCH_COPIED_off(dsv); 20905 #ifdef PERL_ANY_COW 20906 drx->saved_copy = NULL; 20907 #endif 20908 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv); 20909 SvREFCNT_inc_void(drx->qr_anoncv); 20910 if (srx->recurse_locinput) 20911 Newx(drx->recurse_locinput, srx->nparens + 1, char *); 20912 20913 return dsv; 20914 } 20915 #endif 20916 20917 20918 /* regfree_internal() 20919 20920 Free the private data in a regexp. This is overloadable by 20921 extensions. Perl takes care of the regexp structure in pregfree(), 20922 this covers the *pprivate pointer which technically perl doesn't 20923 know about, however of course we have to handle the 20924 regexp_internal structure when no extension is in use. 20925 20926 Note this is called before freeing anything in the regexp 20927 structure. 20928 */ 20929 20930 void 20931 Perl_regfree_internal(pTHX_ REGEXP * const rx) 20932 { 20933 struct regexp *const r = ReANY(rx); 20934 RXi_GET_DECL(r, ri); 20935 GET_RE_DEBUG_FLAGS_DECL; 20936 20937 PERL_ARGS_ASSERT_REGFREE_INTERNAL; 20938 20939 if (! ri) { 20940 return; 20941 } 20942 20943 DEBUG_COMPILE_r({ 20944 if (!PL_colorset) 20945 reginitcolors(); 20946 { 20947 SV *dsv= sv_newmortal(); 20948 RE_PV_QUOTED_DECL(s, RX_UTF8(rx), 20949 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len); 20950 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n", 20951 PL_colors[4], PL_colors[5], s); 20952 } 20953 }); 20954 20955 #ifdef RE_TRACK_PATTERN_OFFSETS 20956 if (ri->u.offsets) 20957 Safefree(ri->u.offsets); /* 20010421 MJD */ 20958 #endif 20959 if (ri->code_blocks) 20960 S_free_codeblocks(aTHX_ ri->code_blocks); 20961 20962 if (ri->data) { 20963 int n = ri->data->count; 20964 20965 while (--n >= 0) { 20966 /* If you add a ->what type here, update the comment in regcomp.h */ 20967 switch (ri->data->what[n]) { 20968 case 'a': 20969 case 'r': 20970 case 's': 20971 case 'S': 20972 case 'u': 20973 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); 20974 break; 20975 case 'f': 20976 Safefree(ri->data->data[n]); 20977 break; 20978 case 'l': 20979 case 'L': 20980 break; 20981 case 'T': 20982 { /* Aho Corasick add-on structure for a trie node. 20983 Used in stclass optimization only */ 20984 U32 refcount; 20985 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; 20986 #ifdef USE_ITHREADS 20987 dVAR; 20988 #endif 20989 OP_REFCNT_LOCK; 20990 refcount = --aho->refcount; 20991 OP_REFCNT_UNLOCK; 20992 if ( !refcount ) { 20993 PerlMemShared_free(aho->states); 20994 PerlMemShared_free(aho->fail); 20995 /* do this last!!!! */ 20996 PerlMemShared_free(ri->data->data[n]); 20997 /* we should only ever get called once, so 20998 * assert as much, and also guard the free 20999 * which /might/ happen twice. At the least 21000 * it will make code anlyzers happy and it 21001 * doesn't cost much. - Yves */ 21002 assert(ri->regstclass); 21003 if (ri->regstclass) { 21004 PerlMemShared_free(ri->regstclass); 21005 ri->regstclass = 0; 21006 } 21007 } 21008 } 21009 break; 21010 case 't': 21011 { 21012 /* trie structure. */ 21013 U32 refcount; 21014 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; 21015 #ifdef USE_ITHREADS 21016 dVAR; 21017 #endif 21018 OP_REFCNT_LOCK; 21019 refcount = --trie->refcount; 21020 OP_REFCNT_UNLOCK; 21021 if ( !refcount ) { 21022 PerlMemShared_free(trie->charmap); 21023 PerlMemShared_free(trie->states); 21024 PerlMemShared_free(trie->trans); 21025 if (trie->bitmap) 21026 PerlMemShared_free(trie->bitmap); 21027 if (trie->jump) 21028 PerlMemShared_free(trie->jump); 21029 PerlMemShared_free(trie->wordinfo); 21030 /* do this last!!!! */ 21031 PerlMemShared_free(ri->data->data[n]); 21032 } 21033 } 21034 break; 21035 default: 21036 Perl_croak(aTHX_ "panic: regfree data code '%c'", 21037 ri->data->what[n]); 21038 } 21039 } 21040 Safefree(ri->data->what); 21041 Safefree(ri->data); 21042 } 21043 21044 Safefree(ri); 21045 } 21046 21047 #define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t)) 21048 #define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t)) 21049 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL) 21050 21051 /* 21052 re_dup_guts - duplicate a regexp. 21053 21054 This routine is expected to clone a given regexp structure. It is only 21055 compiled under USE_ITHREADS. 21056 21057 After all of the core data stored in struct regexp is duplicated 21058 the regexp_engine.dupe method is used to copy any private data 21059 stored in the *pprivate pointer. This allows extensions to handle 21060 any duplication it needs to do. 21061 21062 See pregfree() and regfree_internal() if you change anything here. 21063 */ 21064 #if defined(USE_ITHREADS) 21065 #ifndef PERL_IN_XSUB_RE 21066 void 21067 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) 21068 { 21069 dVAR; 21070 I32 npar; 21071 const struct regexp *r = ReANY(sstr); 21072 struct regexp *ret = ReANY(dstr); 21073 21074 PERL_ARGS_ASSERT_RE_DUP_GUTS; 21075 21076 npar = r->nparens+1; 21077 Newx(ret->offs, npar, regexp_paren_pair); 21078 Copy(r->offs, ret->offs, npar, regexp_paren_pair); 21079 21080 if (ret->substrs) { 21081 /* Do it this way to avoid reading from *r after the StructCopy(). 21082 That way, if any of the sv_dup_inc()s dislodge *r from the L1 21083 cache, it doesn't matter. */ 21084 int i; 21085 const bool anchored = r->check_substr 21086 ? r->check_substr == r->substrs->data[0].substr 21087 : r->check_utf8 == r->substrs->data[0].utf8_substr; 21088 Newx(ret->substrs, 1, struct reg_substr_data); 21089 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 21090 21091 for (i = 0; i < 2; i++) { 21092 ret->substrs->data[i].substr = 21093 sv_dup_inc(ret->substrs->data[i].substr, param); 21094 ret->substrs->data[i].utf8_substr = 21095 sv_dup_inc(ret->substrs->data[i].utf8_substr, param); 21096 } 21097 21098 /* check_substr and check_utf8, if non-NULL, point to either their 21099 anchored or float namesakes, and don't hold a second reference. */ 21100 21101 if (ret->check_substr) { 21102 if (anchored) { 21103 assert(r->check_utf8 == r->substrs->data[0].utf8_substr); 21104 21105 ret->check_substr = ret->substrs->data[0].substr; 21106 ret->check_utf8 = ret->substrs->data[0].utf8_substr; 21107 } else { 21108 assert(r->check_substr == r->substrs->data[1].substr); 21109 assert(r->check_utf8 == r->substrs->data[1].utf8_substr); 21110 21111 ret->check_substr = ret->substrs->data[1].substr; 21112 ret->check_utf8 = ret->substrs->data[1].utf8_substr; 21113 } 21114 } else if (ret->check_utf8) { 21115 if (anchored) { 21116 ret->check_utf8 = ret->substrs->data[0].utf8_substr; 21117 } else { 21118 ret->check_utf8 = ret->substrs->data[1].utf8_substr; 21119 } 21120 } 21121 } 21122 21123 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); 21124 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); 21125 if (r->recurse_locinput) 21126 Newx(ret->recurse_locinput, r->nparens + 1, char *); 21127 21128 if (ret->pprivate) 21129 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param)); 21130 21131 if (RX_MATCH_COPIED(dstr)) 21132 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); 21133 else 21134 ret->subbeg = NULL; 21135 #ifdef PERL_ANY_COW 21136 ret->saved_copy = NULL; 21137 #endif 21138 21139 /* Whether mother_re be set or no, we need to copy the string. We 21140 cannot refrain from copying it when the storage points directly to 21141 our mother regexp, because that's 21142 1: a buffer in a different thread 21143 2: something we no longer hold a reference on 21144 so we need to copy it locally. */ 21145 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1); 21146 /* set malloced length to a non-zero value so it will be freed 21147 * (otherwise in combination with SVf_FAKE it looks like an alien 21148 * buffer). It doesn't have to be the actual malloced size, since it 21149 * should never be grown */ 21150 SvLEN_set(dstr, SvCUR(sstr)+1); 21151 ret->mother_re = NULL; 21152 } 21153 #endif /* PERL_IN_XSUB_RE */ 21154 21155 /* 21156 regdupe_internal() 21157 21158 This is the internal complement to regdupe() which is used to copy 21159 the structure pointed to by the *pprivate pointer in the regexp. 21160 This is the core version of the extension overridable cloning hook. 21161 The regexp structure being duplicated will be copied by perl prior 21162 to this and will be provided as the regexp *r argument, however 21163 with the /old/ structures pprivate pointer value. Thus this routine 21164 may override any copying normally done by perl. 21165 21166 It returns a pointer to the new regexp_internal structure. 21167 */ 21168 21169 void * 21170 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) 21171 { 21172 dVAR; 21173 struct regexp *const r = ReANY(rx); 21174 regexp_internal *reti; 21175 int len; 21176 RXi_GET_DECL(r, ri); 21177 21178 PERL_ARGS_ASSERT_REGDUPE_INTERNAL; 21179 21180 len = ProgLen(ri); 21181 21182 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), 21183 char, regexp_internal); 21184 Copy(ri->program, reti->program, len+1, regnode); 21185 21186 21187 if (ri->code_blocks) { 21188 int n; 21189 Newx(reti->code_blocks, 1, struct reg_code_blocks); 21190 Newx(reti->code_blocks->cb, ri->code_blocks->count, 21191 struct reg_code_block); 21192 Copy(ri->code_blocks->cb, reti->code_blocks->cb, 21193 ri->code_blocks->count, struct reg_code_block); 21194 for (n = 0; n < ri->code_blocks->count; n++) 21195 reti->code_blocks->cb[n].src_regex = (REGEXP*) 21196 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param); 21197 reti->code_blocks->count = ri->code_blocks->count; 21198 reti->code_blocks->refcnt = 1; 21199 } 21200 else 21201 reti->code_blocks = NULL; 21202 21203 reti->regstclass = NULL; 21204 21205 if (ri->data) { 21206 struct reg_data *d; 21207 const int count = ri->data->count; 21208 int i; 21209 21210 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), 21211 char, struct reg_data); 21212 Newx(d->what, count, U8); 21213 21214 d->count = count; 21215 for (i = 0; i < count; i++) { 21216 d->what[i] = ri->data->what[i]; 21217 switch (d->what[i]) { 21218 /* see also regcomp.h and regfree_internal() */ 21219 case 'a': /* actually an AV, but the dup function is identical. 21220 values seem to be "plain sv's" generally. */ 21221 case 'r': /* a compiled regex (but still just another SV) */ 21222 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code) 21223 this use case should go away, the code could have used 21224 'a' instead - see S_set_ANYOF_arg() for array contents. */ 21225 case 'S': /* actually an SV, but the dup function is identical. */ 21226 case 'u': /* actually an HV, but the dup function is identical. 21227 values are "plain sv's" */ 21228 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); 21229 break; 21230 case 'f': 21231 /* Synthetic Start Class - "Fake" charclass we generate to optimize 21232 * patterns which could start with several different things. Pre-TRIE 21233 * this was more important than it is now, however this still helps 21234 * in some places, for instance /x?a+/ might produce a SSC equivalent 21235 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass() 21236 * in regexec.c 21237 */ 21238 /* This is cheating. */ 21239 Newx(d->data[i], 1, regnode_ssc); 21240 StructCopy(ri->data->data[i], d->data[i], regnode_ssc); 21241 reti->regstclass = (regnode*)d->data[i]; 21242 break; 21243 case 'T': 21244 /* AHO-CORASICK fail table */ 21245 /* Trie stclasses are readonly and can thus be shared 21246 * without duplication. We free the stclass in pregfree 21247 * when the corresponding reg_ac_data struct is freed. 21248 */ 21249 reti->regstclass= ri->regstclass; 21250 /* FALLTHROUGH */ 21251 case 't': 21252 /* TRIE transition table */ 21253 OP_REFCNT_LOCK; 21254 ((reg_trie_data*)ri->data->data[i])->refcount++; 21255 OP_REFCNT_UNLOCK; 21256 /* FALLTHROUGH */ 21257 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */ 21258 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code 21259 is not from another regexp */ 21260 d->data[i] = ri->data->data[i]; 21261 break; 21262 default: 21263 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'", 21264 ri->data->what[i]); 21265 } 21266 } 21267 21268 reti->data = d; 21269 } 21270 else 21271 reti->data = NULL; 21272 21273 reti->name_list_idx = ri->name_list_idx; 21274 21275 #ifdef RE_TRACK_PATTERN_OFFSETS 21276 if (ri->u.offsets) { 21277 Newx(reti->u.offsets, 2*len+1, U32); 21278 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); 21279 } 21280 #else 21281 SetProgLen(reti, len); 21282 #endif 21283 21284 return (void*)reti; 21285 } 21286 21287 #endif /* USE_ITHREADS */ 21288 21289 #ifndef PERL_IN_XSUB_RE 21290 21291 /* 21292 - regnext - dig the "next" pointer out of a node 21293 */ 21294 regnode * 21295 Perl_regnext(pTHX_ regnode *p) 21296 { 21297 I32 offset; 21298 21299 if (!p) 21300 return(NULL); 21301 21302 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ 21303 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", 21304 (int)OP(p), (int)REGNODE_MAX); 21305 } 21306 21307 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); 21308 if (offset == 0) 21309 return(NULL); 21310 21311 return(p+offset); 21312 } 21313 21314 #endif 21315 21316 STATIC void 21317 S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...) 21318 { 21319 va_list args; 21320 STRLEN l1 = strlen(pat1); 21321 STRLEN l2 = strlen(pat2); 21322 char buf[512]; 21323 SV *msv; 21324 const char *message; 21325 21326 PERL_ARGS_ASSERT_RE_CROAK2; 21327 21328 if (l1 > 510) 21329 l1 = 510; 21330 if (l1 + l2 > 510) 21331 l2 = 510 - l1; 21332 Copy(pat1, buf, l1 , char); 21333 Copy(pat2, buf + l1, l2 , char); 21334 buf[l1 + l2] = '\n'; 21335 buf[l1 + l2 + 1] = '\0'; 21336 va_start(args, pat2); 21337 msv = vmess(buf, &args); 21338 va_end(args); 21339 message = SvPV_const(msv, l1); 21340 if (l1 > 512) 21341 l1 = 512; 21342 Copy(message, buf, l1 , char); 21343 /* l1-1 to avoid \n */ 21344 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf)); 21345 } 21346 21347 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ 21348 21349 #ifndef PERL_IN_XSUB_RE 21350 void 21351 Perl_save_re_context(pTHX) 21352 { 21353 I32 nparens = -1; 21354 I32 i; 21355 21356 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ 21357 21358 if (PL_curpm) { 21359 const REGEXP * const rx = PM_GETRE(PL_curpm); 21360 if (rx) 21361 nparens = RX_NPARENS(rx); 21362 } 21363 21364 /* RT #124109. This is a complete hack; in the SWASHNEW case we know 21365 * that PL_curpm will be null, but that utf8.pm and the modules it 21366 * loads will only use $1..$3. 21367 * The t/porting/re_context.t test file checks this assumption. 21368 */ 21369 if (nparens == -1) 21370 nparens = 3; 21371 21372 for (i = 1; i <= nparens; i++) { 21373 char digits[TYPE_CHARS(long)]; 21374 const STRLEN len = my_snprintf(digits, sizeof(digits), 21375 "%lu", (long)i); 21376 GV *const *const gvp 21377 = (GV**)hv_fetch(PL_defstash, digits, len, 0); 21378 21379 if (gvp) { 21380 GV * const gv = *gvp; 21381 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) 21382 save_scalar(gv); 21383 } 21384 } 21385 } 21386 #endif 21387 21388 #ifdef DEBUGGING 21389 21390 STATIC void 21391 S_put_code_point(pTHX_ SV *sv, UV c) 21392 { 21393 PERL_ARGS_ASSERT_PUT_CODE_POINT; 21394 21395 if (c > 255) { 21396 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c); 21397 } 21398 else if (isPRINT(c)) { 21399 const char string = (char) c; 21400 21401 /* We use {phrase} as metanotation in the class, so also escape literal 21402 * braces */ 21403 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}') 21404 sv_catpvs(sv, "\\"); 21405 sv_catpvn(sv, &string, 1); 21406 } 21407 else if (isMNEMONIC_CNTRL(c)) { 21408 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c)); 21409 } 21410 else { 21411 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c); 21412 } 21413 } 21414 21415 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 21416 21417 STATIC void 21418 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) 21419 { 21420 /* Appends to 'sv' a displayable version of the range of code points from 21421 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls 21422 * that have them, when they occur at the beginning or end of the range. 21423 * It uses hex to output the remaining code points, unless 'allow_literals' 21424 * is true, in which case the printable ASCII ones are output as-is (though 21425 * some of these will be escaped by put_code_point()). 21426 * 21427 * NOTE: This is designed only for printing ranges of code points that fit 21428 * inside an ANYOF bitmap. Higher code points are simply suppressed 21429 */ 21430 21431 const unsigned int min_range_count = 3; 21432 21433 assert(start <= end); 21434 21435 PERL_ARGS_ASSERT_PUT_RANGE; 21436 21437 while (start <= end) { 21438 UV this_end; 21439 const char * format; 21440 21441 if (end - start < min_range_count) { 21442 21443 /* Output chars individually when they occur in short ranges */ 21444 for (; start <= end; start++) { 21445 put_code_point(sv, start); 21446 } 21447 break; 21448 } 21449 21450 /* If permitted by the input options, and there is a possibility that 21451 * this range contains a printable literal, look to see if there is 21452 * one. */ 21453 if (allow_literals && start <= MAX_PRINT_A) { 21454 21455 /* If the character at the beginning of the range isn't an ASCII 21456 * printable, effectively split the range into two parts: 21457 * 1) the portion before the first such printable, 21458 * 2) the rest 21459 * and output them separately. */ 21460 if (! isPRINT_A(start)) { 21461 UV temp_end = start + 1; 21462 21463 /* There is no point looking beyond the final possible 21464 * printable, in MAX_PRINT_A */ 21465 UV max = MIN(end, MAX_PRINT_A); 21466 21467 while (temp_end <= max && ! isPRINT_A(temp_end)) { 21468 temp_end++; 21469 } 21470 21471 /* Here, temp_end points to one beyond the first printable if 21472 * found, or to one beyond 'max' if not. If none found, make 21473 * sure that we use the entire range */ 21474 if (temp_end > MAX_PRINT_A) { 21475 temp_end = end + 1; 21476 } 21477 21478 /* Output the first part of the split range: the part that 21479 * doesn't have printables, with the parameter set to not look 21480 * for literals (otherwise we would infinitely recurse) */ 21481 put_range(sv, start, temp_end - 1, FALSE); 21482 21483 /* The 2nd part of the range (if any) starts here. */ 21484 start = temp_end; 21485 21486 /* We do a continue, instead of dropping down, because even if 21487 * the 2nd part is non-empty, it could be so short that we want 21488 * to output it as individual characters, as tested for at the 21489 * top of this loop. */ 21490 continue; 21491 } 21492 21493 /* Here, 'start' is a printable ASCII. If it is an alphanumeric, 21494 * output a sub-range of just the digits or letters, then process 21495 * the remaining portion as usual. */ 21496 if (isALPHANUMERIC_A(start)) { 21497 UV mask = (isDIGIT_A(start)) 21498 ? _CC_DIGIT 21499 : isUPPER_A(start) 21500 ? _CC_UPPER 21501 : _CC_LOWER; 21502 UV temp_end = start + 1; 21503 21504 /* Find the end of the sub-range that includes just the 21505 * characters in the same class as the first character in it */ 21506 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) { 21507 temp_end++; 21508 } 21509 temp_end--; 21510 21511 /* For short ranges, don't duplicate the code above to output 21512 * them; just call recursively */ 21513 if (temp_end - start < min_range_count) { 21514 put_range(sv, start, temp_end, FALSE); 21515 } 21516 else { /* Output as a range */ 21517 put_code_point(sv, start); 21518 sv_catpvs(sv, "-"); 21519 put_code_point(sv, temp_end); 21520 } 21521 start = temp_end + 1; 21522 continue; 21523 } 21524 21525 /* We output any other printables as individual characters */ 21526 if (isPUNCT_A(start) || isSPACE_A(start)) { 21527 while (start <= end && (isPUNCT_A(start) 21528 || isSPACE_A(start))) 21529 { 21530 put_code_point(sv, start); 21531 start++; 21532 } 21533 continue; 21534 } 21535 } /* End of looking for literals */ 21536 21537 /* Here is not to output as a literal. Some control characters have 21538 * mnemonic names. Split off any of those at the beginning and end of 21539 * the range to print mnemonically. It isn't possible for many of 21540 * these to be in a row, so this won't overwhelm with output */ 21541 if ( start <= end 21542 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end))) 21543 { 21544 while (isMNEMONIC_CNTRL(start) && start <= end) { 21545 put_code_point(sv, start); 21546 start++; 21547 } 21548 21549 /* If this didn't take care of the whole range ... */ 21550 if (start <= end) { 21551 21552 /* Look backwards from the end to find the final non-mnemonic 21553 * */ 21554 UV temp_end = end; 21555 while (isMNEMONIC_CNTRL(temp_end)) { 21556 temp_end--; 21557 } 21558 21559 /* And separately output the interior range that doesn't start 21560 * or end with mnemonics */ 21561 put_range(sv, start, temp_end, FALSE); 21562 21563 /* Then output the mnemonic trailing controls */ 21564 start = temp_end + 1; 21565 while (start <= end) { 21566 put_code_point(sv, start); 21567 start++; 21568 } 21569 break; 21570 } 21571 } 21572 21573 /* As a final resort, output the range or subrange as hex. */ 21574 21575 this_end = (end < NUM_ANYOF_CODE_POINTS) 21576 ? end 21577 : NUM_ANYOF_CODE_POINTS - 1; 21578 #if NUM_ANYOF_CODE_POINTS > 256 21579 format = (this_end < 256) 21580 ? "\\x%02" UVXf "-\\x%02" UVXf 21581 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}"; 21582 #else 21583 format = "\\x%02" UVXf "-\\x%02" UVXf; 21584 #endif 21585 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); 21586 Perl_sv_catpvf(aTHX_ sv, format, start, this_end); 21587 GCC_DIAG_RESTORE_STMT; 21588 break; 21589 } 21590 } 21591 21592 STATIC void 21593 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist) 21594 { 21595 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list 21596 * 'invlist' */ 21597 21598 UV start, end; 21599 bool allow_literals = TRUE; 21600 21601 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST; 21602 21603 /* Generally, it is more readable if printable characters are output as 21604 * literals, but if a range (nearly) spans all of them, it's best to output 21605 * it as a single range. This code will use a single range if all but 2 21606 * ASCII printables are in it */ 21607 invlist_iterinit(invlist); 21608 while (invlist_iternext(invlist, &start, &end)) { 21609 21610 /* If the range starts beyond the final printable, it doesn't have any 21611 * in it */ 21612 if (start > MAX_PRINT_A) { 21613 break; 21614 } 21615 21616 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span 21617 * all but two, the range must start and end no later than 2 from 21618 * either end */ 21619 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) { 21620 if (end > MAX_PRINT_A) { 21621 end = MAX_PRINT_A; 21622 } 21623 if (start < ' ') { 21624 start = ' '; 21625 } 21626 if (end - start >= MAX_PRINT_A - ' ' - 2) { 21627 allow_literals = FALSE; 21628 } 21629 break; 21630 } 21631 } 21632 invlist_iterfinish(invlist); 21633 21634 /* Here we have figured things out. Output each range */ 21635 invlist_iterinit(invlist); 21636 while (invlist_iternext(invlist, &start, &end)) { 21637 if (start >= NUM_ANYOF_CODE_POINTS) { 21638 break; 21639 } 21640 put_range(sv, start, end, allow_literals); 21641 } 21642 invlist_iterfinish(invlist); 21643 21644 return; 21645 } 21646 21647 STATIC SV* 21648 S_put_charclass_bitmap_innards_common(pTHX_ 21649 SV* invlist, /* The bitmap */ 21650 SV* posixes, /* Under /l, things like [:word:], \S */ 21651 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */ 21652 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */ 21653 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */ 21654 const bool invert /* Is the result to be inverted? */ 21655 ) 21656 { 21657 /* Create and return an SV containing a displayable version of the bitmap 21658 * and associated information determined by the input parameters. If the 21659 * output would have been only the inversion indicator '^', NULL is instead 21660 * returned. */ 21661 21662 dVAR; 21663 SV * output; 21664 21665 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; 21666 21667 if (invert) { 21668 output = newSVpvs("^"); 21669 } 21670 else { 21671 output = newSVpvs(""); 21672 } 21673 21674 /* First, the code points in the bitmap that are unconditionally there */ 21675 put_charclass_bitmap_innards_invlist(output, invlist); 21676 21677 /* Traditionally, these have been placed after the main code points */ 21678 if (posixes) { 21679 sv_catsv(output, posixes); 21680 } 21681 21682 if (only_utf8 && _invlist_len(only_utf8)) { 21683 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]); 21684 put_charclass_bitmap_innards_invlist(output, only_utf8); 21685 } 21686 21687 if (not_utf8 && _invlist_len(not_utf8)) { 21688 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]); 21689 put_charclass_bitmap_innards_invlist(output, not_utf8); 21690 } 21691 21692 if (only_utf8_locale && _invlist_len(only_utf8_locale)) { 21693 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]); 21694 put_charclass_bitmap_innards_invlist(output, only_utf8_locale); 21695 21696 /* This is the only list in this routine that can legally contain code 21697 * points outside the bitmap range. The call just above to 21698 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so 21699 * output them here. There's about a half-dozen possible, and none in 21700 * contiguous ranges longer than 2 */ 21701 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { 21702 UV start, end; 21703 SV* above_bitmap = NULL; 21704 21705 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap); 21706 21707 invlist_iterinit(above_bitmap); 21708 while (invlist_iternext(above_bitmap, &start, &end)) { 21709 UV i; 21710 21711 for (i = start; i <= end; i++) { 21712 put_code_point(output, i); 21713 } 21714 } 21715 invlist_iterfinish(above_bitmap); 21716 SvREFCNT_dec_NN(above_bitmap); 21717 } 21718 } 21719 21720 if (invert && SvCUR(output) == 1) { 21721 return NULL; 21722 } 21723 21724 return output; 21725 } 21726 21727 STATIC bool 21728 S_put_charclass_bitmap_innards(pTHX_ SV *sv, 21729 char *bitmap, 21730 SV *nonbitmap_invlist, 21731 SV *only_utf8_locale_invlist, 21732 const regnode * const node, 21733 const bool force_as_is_display) 21734 { 21735 /* Appends to 'sv' a displayable version of the innards of the bracketed 21736 * character class defined by the other arguments: 21737 * 'bitmap' points to the bitmap, or NULL if to ignore that. 21738 * 'nonbitmap_invlist' is an inversion list of the code points that are in 21739 * the bitmap range, but for some reason aren't in the bitmap; NULL if 21740 * none. The reasons for this could be that they require some 21741 * condition such as the target string being or not being in UTF-8 21742 * (under /d), or because they came from a user-defined property that 21743 * was not resolved at the time of the regex compilation (under /u) 21744 * 'only_utf8_locale_invlist' is an inversion list of the code points that 21745 * are valid only if the runtime locale is a UTF-8 one; NULL if none 21746 * 'node' is the regex pattern ANYOF node. It is needed only when the 21747 * above two parameters are not null, and is passed so that this 21748 * routine can tease apart the various reasons for them. 21749 * 'force_as_is_display' is TRUE if this routine should definitely NOT try 21750 * to invert things to see if that leads to a cleaner display. If 21751 * FALSE, this routine is free to use its judgment about doing this. 21752 * 21753 * It returns TRUE if there was actually something output. (It may be that 21754 * the bitmap, etc is empty.) 21755 * 21756 * When called for outputting the bitmap of a non-ANYOF node, just pass the 21757 * bitmap, with the succeeding parameters set to NULL, and the final one to 21758 * FALSE. 21759 */ 21760 21761 /* In general, it tries to display the 'cleanest' representation of the 21762 * innards, choosing whether to display them inverted or not, regardless of 21763 * whether the class itself is to be inverted. However, there are some 21764 * cases where it can't try inverting, as what actually matches isn't known 21765 * until runtime, and hence the inversion isn't either. */ 21766 21767 dVAR; 21768 bool inverting_allowed = ! force_as_is_display; 21769 21770 int i; 21771 STRLEN orig_sv_cur = SvCUR(sv); 21772 21773 SV* invlist; /* Inversion list we accumulate of code points that 21774 are unconditionally matched */ 21775 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is 21776 UTF-8 */ 21777 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8 21778 */ 21779 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */ 21780 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale 21781 is UTF-8 */ 21782 21783 SV* as_is_display; /* The output string when we take the inputs 21784 literally */ 21785 SV* inverted_display; /* The output string when we invert the inputs */ 21786 21787 U8 flags = (node) ? ANYOF_FLAGS(node) : 0; 21788 21789 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted 21790 to match? */ 21791 /* We are biased in favor of displaying things without them being inverted, 21792 * as that is generally easier to understand */ 21793 const int bias = 5; 21794 21795 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; 21796 21797 /* Start off with whatever code points are passed in. (We clone, so we 21798 * don't change the caller's list) */ 21799 if (nonbitmap_invlist) { 21800 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS); 21801 invlist = invlist_clone(nonbitmap_invlist, NULL); 21802 } 21803 else { /* Worst case size is every other code point is matched */ 21804 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); 21805 } 21806 21807 if (flags) { 21808 if (OP(node) == ANYOFD) { 21809 21810 /* This flag indicates that the code points below 0x100 in the 21811 * nonbitmap list are precisely the ones that match only when the 21812 * target is UTF-8 (they should all be non-ASCII). */ 21813 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) 21814 { 21815 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8); 21816 _invlist_subtract(invlist, only_utf8, &invlist); 21817 } 21818 21819 /* And this flag for matching all non-ASCII 0xFF and below */ 21820 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) 21821 { 21822 not_utf8 = invlist_clone(PL_UpperLatin1, NULL); 21823 } 21824 } 21825 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) { 21826 21827 /* If either of these flags are set, what matches isn't 21828 * determinable except during execution, so don't know enough here 21829 * to invert */ 21830 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) { 21831 inverting_allowed = FALSE; 21832 } 21833 21834 /* What the posix classes match also varies at runtime, so these 21835 * will be output symbolically. */ 21836 if (ANYOF_POSIXL_TEST_ANY_SET(node)) { 21837 int i; 21838 21839 posixes = newSVpvs(""); 21840 for (i = 0; i < ANYOF_POSIXL_MAX; i++) { 21841 if (ANYOF_POSIXL_TEST(node, i)) { 21842 sv_catpv(posixes, anyofs[i]); 21843 } 21844 } 21845 } 21846 } 21847 } 21848 21849 /* Accumulate the bit map into the unconditional match list */ 21850 if (bitmap) { 21851 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { 21852 if (BITMAP_TEST(bitmap, i)) { 21853 int start = i++; 21854 for (; 21855 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); 21856 i++) 21857 { /* empty */ } 21858 invlist = _add_range_to_invlist(invlist, start, i-1); 21859 } 21860 } 21861 } 21862 21863 /* Make sure that the conditional match lists don't have anything in them 21864 * that match unconditionally; otherwise the output is quite confusing. 21865 * This could happen if the code that populates these misses some 21866 * duplication. */ 21867 if (only_utf8) { 21868 _invlist_subtract(only_utf8, invlist, &only_utf8); 21869 } 21870 if (not_utf8) { 21871 _invlist_subtract(not_utf8, invlist, ¬_utf8); 21872 } 21873 21874 if (only_utf8_locale_invlist) { 21875 21876 /* Since this list is passed in, we have to make a copy before 21877 * modifying it */ 21878 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL); 21879 21880 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale); 21881 21882 /* And, it can get really weird for us to try outputting an inverted 21883 * form of this list when it has things above the bitmap, so don't even 21884 * try */ 21885 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { 21886 inverting_allowed = FALSE; 21887 } 21888 } 21889 21890 /* Calculate what the output would be if we take the input as-is */ 21891 as_is_display = put_charclass_bitmap_innards_common(invlist, 21892 posixes, 21893 only_utf8, 21894 not_utf8, 21895 only_utf8_locale, 21896 invert); 21897 21898 /* If have to take the output as-is, just do that */ 21899 if (! inverting_allowed) { 21900 if (as_is_display) { 21901 sv_catsv(sv, as_is_display); 21902 SvREFCNT_dec_NN(as_is_display); 21903 } 21904 } 21905 else { /* But otherwise, create the output again on the inverted input, and 21906 use whichever version is shorter */ 21907 21908 int inverted_bias, as_is_bias; 21909 21910 /* We will apply our bias to whichever of the the results doesn't have 21911 * the '^' */ 21912 if (invert) { 21913 invert = FALSE; 21914 as_is_bias = bias; 21915 inverted_bias = 0; 21916 } 21917 else { 21918 invert = TRUE; 21919 as_is_bias = 0; 21920 inverted_bias = bias; 21921 } 21922 21923 /* Now invert each of the lists that contribute to the output, 21924 * excluding from the result things outside the possible range */ 21925 21926 /* For the unconditional inversion list, we have to add in all the 21927 * conditional code points, so that when inverted, they will be gone 21928 * from it */ 21929 _invlist_union(only_utf8, invlist, &invlist); 21930 _invlist_union(not_utf8, invlist, &invlist); 21931 _invlist_union(only_utf8_locale, invlist, &invlist); 21932 _invlist_invert(invlist); 21933 _invlist_intersection(invlist, PL_InBitmap, &invlist); 21934 21935 if (only_utf8) { 21936 _invlist_invert(only_utf8); 21937 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8); 21938 } 21939 else if (not_utf8) { 21940 21941 /* If a code point matches iff the target string is not in UTF-8, 21942 * then complementing the result has it not match iff not in UTF-8, 21943 * which is the same thing as matching iff it is UTF-8. */ 21944 only_utf8 = not_utf8; 21945 not_utf8 = NULL; 21946 } 21947 21948 if (only_utf8_locale) { 21949 _invlist_invert(only_utf8_locale); 21950 _invlist_intersection(only_utf8_locale, 21951 PL_InBitmap, 21952 &only_utf8_locale); 21953 } 21954 21955 inverted_display = put_charclass_bitmap_innards_common( 21956 invlist, 21957 posixes, 21958 only_utf8, 21959 not_utf8, 21960 only_utf8_locale, invert); 21961 21962 /* Use the shortest representation, taking into account our bias 21963 * against showing it inverted */ 21964 if ( inverted_display 21965 && ( ! as_is_display 21966 || ( SvCUR(inverted_display) + inverted_bias 21967 < SvCUR(as_is_display) + as_is_bias))) 21968 { 21969 sv_catsv(sv, inverted_display); 21970 } 21971 else if (as_is_display) { 21972 sv_catsv(sv, as_is_display); 21973 } 21974 21975 SvREFCNT_dec(as_is_display); 21976 SvREFCNT_dec(inverted_display); 21977 } 21978 21979 SvREFCNT_dec_NN(invlist); 21980 SvREFCNT_dec(only_utf8); 21981 SvREFCNT_dec(not_utf8); 21982 SvREFCNT_dec(posixes); 21983 SvREFCNT_dec(only_utf8_locale); 21984 21985 return SvCUR(sv) > orig_sv_cur; 21986 } 21987 21988 #define CLEAR_OPTSTART \ 21989 if (optstart) STMT_START { \ 21990 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \ 21991 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \ 21992 optstart=NULL; \ 21993 } STMT_END 21994 21995 #define DUMPUNTIL(b,e) \ 21996 CLEAR_OPTSTART; \ 21997 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); 21998 21999 STATIC const regnode * 22000 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, 22001 const regnode *last, const regnode *plast, 22002 SV* sv, I32 indent, U32 depth) 22003 { 22004 U8 op = PSEUDO; /* Arbitrary non-END op. */ 22005 const regnode *next; 22006 const regnode *optstart= NULL; 22007 22008 RXi_GET_DECL(r, ri); 22009 GET_RE_DEBUG_FLAGS_DECL; 22010 22011 PERL_ARGS_ASSERT_DUMPUNTIL; 22012 22013 #ifdef DEBUG_DUMPUNTIL 22014 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start, 22015 last ? last-start : 0, plast ? plast-start : 0); 22016 #endif 22017 22018 if (plast && plast < last) 22019 last= plast; 22020 22021 while (PL_regkind[op] != END && (!last || node < last)) { 22022 assert(node); 22023 /* While that wasn't END last time... */ 22024 NODE_ALIGN(node); 22025 op = OP(node); 22026 if (op == CLOSE || op == SRCLOSE || op == WHILEM) 22027 indent--; 22028 next = regnext((regnode *)node); 22029 22030 /* Where, what. */ 22031 if (OP(node) == OPTIMIZED) { 22032 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) 22033 optstart = node; 22034 else 22035 goto after_print; 22036 } else 22037 CLEAR_OPTSTART; 22038 22039 regprop(r, sv, node, NULL, NULL); 22040 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start), 22041 (int)(2*indent + 1), "", SvPVX_const(sv)); 22042 22043 if (OP(node) != OPTIMIZED) { 22044 if (next == NULL) /* Next ptr. */ 22045 Perl_re_printf( aTHX_ " (0)"); 22046 else if (PL_regkind[(U8)op] == BRANCH 22047 && PL_regkind[OP(next)] != BRANCH ) 22048 Perl_re_printf( aTHX_ " (FAIL)"); 22049 else 22050 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start)); 22051 Perl_re_printf( aTHX_ "\n"); 22052 } 22053 22054 after_print: 22055 if (PL_regkind[(U8)op] == BRANCHJ) { 22056 assert(next); 22057 { 22058 const regnode *nnode = (OP(next) == LONGJMP 22059 ? regnext((regnode *)next) 22060 : next); 22061 if (last && nnode > last) 22062 nnode = last; 22063 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); 22064 } 22065 } 22066 else if (PL_regkind[(U8)op] == BRANCH) { 22067 assert(next); 22068 DUMPUNTIL(NEXTOPER(node), next); 22069 } 22070 else if ( PL_regkind[(U8)op] == TRIE ) { 22071 const regnode *this_trie = node; 22072 const char op = OP(node); 22073 const U32 n = ARG(node); 22074 const reg_ac_data * const ac = op>=AHOCORASICK ? 22075 (reg_ac_data *)ri->data->data[n] : 22076 NULL; 22077 const reg_trie_data * const trie = 22078 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; 22079 #ifdef DEBUGGING 22080 AV *const trie_words 22081 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); 22082 #endif 22083 const regnode *nextbranch= NULL; 22084 I32 word_idx; 22085 SvPVCLEAR(sv); 22086 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { 22087 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0); 22088 22089 Perl_re_indentf( aTHX_ "%s ", 22090 indent+3, 22091 elem_ptr 22092 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), 22093 SvCUR(*elem_ptr), PL_dump_re_max_len, 22094 PL_colors[0], PL_colors[1], 22095 (SvUTF8(*elem_ptr) 22096 ? PERL_PV_ESCAPE_UNI 22097 : 0) 22098 | PERL_PV_PRETTY_ELLIPSES 22099 | PERL_PV_PRETTY_LTGT 22100 ) 22101 : "???" 22102 ); 22103 if (trie->jump) { 22104 U16 dist= trie->jump[word_idx+1]; 22105 Perl_re_printf( aTHX_ "(%" UVuf ")\n", 22106 (UV)((dist ? this_trie + dist : next) - start)); 22107 if (dist) { 22108 if (!nextbranch) 22109 nextbranch= this_trie + trie->jump[0]; 22110 DUMPUNTIL(this_trie + dist, nextbranch); 22111 } 22112 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) 22113 nextbranch= regnext((regnode *)nextbranch); 22114 } else { 22115 Perl_re_printf( aTHX_ "\n"); 22116 } 22117 } 22118 if (last && next > last) 22119 node= last; 22120 else 22121 node= next; 22122 } 22123 else if ( op == CURLY ) { /* "next" might be very big: optimizer */ 22124 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, 22125 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); 22126 } 22127 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { 22128 assert(next); 22129 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); 22130 } 22131 else if ( op == PLUS || op == STAR) { 22132 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); 22133 } 22134 else if (PL_regkind[(U8)op] == EXACT) { 22135 /* Literal string, where present. */ 22136 node += NODE_SZ_STR(node) - 1; 22137 node = NEXTOPER(node); 22138 } 22139 else { 22140 node = NEXTOPER(node); 22141 node += regarglen[(U8)op]; 22142 } 22143 if (op == CURLYX || op == OPEN || op == SROPEN) 22144 indent++; 22145 } 22146 CLEAR_OPTSTART; 22147 #ifdef DEBUG_DUMPUNTIL 22148 Perl_re_printf( aTHX_ "--- %d\n", (int)indent); 22149 #endif 22150 return node; 22151 } 22152 22153 #endif /* DEBUGGING */ 22154 22155 #ifndef PERL_IN_XSUB_RE 22156 22157 #include "uni_keywords.h" 22158 22159 void 22160 Perl_init_uniprops(pTHX) 22161 { 22162 dVAR; 22163 22164 PL_user_def_props = newHV(); 22165 22166 #ifdef USE_ITHREADS 22167 22168 HvSHAREKEYS_off(PL_user_def_props); 22169 PL_user_def_props_aTHX = aTHX; 22170 22171 #endif 22172 22173 /* Set up the inversion list global variables */ 22174 22175 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]); 22176 PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]); 22177 PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]); 22178 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]); 22179 PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]); 22180 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]); 22181 PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]); 22182 PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]); 22183 PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]); 22184 PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]); 22185 PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]); 22186 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]); 22187 PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]); 22188 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]); 22189 PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]); 22190 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]); 22191 22192 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]); 22193 PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]); 22194 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]); 22195 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]); 22196 PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA]; 22197 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]); 22198 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]); 22199 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]); 22200 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]); 22201 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]); 22202 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]); 22203 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]); 22204 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]); 22205 PL_Posix_ptrs[_CC_VERTSPACE] = NULL; 22206 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]); 22207 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]); 22208 22209 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist); 22210 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist); 22211 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist); 22212 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist); 22213 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist); 22214 22215 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); 22216 PL_Latin1 = _new_invlist_C_array(Latin1_invlist); 22217 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); 22218 22219 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]); 22220 22221 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]); 22222 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]); 22223 22224 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]); 22225 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]); 22226 22227 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]); 22228 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[ 22229 UNI__PERL_FOLDS_TO_MULTI_CHAR]); 22230 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[ 22231 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]); 22232 PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[ 22233 UNI__PERL_NON_FINAL_FOLDS]); 22234 22235 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist); 22236 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist); 22237 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist); 22238 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist); 22239 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist); 22240 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist); 22241 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]); 22242 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist); 22243 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]); 22244 22245 #ifdef UNI_XIDC 22246 /* The below are used only by deprecated functions. They could be removed */ 22247 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]); 22248 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]); 22249 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]); 22250 #endif 22251 } 22252 22253 #if 0 22254 22255 This code was mainly added for backcompat to give a warning for non-portable 22256 code points in user-defined properties. But experiments showed that the 22257 warning in earlier perls were only omitted on overflow, which should be an 22258 error, so there really isnt a backcompat issue, and actually adding the 22259 warning when none was present before might cause breakage, for little gain. So 22260 khw left this code in, but not enabled. Tests were never added. 22261 22262 embed.fnc entry: 22263 Ei |const char *|get_extended_utf8_msg|const UV cp 22264 22265 PERL_STATIC_INLINE const char * 22266 S_get_extended_utf8_msg(pTHX_ const UV cp) 22267 { 22268 U8 dummy[UTF8_MAXBYTES + 1]; 22269 HV *msgs; 22270 SV **msg; 22271 22272 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED, 22273 &msgs); 22274 22275 msg = hv_fetchs(msgs, "text", 0); 22276 assert(msg); 22277 22278 (void) sv_2mortal((SV *) msgs); 22279 22280 return SvPVX(*msg); 22281 } 22282 22283 #endif 22284 22285 SV * 22286 Perl_handle_user_defined_property(pTHX_ 22287 22288 /* Parses the contents of a user-defined property definition; returning the 22289 * expanded definition if possible. If so, the return is an inversion 22290 * list. 22291 * 22292 * If there are subroutines that are part of the expansion and which aren't 22293 * known at the time of the call to this function, this returns what 22294 * parse_uniprop_string() returned for the first one encountered. 22295 * 22296 * If an error was found, NULL is returned, and 'msg' gets a suitable 22297 * message appended to it. (Appending allows the back trace of how we got 22298 * to the faulty definition to be displayed through nested calls of 22299 * user-defined subs.) 22300 * 22301 * The caller IS responsible for freeing any returned SV. 22302 * 22303 * The syntax of the contents is pretty much described in perlunicode.pod, 22304 * but we also allow comments on each line */ 22305 22306 const char * name, /* Name of property */ 22307 const STRLEN name_len, /* The name's length in bytes */ 22308 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */ 22309 const bool to_fold, /* ? Is this under /i */ 22310 const bool runtime, /* ? Are we in compile- or run-time */ 22311 const bool deferrable, /* Is it ok for this property's full definition 22312 to be deferred until later? */ 22313 SV* contents, /* The property's definition */ 22314 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be 22315 getting called unless this is thought to be 22316 a user-defined property */ 22317 SV * msg, /* Any error or warning msg(s) are appended to 22318 this */ 22319 const STRLEN level) /* Recursion level of this call */ 22320 { 22321 STRLEN len; 22322 const char * string = SvPV_const(contents, len); 22323 const char * const e = string + len; 22324 const bool is_contents_utf8 = cBOOL(SvUTF8(contents)); 22325 const STRLEN msgs_length_on_entry = SvCUR(msg); 22326 22327 const char * s0 = string; /* Points to first byte in the current line 22328 being parsed in 'string' */ 22329 const char overflow_msg[] = "Code point too large in \""; 22330 SV* running_definition = NULL; 22331 22332 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY; 22333 22334 *user_defined_ptr = TRUE; 22335 22336 /* Look at each line */ 22337 while (s0 < e) { 22338 const char * s; /* Current byte */ 22339 char op = '+'; /* Default operation is 'union' */ 22340 IV min = 0; /* range begin code point */ 22341 IV max = -1; /* and range end */ 22342 SV* this_definition; 22343 22344 /* Skip comment lines */ 22345 if (*s0 == '#') { 22346 s0 = strchr(s0, '\n'); 22347 if (s0 == NULL) { 22348 break; 22349 } 22350 s0++; 22351 continue; 22352 } 22353 22354 /* For backcompat, allow an empty first line */ 22355 if (*s0 == '\n') { 22356 s0++; 22357 continue; 22358 } 22359 22360 /* First character in the line may optionally be the operation */ 22361 if ( *s0 == '+' 22362 || *s0 == '!' 22363 || *s0 == '-' 22364 || *s0 == '&') 22365 { 22366 op = *s0++; 22367 } 22368 22369 /* If the line is one or two hex digits separated by blank space, its 22370 * a range; otherwise it is either another user-defined property or an 22371 * error */ 22372 22373 s = s0; 22374 22375 if (! isXDIGIT(*s)) { 22376 goto check_if_property; 22377 } 22378 22379 do { /* Each new hex digit will add 4 bits. */ 22380 if (min > ( (IV) MAX_LEGAL_CP >> 4)) { 22381 s = strchr(s, '\n'); 22382 if (s == NULL) { 22383 s = e; 22384 } 22385 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 22386 sv_catpv(msg, overflow_msg); 22387 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, 22388 UTF8fARG(is_contents_utf8, s - s0, s0)); 22389 sv_catpvs(msg, "\""); 22390 goto return_failure; 22391 } 22392 22393 /* Accumulate this digit into the value */ 22394 min = (min << 4) + READ_XDIGIT(s); 22395 } while (isXDIGIT(*s)); 22396 22397 while (isBLANK(*s)) { s++; } 22398 22399 /* We allow comments at the end of the line */ 22400 if (*s == '#') { 22401 s = strchr(s, '\n'); 22402 if (s == NULL) { 22403 s = e; 22404 } 22405 s++; 22406 } 22407 else if (s < e && *s != '\n') { 22408 if (! isXDIGIT(*s)) { 22409 goto check_if_property; 22410 } 22411 22412 /* Look for the high point of the range */ 22413 max = 0; 22414 do { 22415 if (max > ( (IV) MAX_LEGAL_CP >> 4)) { 22416 s = strchr(s, '\n'); 22417 if (s == NULL) { 22418 s = e; 22419 } 22420 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 22421 sv_catpv(msg, overflow_msg); 22422 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, 22423 UTF8fARG(is_contents_utf8, s - s0, s0)); 22424 sv_catpvs(msg, "\""); 22425 goto return_failure; 22426 } 22427 22428 max = (max << 4) + READ_XDIGIT(s); 22429 } while (isXDIGIT(*s)); 22430 22431 while (isBLANK(*s)) { s++; } 22432 22433 if (*s == '#') { 22434 s = strchr(s, '\n'); 22435 if (s == NULL) { 22436 s = e; 22437 } 22438 } 22439 else if (s < e && *s != '\n') { 22440 goto check_if_property; 22441 } 22442 } 22443 22444 if (max == -1) { /* The line only had one entry */ 22445 max = min; 22446 } 22447 else if (max < min) { 22448 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 22449 sv_catpvs(msg, "Illegal range in \""); 22450 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, 22451 UTF8fARG(is_contents_utf8, s - s0, s0)); 22452 sv_catpvs(msg, "\""); 22453 goto return_failure; 22454 } 22455 22456 #if 0 /* See explanation at definition above of get_extended_utf8_msg() */ 22457 22458 if ( UNICODE_IS_PERL_EXTENDED(min) 22459 || UNICODE_IS_PERL_EXTENDED(max)) 22460 { 22461 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 22462 22463 /* If both code points are non-portable, warn only on the lower 22464 * one. */ 22465 sv_catpv(msg, get_extended_utf8_msg( 22466 (UNICODE_IS_PERL_EXTENDED(min)) 22467 ? min : max)); 22468 sv_catpvs(msg, " in \""); 22469 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, 22470 UTF8fARG(is_contents_utf8, s - s0, s0)); 22471 sv_catpvs(msg, "\""); 22472 } 22473 22474 #endif 22475 22476 /* Here, this line contains a legal range */ 22477 this_definition = sv_2mortal(_new_invlist(2)); 22478 this_definition = _add_range_to_invlist(this_definition, min, max); 22479 goto calculate; 22480 22481 check_if_property: 22482 22483 /* Here it isn't a legal range line. See if it is a legal property 22484 * line. First find the end of the meat of the line */ 22485 s = strpbrk(s, "#\n"); 22486 if (s == NULL) { 22487 s = e; 22488 } 22489 22490 /* Ignore trailing blanks in keeping with the requirements of 22491 * parse_uniprop_string() */ 22492 s--; 22493 while (s > s0 && isBLANK_A(*s)) { 22494 s--; 22495 } 22496 s++; 22497 22498 this_definition = parse_uniprop_string(s0, s - s0, 22499 is_utf8, to_fold, runtime, 22500 deferrable, 22501 user_defined_ptr, msg, 22502 (name_len == 0) 22503 ? level /* Don't increase level 22504 if input is empty */ 22505 : level + 1 22506 ); 22507 if (this_definition == NULL) { 22508 goto return_failure; /* 'msg' should have had the reason 22509 appended to it by the above call */ 22510 } 22511 22512 if (! is_invlist(this_definition)) { /* Unknown at this time */ 22513 return newSVsv(this_definition); 22514 } 22515 22516 if (*s != '\n') { 22517 s = strchr(s, '\n'); 22518 if (s == NULL) { 22519 s = e; 22520 } 22521 } 22522 22523 calculate: 22524 22525 switch (op) { 22526 case '+': 22527 _invlist_union(running_definition, this_definition, 22528 &running_definition); 22529 break; 22530 case '-': 22531 _invlist_subtract(running_definition, this_definition, 22532 &running_definition); 22533 break; 22534 case '&': 22535 _invlist_intersection(running_definition, this_definition, 22536 &running_definition); 22537 break; 22538 case '!': 22539 _invlist_union_complement_2nd(running_definition, 22540 this_definition, &running_definition); 22541 break; 22542 default: 22543 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d", 22544 __FILE__, __LINE__, op); 22545 break; 22546 } 22547 22548 /* Position past the '\n' */ 22549 s0 = s + 1; 22550 } /* End of loop through the lines of 'contents' */ 22551 22552 /* Here, we processed all the lines in 'contents' without error. If we 22553 * didn't add any warnings, simply return success */ 22554 if (msgs_length_on_entry == SvCUR(msg)) { 22555 22556 /* If the expansion was empty, the answer isn't nothing: its an empty 22557 * inversion list */ 22558 if (running_definition == NULL) { 22559 running_definition = _new_invlist(1); 22560 } 22561 22562 return running_definition; 22563 } 22564 22565 /* Otherwise, add some explanatory text, but we will return success */ 22566 goto return_msg; 22567 22568 return_failure: 22569 running_definition = NULL; 22570 22571 return_msg: 22572 22573 if (name_len > 0) { 22574 sv_catpvs(msg, " in expansion of "); 22575 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name)); 22576 } 22577 22578 return running_definition; 22579 } 22580 22581 /* As explained below, certain operations need to take place in the first 22582 * thread created. These macros switch contexts */ 22583 #ifdef USE_ITHREADS 22584 # define DECLARATION_FOR_GLOBAL_CONTEXT \ 22585 PerlInterpreter * save_aTHX = aTHX; 22586 # define SWITCH_TO_GLOBAL_CONTEXT \ 22587 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX)) 22588 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX)); 22589 # define CUR_CONTEXT aTHX 22590 # define ORIGINAL_CONTEXT save_aTHX 22591 #else 22592 # define DECLARATION_FOR_GLOBAL_CONTEXT 22593 # define SWITCH_TO_GLOBAL_CONTEXT NOOP 22594 # define RESTORE_CONTEXT NOOP 22595 # define CUR_CONTEXT NULL 22596 # define ORIGINAL_CONTEXT NULL 22597 #endif 22598 22599 STATIC void 22600 S_delete_recursion_entry(pTHX_ void *key) 22601 { 22602 /* Deletes the entry used to detect recursion when expanding user-defined 22603 * properties. This is a function so it can be set up to be called even if 22604 * the program unexpectedly quits */ 22605 22606 dVAR; 22607 SV ** current_entry; 22608 const STRLEN key_len = strlen((const char *) key); 22609 DECLARATION_FOR_GLOBAL_CONTEXT; 22610 22611 SWITCH_TO_GLOBAL_CONTEXT; 22612 22613 /* If the entry is one of these types, it is a permanent entry, and not the 22614 * one used to detect recursions. This function should delete only the 22615 * recursion entry */ 22616 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0); 22617 if ( current_entry 22618 && ! is_invlist(*current_entry) 22619 && ! SvPOK(*current_entry)) 22620 { 22621 (void) hv_delete(PL_user_def_props, (const char *) key, key_len, 22622 G_DISCARD); 22623 } 22624 22625 RESTORE_CONTEXT; 22626 } 22627 22628 STATIC SV * 22629 S_get_fq_name(pTHX_ 22630 const char * const name, /* The first non-blank in the \p{}, \P{} */ 22631 const Size_t name_len, /* Its length in bytes, not including any trailing space */ 22632 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */ 22633 const bool has_colon_colon 22634 ) 22635 { 22636 /* Returns a mortal SV containing the fully qualified version of the input 22637 * name */ 22638 22639 SV * fq_name; 22640 22641 fq_name = newSVpvs_flags("", SVs_TEMP); 22642 22643 /* Use the current package if it wasn't included in our input */ 22644 if (! has_colon_colon) { 22645 const HV * pkg = (IN_PERL_COMPILETIME) 22646 ? PL_curstash 22647 : CopSTASH(PL_curcop); 22648 const char* pkgname = HvNAME(pkg); 22649 22650 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f, 22651 UTF8fARG(is_utf8, strlen(pkgname), pkgname)); 22652 sv_catpvs(fq_name, "::"); 22653 } 22654 22655 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f, 22656 UTF8fARG(is_utf8, name_len, name)); 22657 return fq_name; 22658 } 22659 22660 SV * 22661 Perl_parse_uniprop_string(pTHX_ 22662 22663 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable 22664 * now. If so, the return is an inversion list. 22665 * 22666 * If the property is user-defined, it is a subroutine, which in turn 22667 * may call other subroutines. This function will call the whole nest of 22668 * them to get the definition they return; if some aren't known at the time 22669 * of the call to this function, the fully qualified name of the highest 22670 * level sub is returned. It is an error to call this function at runtime 22671 * without every sub defined. 22672 * 22673 * If an error was found, NULL is returned, and 'msg' gets a suitable 22674 * message appended to it. (Appending allows the back trace of how we got 22675 * to the faulty definition to be displayed through nested calls of 22676 * user-defined subs.) 22677 * 22678 * The caller should NOT try to free any returned inversion list. 22679 * 22680 * Other parameters will be set on return as described below */ 22681 22682 const char * const name, /* The first non-blank in the \p{}, \P{} */ 22683 const Size_t name_len, /* Its length in bytes, not including any 22684 trailing space */ 22685 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */ 22686 const bool to_fold, /* ? Is this under /i */ 22687 const bool runtime, /* TRUE if this is being called at run time */ 22688 const bool deferrable, /* TRUE if it's ok for the definition to not be 22689 known at this call */ 22690 bool *user_defined_ptr, /* Upon return from this function it will be 22691 set to TRUE if any component is a 22692 user-defined property */ 22693 SV * msg, /* Any error or warning msg(s) are appended to 22694 this */ 22695 const STRLEN level) /* Recursion level of this call */ 22696 { 22697 dVAR; 22698 char* lookup_name; /* normalized name for lookup in our tables */ 22699 unsigned lookup_len; /* Its length */ 22700 bool stricter = FALSE; /* Some properties have stricter name 22701 normalization rules, which we decide upon 22702 based on parsing */ 22703 22704 /* nv= or numeric_value=, or possibly one of the cjk numeric properties 22705 * (though it requires extra effort to download them from Unicode and 22706 * compile perl to know about them) */ 22707 bool is_nv_type = FALSE; 22708 22709 unsigned int i, j = 0; 22710 int equals_pos = -1; /* Where the '=' is found, or negative if none */ 22711 int slash_pos = -1; /* Where the '/' is found, or negative if none */ 22712 int table_index = 0; /* The entry number for this property in the table 22713 of all Unicode property names */ 22714 bool starts_with_In_or_Is = FALSE; /* ? Does the name start with 'In' or 22715 'Is' */ 22716 Size_t lookup_offset = 0; /* Used to ignore the first few characters of 22717 the normalized name in certain situations */ 22718 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't 22719 part of a package name */ 22720 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined 22721 property rather than a Unicode 22722 one. */ 22723 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL 22724 if an error. If it is an inversion list, 22725 it is the definition. Otherwise it is a 22726 string containing the fully qualified sub 22727 name of 'name' */ 22728 SV * fq_name = NULL; /* For user-defined properties, the fully 22729 qualified name */ 22730 bool invert_return = FALSE; /* ? Do we need to complement the result before 22731 returning it */ 22732 22733 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING; 22734 22735 /* The input will be normalized into 'lookup_name' */ 22736 Newx(lookup_name, name_len, char); 22737 SAVEFREEPV(lookup_name); 22738 22739 /* Parse the input. */ 22740 for (i = 0; i < name_len; i++) { 22741 char cur = name[i]; 22742 22743 /* Most of the characters in the input will be of this ilk, being parts 22744 * of a name */ 22745 if (isIDCONT_A(cur)) { 22746 22747 /* Case differences are ignored. Our lookup routine assumes 22748 * everything is lowercase, so normalize to that */ 22749 if (isUPPER_A(cur)) { 22750 lookup_name[j++] = toLOWER_A(cur); 22751 continue; 22752 } 22753 22754 if (cur == '_') { /* Don't include these in the normalized name */ 22755 continue; 22756 } 22757 22758 lookup_name[j++] = cur; 22759 22760 /* The first character in a user-defined name must be of this type. 22761 * */ 22762 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) { 22763 could_be_user_defined = FALSE; 22764 } 22765 22766 continue; 22767 } 22768 22769 /* Here, the character is not something typically in a name, But these 22770 * two types of characters (and the '_' above) can be freely ignored in 22771 * most situations. Later it may turn out we shouldn't have ignored 22772 * them, and we have to reparse, but we don't have enough information 22773 * yet to make that decision */ 22774 if (cur == '-' || isSPACE_A(cur)) { 22775 could_be_user_defined = FALSE; 22776 continue; 22777 } 22778 22779 /* An equals sign or single colon mark the end of the first part of 22780 * the property name */ 22781 if ( cur == '=' 22782 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':'))) 22783 { 22784 lookup_name[j++] = '='; /* Treat the colon as an '=' */ 22785 equals_pos = j; /* Note where it occurred in the input */ 22786 could_be_user_defined = FALSE; 22787 break; 22788 } 22789 22790 /* Otherwise, this character is part of the name. */ 22791 lookup_name[j++] = cur; 22792 22793 /* Here it isn't a single colon, so if it is a colon, it must be a 22794 * double colon */ 22795 if (cur == ':') { 22796 22797 /* A double colon should be a package qualifier. We note its 22798 * position and continue. Note that one could have 22799 * pkg1::pkg2::...::foo 22800 * so that the position at the end of the loop will be just after 22801 * the final qualifier */ 22802 22803 i++; 22804 non_pkg_begin = i + 1; 22805 lookup_name[j++] = ':'; 22806 } 22807 else { /* Only word chars (and '::') can be in a user-defined name */ 22808 could_be_user_defined = FALSE; 22809 } 22810 } /* End of parsing through the lhs of the property name (or all of it if 22811 no rhs) */ 22812 22813 #define STRLENs(s) (sizeof("" s "") - 1) 22814 22815 /* If there is a single package name 'utf8::', it is ambiguous. It could 22816 * be for a user-defined property, or it could be a Unicode property, as 22817 * all of them are considered to be for that package. For the purposes of 22818 * parsing the rest of the property, strip it off */ 22819 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) { 22820 lookup_name += STRLENs("utf8::"); 22821 j -= STRLENs("utf8::"); 22822 equals_pos -= STRLENs("utf8::"); 22823 } 22824 22825 /* Here, we are either done with the whole property name, if it was simple; 22826 * or are positioned just after the '=' if it is compound. */ 22827 22828 if (equals_pos >= 0) { 22829 assert(! stricter); /* We shouldn't have set this yet */ 22830 22831 /* Space immediately after the '=' is ignored */ 22832 i++; 22833 for (; i < name_len; i++) { 22834 if (! isSPACE_A(name[i])) { 22835 break; 22836 } 22837 } 22838 22839 /* Most punctuation after the equals indicates a subpattern, like 22840 * \p{foo=/bar/} */ 22841 if ( isPUNCT_A(name[i]) 22842 && name[i] != '-' 22843 && name[i] != '+' 22844 && name[i] != '_' 22845 && name[i] != '{') 22846 { 22847 /* Find the property. The table includes the equals sign, so we 22848 * use 'j' as-is */ 22849 table_index = match_uniprop((U8 *) lookup_name, j); 22850 if (table_index) { 22851 const char * const * prop_values 22852 = UNI_prop_value_ptrs[table_index]; 22853 SV * subpattern; 22854 Size_t subpattern_len; 22855 REGEXP * subpattern_re; 22856 char open = name[i++]; 22857 char close; 22858 const char * pos_in_brackets; 22859 bool escaped = 0; 22860 22861 /* A backslash means the real delimitter is the next character. 22862 * */ 22863 if (open == '\\') { 22864 open = name[i++]; 22865 escaped = 1; 22866 } 22867 22868 /* This data structure is constructed so that the matching 22869 * closing bracket is 3 past its matching opening. The second 22870 * set of closing is so that if the opening is something like 22871 * ']', the closing will be that as well. Something similar is 22872 * done in toke.c */ 22873 pos_in_brackets = strchr("([<)]>)]>", open); 22874 close = (pos_in_brackets) ? pos_in_brackets[3] : open; 22875 22876 if ( i >= name_len 22877 || name[name_len-1] != close 22878 || (escaped && name[name_len-2] != '\\')) 22879 { 22880 sv_catpvs(msg, "Unicode property wildcard not terminated"); 22881 goto append_name_to_msg; 22882 } 22883 22884 Perl_ck_warner_d(aTHX_ 22885 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS), 22886 "The Unicode property wildcards feature is experimental"); 22887 22888 /* Now create and compile the wildcard subpattern. Use /iaa 22889 * because nothing outside of ASCII will match, and it the 22890 * property values should all match /i. Note that when the 22891 * pattern fails to compile, our added text to the user's 22892 * pattern will be displayed to the user, which is not so 22893 * desirable. */ 22894 subpattern_len = name_len - i - 1 - escaped; 22895 subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)", 22896 (unsigned) subpattern_len, 22897 name + i); 22898 subpattern = sv_2mortal(subpattern); 22899 subpattern_re = re_compile(subpattern, 0); 22900 assert(subpattern_re); /* Should have died if didn't compile 22901 successfully */ 22902 22903 /* For each legal property value, see if the supplied pattern 22904 * matches it. */ 22905 while (*prop_values) { 22906 const char * const entry = *prop_values; 22907 const Size_t len = strlen(entry); 22908 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP); 22909 22910 if (pregexec(subpattern_re, 22911 (char *) entry, 22912 (char *) entry + len, 22913 (char *) entry, 0, 22914 entry_sv, 22915 0)) 22916 { /* Here, matched. Add to the returned list */ 22917 Size_t total_len = j + len; 22918 SV * sub_invlist = NULL; 22919 char * this_string; 22920 22921 /* We know this is a legal \p{property=value}. Call 22922 * the function to return the list of code points that 22923 * match it */ 22924 Newxz(this_string, total_len + 1, char); 22925 Copy(lookup_name, this_string, j, char); 22926 my_strlcat(this_string, entry, total_len + 1); 22927 SAVEFREEPV(this_string); 22928 sub_invlist = parse_uniprop_string(this_string, 22929 total_len, 22930 is_utf8, 22931 to_fold, 22932 runtime, 22933 deferrable, 22934 user_defined_ptr, 22935 msg, 22936 level + 1); 22937 _invlist_union(prop_definition, sub_invlist, 22938 &prop_definition); 22939 } 22940 22941 prop_values++; /* Next iteration, look at next propvalue */ 22942 } /* End of looking through property values; (the data 22943 structure is terminated by a NULL ptr) */ 22944 22945 SvREFCNT_dec_NN(subpattern_re); 22946 22947 if (prop_definition) { 22948 return prop_definition; 22949 } 22950 22951 sv_catpvs(msg, "No Unicode property value wildcard matches:"); 22952 goto append_name_to_msg; 22953 } 22954 22955 /* Here's how khw thinks we should proceed to handle the properties 22956 * not yet done: Bidi Mirroring Glyph 22957 Bidi Paired Bracket 22958 Case Folding (both full and simple) 22959 Decomposition Mapping 22960 Equivalent Unified Ideograph 22961 Name 22962 Name Alias 22963 Lowercase Mapping (both full and simple) 22964 NFKC Case Fold 22965 Titlecase Mapping (both full and simple) 22966 Uppercase Mapping (both full and simple) 22967 * Move the part that looks at the property values into a perl 22968 * script, like utf8_heavy.pl is done. This makes things somewhat 22969 * easier, but most importantly, it avoids always adding all these 22970 * strings to the memory usage when the feature is little-used. 22971 * 22972 * The property values would all be concatenated into a single 22973 * string per property with each value on a separate line, and the 22974 * code point it's for on alternating lines. Then we match the 22975 * user's input pattern m//mg, without having to worry about their 22976 * uses of '^' and '$'. Only the values that aren't the default 22977 * would be in the strings. Code points would be in UTF-8. The 22978 * search pattern that we would construct would look like 22979 * (?: \n (code-point_re) \n (?aam: user-re ) \n ) 22980 * And so $1 would contain the code point that matched the user-re. 22981 * For properties where the default is the code point itself, such 22982 * as any of the case changing mappings, the string would otherwise 22983 * consist of all Unicode code points in UTF-8 strung together. 22984 * This would be impractical. So instead, examine their compiled 22985 * pattern, looking at the ssc. If none, reject the pattern as an 22986 * error. Otherwise run the pattern against every code point in 22987 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets 22988 * And it might be good to create an API to return the ssc. 22989 * 22990 * For the name properties, a new function could be created in 22991 * charnames which essentially does the same thing as above, 22992 * sharing Name.pl with the other charname functions. Don't know 22993 * about loose name matching, or algorithmically determined names. 22994 * Decomposition.pl similarly. 22995 * 22996 * It might be that a new pattern modifier would have to be 22997 * created, like /t for resTricTed, which changed the behavior of 22998 * some constructs in their subpattern, like \A. */ 22999 } /* End of is a wildcard subppattern */ 23000 23001 23002 /* Certain properties whose values are numeric need special handling. 23003 * They may optionally be prefixed by 'is'. Ignore that prefix for the 23004 * purposes of checking if this is one of those properties */ 23005 if (memBEGINPs(lookup_name, j, "is")) { 23006 lookup_offset = 2; 23007 } 23008 23009 /* Then check if it is one of these specially-handled properties. The 23010 * possibilities are hard-coded because easier this way, and the list 23011 * is unlikely to change. 23012 * 23013 * All numeric value type properties are of this ilk, and are also 23014 * special in a different way later on. So find those first. There 23015 * are several numeric value type properties in the Unihan DB (which is 23016 * unlikely to be compiled with perl, but we handle it here in case it 23017 * does get compiled). They all end with 'numeric'. The interiors 23018 * aren't checked for the precise property. This would stop working if 23019 * a cjk property were to be created that ended with 'numeric' and 23020 * wasn't a numeric type */ 23021 is_nv_type = memEQs(lookup_name + lookup_offset, 23022 j - 1 - lookup_offset, "numericvalue") 23023 || memEQs(lookup_name + lookup_offset, 23024 j - 1 - lookup_offset, "nv") 23025 || ( memENDPs(lookup_name + lookup_offset, 23026 j - 1 - lookup_offset, "numeric") 23027 && ( memBEGINPs(lookup_name + lookup_offset, 23028 j - 1 - lookup_offset, "cjk") 23029 || memBEGINPs(lookup_name + lookup_offset, 23030 j - 1 - lookup_offset, "k"))); 23031 if ( is_nv_type 23032 || memEQs(lookup_name + lookup_offset, 23033 j - 1 - lookup_offset, "canonicalcombiningclass") 23034 || memEQs(lookup_name + lookup_offset, 23035 j - 1 - lookup_offset, "ccc") 23036 || memEQs(lookup_name + lookup_offset, 23037 j - 1 - lookup_offset, "age") 23038 || memEQs(lookup_name + lookup_offset, 23039 j - 1 - lookup_offset, "in") 23040 || memEQs(lookup_name + lookup_offset, 23041 j - 1 - lookup_offset, "presentin")) 23042 { 23043 unsigned int k; 23044 23045 /* Since the stuff after the '=' is a number, we can't throw away 23046 * '-' willy-nilly, as those could be a minus sign. Other stricter 23047 * rules also apply. However, these properties all can have the 23048 * rhs not be a number, in which case they contain at least one 23049 * alphabetic. In those cases, the stricter rules don't apply. 23050 * But the numeric type properties can have the alphas [Ee] to 23051 * signify an exponent, and it is still a number with stricter 23052 * rules. So look for an alpha that signifies not-strict */ 23053 stricter = TRUE; 23054 for (k = i; k < name_len; k++) { 23055 if ( isALPHA_A(name[k]) 23056 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E'))) 23057 { 23058 stricter = FALSE; 23059 break; 23060 } 23061 } 23062 } 23063 23064 if (stricter) { 23065 23066 /* A number may have a leading '+' or '-'. The latter is retained 23067 * */ 23068 if (name[i] == '+') { 23069 i++; 23070 } 23071 else if (name[i] == '-') { 23072 lookup_name[j++] = '-'; 23073 i++; 23074 } 23075 23076 /* Skip leading zeros including single underscores separating the 23077 * zeros, or between the final leading zero and the first other 23078 * digit */ 23079 for (; i < name_len - 1; i++) { 23080 if ( name[i] != '0' 23081 && (name[i] != '_' || ! isDIGIT_A(name[i+1]))) 23082 { 23083 break; 23084 } 23085 } 23086 } 23087 } 23088 else { /* No '=' */ 23089 23090 /* Only a few properties without an '=' should be parsed with stricter 23091 * rules. The list is unlikely to change. */ 23092 if ( memBEGINPs(lookup_name, j, "perl") 23093 && memNEs(lookup_name + 4, j - 4, "space") 23094 && memNEs(lookup_name + 4, j - 4, "word")) 23095 { 23096 stricter = TRUE; 23097 23098 /* We set the inputs back to 0 and the code below will reparse, 23099 * using strict */ 23100 i = j = 0; 23101 } 23102 } 23103 23104 /* Here, we have either finished the property, or are positioned to parse 23105 * the remainder, and we know if stricter rules apply. Finish out, if not 23106 * already done */ 23107 for (; i < name_len; i++) { 23108 char cur = name[i]; 23109 23110 /* In all instances, case differences are ignored, and we normalize to 23111 * lowercase */ 23112 if (isUPPER_A(cur)) { 23113 lookup_name[j++] = toLOWER(cur); 23114 continue; 23115 } 23116 23117 /* An underscore is skipped, but not under strict rules unless it 23118 * separates two digits */ 23119 if (cur == '_') { 23120 if ( stricter 23121 && ( i == 0 || (int) i == equals_pos || i == name_len- 1 23122 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1]))) 23123 { 23124 lookup_name[j++] = '_'; 23125 } 23126 continue; 23127 } 23128 23129 /* Hyphens are skipped except under strict */ 23130 if (cur == '-' && ! stricter) { 23131 continue; 23132 } 23133 23134 /* XXX Bug in documentation. It says white space skipped adjacent to 23135 * non-word char. Maybe we should, but shouldn't skip it next to a dot 23136 * in a number */ 23137 if (isSPACE_A(cur) && ! stricter) { 23138 continue; 23139 } 23140 23141 lookup_name[j++] = cur; 23142 23143 /* Unless this is a non-trailing slash, we are done with it */ 23144 if (i >= name_len - 1 || cur != '/') { 23145 continue; 23146 } 23147 23148 slash_pos = j; 23149 23150 /* A slash in the 'numeric value' property indicates that what follows 23151 * is a denominator. It can have a leading '+' and '0's that should be 23152 * skipped. But we have never allowed a negative denominator, so treat 23153 * a minus like every other character. (No need to rule out a second 23154 * '/', as that won't match anything anyway */ 23155 if (is_nv_type) { 23156 i++; 23157 if (i < name_len && name[i] == '+') { 23158 i++; 23159 } 23160 23161 /* Skip leading zeros including underscores separating digits */ 23162 for (; i < name_len - 1; i++) { 23163 if ( name[i] != '0' 23164 && (name[i] != '_' || ! isDIGIT_A(name[i+1]))) 23165 { 23166 break; 23167 } 23168 } 23169 23170 /* Store the first real character in the denominator */ 23171 lookup_name[j++] = name[i]; 23172 } 23173 } 23174 23175 /* Here are completely done parsing the input 'name', and 'lookup_name' 23176 * contains a copy, normalized. 23177 * 23178 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and 23179 * different from without the underscores. */ 23180 if ( ( UNLIKELY(memEQs(lookup_name, j, "l")) 23181 || UNLIKELY(memEQs(lookup_name, j, "gc=l"))) 23182 && UNLIKELY(name[name_len-1] == '_')) 23183 { 23184 lookup_name[j++] = '&'; 23185 } 23186 23187 /* If the original input began with 'In' or 'Is', it could be a subroutine 23188 * call to a user-defined property instead of a Unicode property name. */ 23189 if ( non_pkg_begin + name_len > 2 23190 && name[non_pkg_begin+0] == 'I' 23191 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's')) 23192 { 23193 starts_with_In_or_Is = TRUE; 23194 } 23195 else { 23196 could_be_user_defined = FALSE; 23197 } 23198 23199 if (could_be_user_defined) { 23200 CV* user_sub; 23201 23202 /* If the user defined property returns the empty string, it could 23203 * easily be because the pattern is being compiled before the data it 23204 * actually needs to compile is available. This could be argued to be 23205 * a bug in the perl code, but this is a change of behavior for Perl, 23206 * so we handle it. This means that intentionally returning nothing 23207 * will not be resolved until runtime */ 23208 bool empty_return = FALSE; 23209 23210 /* Here, the name could be for a user defined property, which are 23211 * implemented as subs. */ 23212 user_sub = get_cvn_flags(name, name_len, 0); 23213 if (user_sub) { 23214 const char insecure[] = "Insecure user-defined property"; 23215 23216 /* Here, there is a sub by the correct name. Normally we call it 23217 * to get the property definition */ 23218 dSP; 23219 SV * user_sub_sv = MUTABLE_SV(user_sub); 23220 SV * error; /* Any error returned by calling 'user_sub' */ 23221 SV * key; /* The key into the hash of user defined sub names 23222 */ 23223 SV * placeholder; 23224 SV ** saved_user_prop_ptr; /* Hash entry for this property */ 23225 23226 /* How many times to retry when another thread is in the middle of 23227 * expanding the same definition we want */ 23228 PERL_INT_FAST8_T retry_countdown = 10; 23229 23230 DECLARATION_FOR_GLOBAL_CONTEXT; 23231 23232 /* If we get here, we know this property is user-defined */ 23233 *user_defined_ptr = TRUE; 23234 23235 /* We refuse to call a potentially tainted subroutine; returning an 23236 * error instead */ 23237 if (TAINT_get) { 23238 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 23239 sv_catpvn(msg, insecure, sizeof(insecure) - 1); 23240 goto append_name_to_msg; 23241 } 23242 23243 /* In principal, we only call each subroutine property definition 23244 * once during the life of the program. This guarantees that the 23245 * property definition never changes. The results of the single 23246 * sub call are stored in a hash, which is used instead for future 23247 * references to this property. The property definition is thus 23248 * immutable. But, to allow the user to have a /i-dependent 23249 * definition, we call the sub once for non-/i, and once for /i, 23250 * should the need arise, passing the /i status as a parameter. 23251 * 23252 * We start by constructing the hash key name, consisting of the 23253 * fully qualified subroutine name, preceded by the /i status, so 23254 * that there is a key for /i and a different key for non-/i */ 23255 key = newSVpvn(((to_fold) ? "1" : "0"), 1); 23256 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8, 23257 non_pkg_begin != 0); 23258 sv_catsv(key, fq_name); 23259 sv_2mortal(key); 23260 23261 /* We only call the sub once throughout the life of the program 23262 * (with the /i, non-/i exception noted above). That means the 23263 * hash must be global and accessible to all threads. It is 23264 * created at program start-up, before any threads are created, so 23265 * is accessible to all children. But this creates some 23266 * complications. 23267 * 23268 * 1) The keys can't be shared, or else problems arise; sharing is 23269 * turned off at hash creation time 23270 * 2) All SVs in it are there for the remainder of the life of the 23271 * program, and must be created in the same interpreter context 23272 * as the hash, or else they will be freed from the wrong pool 23273 * at global destruction time. This is handled by switching to 23274 * the hash's context to create each SV going into it, and then 23275 * immediately switching back 23276 * 3) All accesses to the hash must be controlled by a mutex, to 23277 * prevent two threads from getting an unstable state should 23278 * they simultaneously be accessing it. The code below is 23279 * crafted so that the mutex is locked whenever there is an 23280 * access and unlocked only when the next stable state is 23281 * achieved. 23282 * 23283 * The hash stores either the definition of the property if it was 23284 * valid, or, if invalid, the error message that was raised. We 23285 * use the type of SV to distinguish. 23286 * 23287 * There's also the need to guard against the definition expansion 23288 * from infinitely recursing. This is handled by storing the aTHX 23289 * of the expanding thread during the expansion. Again the SV type 23290 * is used to distinguish this from the other two cases. If we 23291 * come to here and the hash entry for this property is our aTHX, 23292 * it means we have recursed, and the code assumes that we would 23293 * infinitely recurse, so instead stops and raises an error. 23294 * (Any recursion has always been treated as infinite recursion in 23295 * this feature.) 23296 * 23297 * If instead, the entry is for a different aTHX, it means that 23298 * that thread has gotten here first, and hasn't finished expanding 23299 * the definition yet. We just have to wait until it is done. We 23300 * sleep and retry a few times, returning an error if the other 23301 * thread doesn't complete. */ 23302 23303 re_fetch: 23304 USER_PROP_MUTEX_LOCK; 23305 23306 /* If we have an entry for this key, the subroutine has already 23307 * been called once with this /i status. */ 23308 saved_user_prop_ptr = hv_fetch(PL_user_def_props, 23309 SvPVX(key), SvCUR(key), 0); 23310 if (saved_user_prop_ptr) { 23311 23312 /* If the saved result is an inversion list, it is the valid 23313 * definition of this property */ 23314 if (is_invlist(*saved_user_prop_ptr)) { 23315 prop_definition = *saved_user_prop_ptr; 23316 23317 /* The SV in the hash won't be removed until global 23318 * destruction, so it is stable and we can unlock */ 23319 USER_PROP_MUTEX_UNLOCK; 23320 23321 /* The caller shouldn't try to free this SV */ 23322 return prop_definition; 23323 } 23324 23325 /* Otherwise, if it is a string, it is the error message 23326 * that was returned when we first tried to evaluate this 23327 * property. Fail, and append the message */ 23328 if (SvPOK(*saved_user_prop_ptr)) { 23329 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 23330 sv_catsv(msg, *saved_user_prop_ptr); 23331 23332 /* The SV in the hash won't be removed until global 23333 * destruction, so it is stable and we can unlock */ 23334 USER_PROP_MUTEX_UNLOCK; 23335 23336 return NULL; 23337 } 23338 23339 assert(SvIOK(*saved_user_prop_ptr)); 23340 23341 /* Here, we have an unstable entry in the hash. Either another 23342 * thread is in the middle of expanding the property's 23343 * definition, or we are ourselves recursing. We use the aTHX 23344 * in it to distinguish */ 23345 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) { 23346 23347 /* Here, it's another thread doing the expanding. We've 23348 * looked as much as we are going to at the contents of the 23349 * hash entry. It's safe to unlock. */ 23350 USER_PROP_MUTEX_UNLOCK; 23351 23352 /* Retry a few times */ 23353 if (retry_countdown-- > 0) { 23354 PerlProc_sleep(1); 23355 goto re_fetch; 23356 } 23357 23358 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 23359 sv_catpvs(msg, "Timeout waiting for another thread to " 23360 "define"); 23361 goto append_name_to_msg; 23362 } 23363 23364 /* Here, we are recursing; don't dig any deeper */ 23365 USER_PROP_MUTEX_UNLOCK; 23366 23367 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 23368 sv_catpvs(msg, 23369 "Infinite recursion in user-defined property"); 23370 goto append_name_to_msg; 23371 } 23372 23373 /* Here, this thread has exclusive control, and there is no entry 23374 * for this property in the hash. So we have the go ahead to 23375 * expand the definition ourselves. */ 23376 23377 PUSHSTACKi(PERLSI_MAGIC); 23378 ENTER; 23379 23380 /* Create a temporary placeholder in the hash to detect recursion 23381 * */ 23382 SWITCH_TO_GLOBAL_CONTEXT; 23383 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT)); 23384 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0); 23385 RESTORE_CONTEXT; 23386 23387 /* Now that we have a placeholder, we can let other threads 23388 * continue */ 23389 USER_PROP_MUTEX_UNLOCK; 23390 23391 /* Make sure the placeholder always gets destroyed */ 23392 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key)); 23393 23394 PUSHMARK(SP); 23395 SAVETMPS; 23396 23397 /* Call the user's function, with the /i status as a parameter. 23398 * Note that we have gone to a lot of trouble to keep this call 23399 * from being within the locked mutex region. */ 23400 XPUSHs(boolSV(to_fold)); 23401 PUTBACK; 23402 23403 /* The following block was taken from swash_init(). Presumably 23404 * they apply to here as well, though we no longer use a swash -- 23405 * khw */ 23406 SAVEHINTS(); 23407 save_re_context(); 23408 /* We might get here via a subroutine signature which uses a utf8 23409 * parameter name, at which point PL_subname will have been set 23410 * but not yet used. */ 23411 save_item(PL_subname); 23412 23413 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR); 23414 23415 SPAGAIN; 23416 23417 error = ERRSV; 23418 if (TAINT_get || SvTRUE(error)) { 23419 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 23420 if (SvTRUE(error)) { 23421 sv_catpvs(msg, "Error \""); 23422 sv_catsv(msg, error); 23423 sv_catpvs(msg, "\""); 23424 } 23425 if (TAINT_get) { 23426 if (SvTRUE(error)) sv_catpvs(msg, "; "); 23427 sv_catpvn(msg, insecure, sizeof(insecure) - 1); 23428 } 23429 23430 if (name_len > 0) { 23431 sv_catpvs(msg, " in expansion of "); 23432 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, 23433 name_len, 23434 name)); 23435 } 23436 23437 (void) POPs; 23438 prop_definition = NULL; 23439 } 23440 else { /* G_SCALAR guarantees a single return value */ 23441 SV * contents = POPs; 23442 23443 /* The contents is supposed to be the expansion of the property 23444 * definition. If the definition is deferrable, and we got an 23445 * empty string back, set a flag to later defer it (after clean 23446 * up below). */ 23447 if ( deferrable 23448 && (! SvPOK(contents) || SvCUR(contents) == 0)) 23449 { 23450 empty_return = TRUE; 23451 } 23452 else { /* Otherwise, call a function to check for valid syntax, 23453 and handle it */ 23454 23455 prop_definition = handle_user_defined_property( 23456 name, name_len, 23457 is_utf8, to_fold, runtime, 23458 deferrable, 23459 contents, user_defined_ptr, 23460 msg, 23461 level); 23462 } 23463 } 23464 23465 /* Here, we have the results of the expansion. Delete the 23466 * placeholder, and if the definition is now known, replace it with 23467 * that definition. We need exclusive access to the hash, and we 23468 * can't let anyone else in, between when we delete the placeholder 23469 * and add the permanent entry */ 23470 USER_PROP_MUTEX_LOCK; 23471 23472 S_delete_recursion_entry(aTHX_ SvPVX(key)); 23473 23474 if ( ! empty_return 23475 && (! prop_definition || is_invlist(prop_definition))) 23476 { 23477 /* If we got success we use the inversion list defining the 23478 * property; otherwise use the error message */ 23479 SWITCH_TO_GLOBAL_CONTEXT; 23480 (void) hv_store_ent(PL_user_def_props, 23481 key, 23482 ((prop_definition) 23483 ? newSVsv(prop_definition) 23484 : newSVsv(msg)), 23485 0); 23486 RESTORE_CONTEXT; 23487 } 23488 23489 /* All done, and the hash now has a permanent entry for this 23490 * property. Give up exclusive control */ 23491 USER_PROP_MUTEX_UNLOCK; 23492 23493 FREETMPS; 23494 LEAVE; 23495 POPSTACK; 23496 23497 if (empty_return) { 23498 goto definition_deferred; 23499 } 23500 23501 if (prop_definition) { 23502 23503 /* If the definition is for something not known at this time, 23504 * we toss it, and go return the main property name, as that's 23505 * the one the user will be aware of */ 23506 if (! is_invlist(prop_definition)) { 23507 SvREFCNT_dec_NN(prop_definition); 23508 goto definition_deferred; 23509 } 23510 23511 sv_2mortal(prop_definition); 23512 } 23513 23514 /* And return */ 23515 return prop_definition; 23516 23517 } /* End of calling the subroutine for the user-defined property */ 23518 } /* End of it could be a user-defined property */ 23519 23520 /* Here it wasn't a user-defined property that is known at this time. See 23521 * if it is a Unicode property */ 23522 23523 lookup_len = j; /* This is a more mnemonic name than 'j' */ 23524 23525 /* Get the index into our pointer table of the inversion list corresponding 23526 * to the property */ 23527 table_index = match_uniprop((U8 *) lookup_name, lookup_len); 23528 23529 /* If it didn't find the property ... */ 23530 if (table_index == 0) { 23531 23532 /* Try again stripping off any initial 'In' or 'Is' */ 23533 if (starts_with_In_or_Is) { 23534 lookup_name += 2; 23535 lookup_len -= 2; 23536 equals_pos -= 2; 23537 slash_pos -= 2; 23538 23539 table_index = match_uniprop((U8 *) lookup_name, lookup_len); 23540 } 23541 23542 if (table_index == 0) { 23543 char * canonical; 23544 23545 /* Here, we didn't find it. If not a numeric type property, and 23546 * can't be a user-defined one, it isn't a legal property */ 23547 if (! is_nv_type) { 23548 if (! could_be_user_defined) { 23549 goto failed; 23550 } 23551 23552 /* Here, the property name is legal as a user-defined one. At 23553 * compile time, it might just be that the subroutine for that 23554 * property hasn't been encountered yet, but at runtime, it's 23555 * an error to try to use an undefined one */ 23556 if (! deferrable) { 23557 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 23558 sv_catpvs(msg, "Unknown user-defined property name"); 23559 goto append_name_to_msg; 23560 } 23561 23562 goto definition_deferred; 23563 } /* End of isn't a numeric type property */ 23564 23565 /* The numeric type properties need more work to decide. What we 23566 * do is make sure we have the number in canonical form and look 23567 * that up. */ 23568 23569 if (slash_pos < 0) { /* No slash */ 23570 23571 /* When it isn't a rational, take the input, convert it to a 23572 * NV, then create a canonical string representation of that 23573 * NV. */ 23574 23575 NV value; 23576 SSize_t value_len = lookup_len - equals_pos; 23577 23578 /* Get the value */ 23579 if ( value_len <= 0 23580 || my_atof3(lookup_name + equals_pos, &value, 23581 value_len) 23582 != lookup_name + lookup_len) 23583 { 23584 goto failed; 23585 } 23586 23587 /* If the value is an integer, the canonical value is integral 23588 * */ 23589 if (Perl_ceil(value) == value) { 23590 canonical = Perl_form(aTHX_ "%.*s%.0" NVff, 23591 equals_pos, lookup_name, value); 23592 } 23593 else { /* Otherwise, it is %e with a known precision */ 23594 char * exp_ptr; 23595 23596 canonical = Perl_form(aTHX_ "%.*s%.*" NVef, 23597 equals_pos, lookup_name, 23598 PL_E_FORMAT_PRECISION, value); 23599 23600 /* The exponent generated is expecting two digits, whereas 23601 * %e on some systems will generate three. Remove leading 23602 * zeros in excess of 2 from the exponent. We start 23603 * looking for them after the '=' */ 23604 exp_ptr = strchr(canonical + equals_pos, 'e'); 23605 if (exp_ptr) { 23606 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */ 23607 SSize_t excess_exponent_len = strlen(cur_ptr) - 2; 23608 23609 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+'); 23610 23611 if (excess_exponent_len > 0) { 23612 SSize_t leading_zeros = strspn(cur_ptr, "0"); 23613 SSize_t excess_leading_zeros 23614 = MIN(leading_zeros, excess_exponent_len); 23615 if (excess_leading_zeros > 0) { 23616 Move(cur_ptr + excess_leading_zeros, 23617 cur_ptr, 23618 strlen(cur_ptr) - excess_leading_zeros 23619 + 1, /* Copy the NUL as well */ 23620 char); 23621 } 23622 } 23623 } 23624 } 23625 } 23626 else { /* Has a slash. Create a rational in canonical form */ 23627 UV numerator, denominator, gcd, trial; 23628 const char * end_ptr; 23629 const char * sign = ""; 23630 23631 /* We can't just find the numerator, denominator, and do the 23632 * division, then use the method above, because that is 23633 * inexact. And the input could be a rational that is within 23634 * epsilon (given our precision) of a valid rational, and would 23635 * then incorrectly compare valid. 23636 * 23637 * We're only interested in the part after the '=' */ 23638 const char * this_lookup_name = lookup_name + equals_pos; 23639 lookup_len -= equals_pos; 23640 slash_pos -= equals_pos; 23641 23642 /* Handle any leading minus */ 23643 if (this_lookup_name[0] == '-') { 23644 sign = "-"; 23645 this_lookup_name++; 23646 lookup_len--; 23647 slash_pos--; 23648 } 23649 23650 /* Convert the numerator to numeric */ 23651 end_ptr = this_lookup_name + slash_pos; 23652 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) { 23653 goto failed; 23654 } 23655 23656 /* It better have included all characters before the slash */ 23657 if (*end_ptr != '/') { 23658 goto failed; 23659 } 23660 23661 /* Set to look at just the denominator */ 23662 this_lookup_name += slash_pos; 23663 lookup_len -= slash_pos; 23664 end_ptr = this_lookup_name + lookup_len; 23665 23666 /* Convert the denominator to numeric */ 23667 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) { 23668 goto failed; 23669 } 23670 23671 /* It better be the rest of the characters, and don't divide by 23672 * 0 */ 23673 if ( end_ptr != this_lookup_name + lookup_len 23674 || denominator == 0) 23675 { 23676 goto failed; 23677 } 23678 23679 /* Get the greatest common denominator using 23680 http://en.wikipedia.org/wiki/Euclidean_algorithm */ 23681 gcd = numerator; 23682 trial = denominator; 23683 while (trial != 0) { 23684 UV temp = trial; 23685 trial = gcd % trial; 23686 gcd = temp; 23687 } 23688 23689 /* If already in lowest possible terms, we have already tried 23690 * looking this up */ 23691 if (gcd == 1) { 23692 goto failed; 23693 } 23694 23695 /* Reduce the rational, which should put it in canonical form 23696 * */ 23697 numerator /= gcd; 23698 denominator /= gcd; 23699 23700 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf, 23701 equals_pos, lookup_name, sign, numerator, denominator); 23702 } 23703 23704 /* Here, we have the number in canonical form. Try that */ 23705 table_index = match_uniprop((U8 *) canonical, strlen(canonical)); 23706 if (table_index == 0) { 23707 goto failed; 23708 } 23709 } /* End of still didn't find the property in our table */ 23710 } /* End of didn't find the property in our table */ 23711 23712 /* Here, we have a non-zero return, which is an index into a table of ptrs. 23713 * A negative return signifies that the real index is the absolute value, 23714 * but the result needs to be inverted */ 23715 if (table_index < 0) { 23716 invert_return = TRUE; 23717 table_index = -table_index; 23718 } 23719 23720 /* Out-of band indices indicate a deprecated property. The proper index is 23721 * modulo it with the table size. And dividing by the table size yields 23722 * an offset into a table constructed by regen/mk_invlists.pl to contain 23723 * the corresponding warning message */ 23724 if (table_index > MAX_UNI_KEYWORD_INDEX) { 23725 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX; 23726 table_index %= MAX_UNI_KEYWORD_INDEX; 23727 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), 23728 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s", 23729 (int) name_len, name, deprecated_property_msgs[warning_offset]); 23730 } 23731 23732 /* In a few properties, a different property is used under /i. These are 23733 * unlikely to change, so are hard-coded here. */ 23734 if (to_fold) { 23735 if ( table_index == UNI_XPOSIXUPPER 23736 || table_index == UNI_XPOSIXLOWER 23737 || table_index == UNI_TITLE) 23738 { 23739 table_index = UNI_CASED; 23740 } 23741 else if ( table_index == UNI_UPPERCASELETTER 23742 || table_index == UNI_LOWERCASELETTER 23743 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */ 23744 || table_index == UNI_TITLECASELETTER 23745 # endif 23746 ) { 23747 table_index = UNI_CASEDLETTER; 23748 } 23749 else if ( table_index == UNI_POSIXUPPER 23750 || table_index == UNI_POSIXLOWER) 23751 { 23752 table_index = UNI_POSIXALPHA; 23753 } 23754 } 23755 23756 /* Create and return the inversion list */ 23757 prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]); 23758 sv_2mortal(prop_definition); 23759 23760 23761 /* See if there is a private use override to add to this definition */ 23762 { 23763 COPHH * hinthash = (IN_PERL_COMPILETIME) 23764 ? CopHINTHASH_get(&PL_compiling) 23765 : CopHINTHASH_get(PL_curcop); 23766 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0); 23767 23768 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) { 23769 23770 /* See if there is an element in the hints hash for this table */ 23771 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index); 23772 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup)); 23773 23774 if (pos) { 23775 bool dummy; 23776 SV * pu_definition; 23777 SV * pu_invlist; 23778 SV * expanded_prop_definition = 23779 sv_2mortal(invlist_clone(prop_definition, NULL)); 23780 23781 /* If so, it's definition is the string from here to the next 23782 * \a character. And its format is the same as a user-defined 23783 * property */ 23784 pos += SvCUR(pu_lookup); 23785 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos); 23786 pu_invlist = handle_user_defined_property(lookup_name, 23787 lookup_len, 23788 0, /* Not UTF-8 */ 23789 0, /* Not folded */ 23790 runtime, 23791 deferrable, 23792 pu_definition, 23793 &dummy, 23794 msg, 23795 level); 23796 if (TAINT_get) { 23797 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 23798 sv_catpvs(msg, "Insecure private-use override"); 23799 goto append_name_to_msg; 23800 } 23801 23802 /* For now, as a safety measure, make sure that it doesn't 23803 * override non-private use code points */ 23804 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist); 23805 23806 /* Add it to the list to be returned */ 23807 _invlist_union(prop_definition, pu_invlist, 23808 &expanded_prop_definition); 23809 prop_definition = expanded_prop_definition; 23810 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental"); 23811 } 23812 } 23813 } 23814 23815 if (invert_return) { 23816 _invlist_invert(prop_definition); 23817 } 23818 return prop_definition; 23819 23820 23821 failed: 23822 if (non_pkg_begin != 0) { 23823 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 23824 sv_catpvs(msg, "Illegal user-defined property name"); 23825 } 23826 else { 23827 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 23828 sv_catpvs(msg, "Can't find Unicode property definition"); 23829 } 23830 /* FALLTHROUGH */ 23831 23832 append_name_to_msg: 23833 { 23834 const char * prefix = (runtime && level == 0) ? " \\p{" : " \""; 23835 const char * suffix = (runtime && level == 0) ? "}" : "\""; 23836 23837 sv_catpv(msg, prefix); 23838 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name)); 23839 sv_catpv(msg, suffix); 23840 } 23841 23842 return NULL; 23843 23844 definition_deferred: 23845 23846 /* Here it could yet to be defined, so defer evaluation of this 23847 * until its needed at runtime. We need the fully qualified property name 23848 * to avoid ambiguity, and a trailing newline */ 23849 if (! fq_name) { 23850 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8, 23851 non_pkg_begin != 0 /* If has "::" */ 23852 ); 23853 } 23854 sv_catpvs(fq_name, "\n"); 23855 23856 *user_defined_ptr = TRUE; 23857 return fq_name; 23858 } 23859 23860 #endif 23861 23862 /* 23863 * ex: set ts=8 sts=4 sw=4 et: 23864 */ 23865