#ifndef REGCOMP_INTERNAL_H
#define REGCOMP_INTERNAL_H
#ifndef STATIC
#define STATIC  static
#endif
#ifndef RE_OPTIMIZE_CURLYX_TO_CURLYM
#define RE_OPTIMIZE_CURLYX_TO_CURLYM 1
#endif
#ifndef RE_OPTIMIZE_CURLYX_TO_CURLYN
#define RE_OPTIMIZE_CURLYX_TO_CURLYN 1
#endif

/* This is a chain of data about sub patterns we are processing that
   need to be handled separately/specially in study_chunk. It's so
   we can simulate recursion without losing state.  */
struct scan_frame;
typedef struct scan_frame {
    regnode *last_regnode;      /* last node to process in this frame */
    regnode *next_regnode;      /* next node to process when last is reached */
    U32 prev_recursed_depth;
    I32 stopparen;              /* what stopparen do we use */
    bool in_gosub;              /* this or an outer frame is for GOSUB */

    struct scan_frame *this_prev_frame; /* this previous frame */
    struct scan_frame *prev_frame;      /* previous frame */
    struct scan_frame *next_frame;      /* next frame */
} scan_frame;

/* Certain characters are output as a sequence with the first being a
 * backslash. */
#define isBACKSLASHED_PUNCT(c)  memCHRs("-[]\\^", c)


/* The regex compiler's working state.  The RExC_* accessor macros that
 * follow this struct each expand to a member of the struct pointed to by a
 * variable named 'pRExC_state'. */
struct RExC_state_t {
    U32         flags;                  /* RXf_* are we folding, multilining? */
    U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
    char        *precomp;               /* uncompiled string. */
    char        *precomp_end;           /* pointer to end of uncompiled string. */
    REGEXP      *rx_sv;                 /* The SV that is the regexp. */
    regexp      *rx;                    /* perl core regexp structure */
    regexp_internal     *rxi;           /* internal data for regexp object
                                           pprivate field */
    char        *start;                 /* Start of input for compile */
    char        *end;                   /* End of input for compile */
    char        *parse;                 /* Input-scan pointer. */
    char        *copy_start;            /* start of copy of input within
                                           constructed parse string */
    char        *save_copy_start;       /* Provides one level of saving
                                           and restoring 'copy_start' */
    char        *copy_start_in_input;   /* Position in input string
                                           corresponding to copy_start */
    SSize_t     whilem_seen;            /* number of WHILEM in this expr */
    regnode     *emit_start;            /* Start of emitted-code area */
    regnode_offset emit;                /* Code-emit pointer */
    I32         naughty;                /* How bad is this pattern? */
    I32         sawback;                /* Did we see \1, ...? */
    SSize_t     size;                   /* Number of regnode equivalents in
                                           pattern */
    Size_t      sets_depth;             /* Counts recursion depth of already-
                                           compiled regex set patterns */
    U32         seen;

    I32      parens_buf_size;           /* #slots malloced open/close_parens */
    regnode_offset *open_parens;        /* offsets to open parens */
    regnode_offset *close_parens;       /* offsets to close parens */
    HV          *paren_names;           /* Paren names */

    /* position beyond 'precomp' of the warning message furthest away from
     * 'precomp'.  During the parse, no warnings are raised for any problems
     * earlier in the parse than this position.  This works if warnings are
     * raised the first time a given spot is parsed, and if only one
     * independent warning is raised for any given spot */
    Size_t      latest_warn_offset;

    /* Branch reset /(?|...|...)/ gives us two concepts of capture buffer id.
     * "Logical Parno" is the user visible view with branch reset taken into
     * account. "Parno" (or physical parno) is the actual capture buffers in
     * the pattern *NOT* taking into account branch reset. We also maintain
     * a map of "next" pointers which allow us to skip to the next physical
     * capture buffer with the same logical id, with 0 representing "none".
     *
     * As we compile we keep track of the two different counts using the
     * 'logical_npar' and 'npar' members, and we keep track of the upper bound
     * of both in 'total_par' and 'logical_total_par', we also populate
     * the 'logical_to_parno' map, which gives us the first physical parno
     * for a given logical parno, and the `parno_to_logical` array which gives
     * us the logical id for each physical parno. When compilation is
     * completed we construct the 'parno_to_logical_next' array from the
     * 'parno_to_logical' array. (We do not bother constructing it during
     * compilation as we do not need it, and we can construct it in O(N) time
     * once we are done, but would need more complicated logic during the
     * compile, because we want the next pointers to go from smallest to
     * largest, eg, left to right.)
     *
     *   Logical:  $1     $2  $3  $4    $2  $3    $2     $5
     *   Physical: 1      2   3   4     5   6     7      8
     *   Next:     0      5   6   0     7   0     0      0
     *   Pattern /(a) (?| (b) (c) (d) | (e) (f) | (g) ) (h)/
     *
     * As much as possible the internals use and store the physical id of
     * capture buffers. We decode the physical to the logical only when
     * we need to, for instance when someone uses $2.
     *
     * Note that when branch reset is not used logical and physical are the
     * same and the next data would be all zero. So when branch reset is not
     * used we do not need to populate this data into the final regexp.
     *
     */
    I32         *logical_to_parno;      /* logical_parno to parno */
    I32         *parno_to_logical;      /* parno to logical_parno */
    I32         *parno_to_logical_next; /* parno to next (greater value)
                                           parno with the same
                                           logical_parno as parno.*/

    I32         npar;                   /* Capture buffer count so far in the
                                           parse, (OPEN) plus one. ("par" 0 is
                                           the whole pattern)*/
    I32         logical_npar;           /* Logical version of npar */
    I32         total_par;              /* During initial parse, is either 0,
                                           or -1; the latter indicating a
                                           reparse is needed.  After that pass,
                                           it is what 'npar' became after the
                                           pass.  Hence, it being > 0 indicates
                                           we are in a reparse situation */
    I32         logical_total_par;      /* Logical version of total_par */
    I32         nestroot;               /* root parens we are in - used by
                                           accept */
    I32         seen_zerolen;
    regnode     *end_op;                /* END node in program */
    I32         utf8;           /* whether the pattern is utf8 or not */
    I32         orig_utf8;      /* whether the pattern was originally in utf8 */
                                /* XXX use this for future optimisation of case
                                 * where pattern must be upgraded to utf8. */
    I32         uni_semantics;  /* If a d charset modifier should use unicode
                                   rules, even if the pattern is not in
                                   utf8 */

    I32         recurse_count;          /* Number of recurse regops we have generated */
    regnode     **recurse;              /* Recurse regops */
    U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
                                           through */
    U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
    I32         in_lookaround;
    I32         contains_locale;
    I32         override_recoding;
    I32         recode_x_to_native;
    I32         in_multi_char_class;
    int         code_index;             /* next code_blocks[] slot */
    struct reg_code_blocks *code_blocks;/* positions of literal (?{})
                                            within pattern */
    SSize_t     maxlen;                 /* NOTE(review): this was described as
                                           the "minimum possible number of
                                           chars in string to match", but the
                                           field name suggests a maximum --
                                           confirm against the users of
                                           RExC_maxlen */
    scan_frame *frame_head;
    scan_frame *frame_last;
    U32         frame_count;
    AV         *warn_text;
    HV         *unlexed_names;
    SV          *runtime_code_qr;       /* qr with the runtime code blocks */
    bool        seen_d_op;
    bool        strict;
    bool        study_started;
    bool        in_script_run;
    bool        use_BRANCHJ;
    bool        sWARN_EXPERIMENTAL__VLB;
    bool        sWARN_EXPERIMENTAL__REGEX_SETS;
    /* DEBUGGING only fields, keep these LAST so that we do not
     * have any weirdness with static builds.
     *
     * We include these if we are building a DEBUGGING perl OR if we
     * are not using dynamic linking (USE_DYNAMIC_LOADING).
     *
     * See GH Issue #21558 and also ba6e2c38aafc23cf114f3ba0d0ff3baead34328b
     */
#if defined(DEBUGGING) || !defined(USE_DYNAMIC_LOADING)
    const char  *lastparse;
    I32         lastnum;
    U32         study_chunk_recursed_count;
    AV          *paren_name_list;       /* idx -> name */
    SV          *mysv1;
    SV          *mysv2;
#endif
};

/* Accessors for the debugging-only members.  Note that these are defined
 * only under DEBUGGING, whereas the members themselves also exist when
 * USE_DYNAMIC_LOADING is not defined. */
#ifdef DEBUGGING
#define RExC_lastparse  (pRExC_state->lastparse)
#define RExC_lastnum    (pRExC_state->lastnum)
#define RExC_paren_name_list    (pRExC_state->paren_name_list)
#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
#define RExC_mysv       (pRExC_state->mysv1)    /* same member as RExC_mysv1 */
#define RExC_mysv1      (pRExC_state->mysv1)
#define RExC_mysv2      (pRExC_state->mysv2)
#endif

#define RExC_flags      (pRExC_state->flags)
#define RExC_pm_flags   (pRExC_state->pm_flags)
#define RExC_precomp    (pRExC_state->precomp)
#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
#define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
#define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
#define RExC_precomp_end (pRExC_state->precomp_end)
#define RExC_rx_sv      (pRExC_state->rx_sv)
#define RExC_rx         (pRExC_state->rx)
#define RExC_rxi        (pRExC_state->rxi)
#define RExC_start      (pRExC_state->start)
#define RExC_end        (pRExC_state->end)
#define RExC_parse      (pRExC_state->parse)
#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
#define RExC_whilem_seen        (pRExC_state->whilem_seen)
#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
                                                   under /d from /u ?
*/

/* More accessors: each expands to the like-named member (or, where the
 * names differ, the member shown) of the struct pointed to by 'pRExC_state'. */
#define RExC_emit       (pRExC_state->emit)
#define RExC_emit_start (pRExC_state->emit_start)
#define RExC_sawback    (pRExC_state->sawback)
#define RExC_seen       (pRExC_state->seen)
#define RExC_size       (pRExC_state->size)
#define RExC_maxlen        (pRExC_state->maxlen)
#define RExC_logical_npar           (pRExC_state->logical_npar)
#define RExC_logical_total_parens   (pRExC_state->logical_total_par)
#define RExC_logical_to_parno       (pRExC_state->logical_to_parno)
#define RExC_parno_to_logical       (pRExC_state->parno_to_logical)
#define RExC_parno_to_logical_next  (pRExC_state->parno_to_logical_next)
#define RExC_npar       (pRExC_state->npar)
#define RExC_total_parens       (pRExC_state->total_par)
#define RExC_parens_buf_size    (pRExC_state->parens_buf_size)
#define RExC_nestroot   (pRExC_state->nestroot)
#define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
#define RExC_utf8       (pRExC_state->utf8)
#define RExC_uni_semantics      (pRExC_state->uni_semantics)
#define RExC_orig_utf8  (pRExC_state->orig_utf8)
#define RExC_open_parens        (pRExC_state->open_parens)
#define RExC_close_parens       (pRExC_state->close_parens)
#define RExC_end_op     (pRExC_state->end_op)
#define RExC_paren_names        (pRExC_state->paren_names)
#define RExC_recurse    (pRExC_state->recurse)
#define RExC_recurse_count      (pRExC_state->recurse_count)
#define RExC_sets_depth         (pRExC_state->sets_depth)
#define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
#define RExC_study_chunk_recursed_bytes  \
                                   (pRExC_state->study_chunk_recursed_bytes)
#define RExC_in_lookaround      (pRExC_state->in_lookaround)
#define RExC_contains_locale    (pRExC_state->contains_locale)
#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)

/* Stores 'x' into RExC_recode_x_to_native on EBCDIC builds; expands to NOOP
 * (and so does not evaluate 'x') otherwise. */
#ifdef EBCDIC
#  define SET_recode_x_to_native(x)                                         \
                    STMT_START { RExC_recode_x_to_native = (x); } STMT_END
#else
#  define SET_recode_x_to_native(x) NOOP
#endif

#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
#define RExC_frame_head (pRExC_state->frame_head)
#define RExC_frame_last (pRExC_state->frame_last)
#define RExC_frame_count (pRExC_state->frame_count)
#define RExC_strict (pRExC_state->strict)
#define RExC_study_started      (pRExC_state->study_started)
#define RExC_warn_text (pRExC_state->warn_text)
#define RExC_in_script_run      (pRExC_state->in_script_run)
#define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
#define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
#define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
#define RExC_unlexed_names (pRExC_state->unlexed_names)


/***********************************************************************/
/* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse
 *
 * All of these macros depend on the above RExC_ accessor macros, which
 * in turn depend on a variable pRExC_state being in scope where they
 * are used. This is the standard regexp parser context variable which is
 * passed into every non-trivial parse function in this file.
 *
 * Note that the UTF macro is itself a wrapper around RExC_utf8, so all
 * of the macros which do not take an argument will operate on the
 * pRExC_state structure *only*.
 *
 * Please do NOT modify RExC_parse without using these macros. In the
 * future these macros will be extended for enhanced debugging and trace
 * output during the parse process.
 */

/* RExC_parse_incf(flag)
 *
 * Increment RExC_parse to point at the next codepoint, while doing
 * the right thing depending on whether we are parsing UTF-8 strings
 * or not. The 'flag' argument determines if content is UTF-8 or not,
 * intended for cases where this is NOT governed by the UTF macro.
 *
 * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro.
*
 * WARNING: Does NOT take into account RExC_end; it is the caller's
 * responsibility to make sure there are enough octets left in
 * RExC_parse to ensure that when processing UTF-8 we would not read
 * past the end of the string.
 */
#define RExC_parse_incf(flag) STMT_START {              \
    RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1;    \
} STMT_END

/* RExC_parse_inc_safef(flag)
 *
 * Safely increment RExC_parse to point at the next codepoint,
 * doing the right thing depending on whether we are parsing
 * UTF-8 strings or not and NOT reading past the end of the buffer.
 * The 'flag' argument determines if content is UTF-8 or not,
 * intended for cases where this is NOT governed by the UTF macro.
 *
 * Use RExC_parse_inc_safe() if UTF-8ness is controlled by the UTF macro.
 *
 * NOTE: Will NOT read past RExC_end when content is UTF-8.
 */
#define RExC_parse_inc_safef(flag) STMT_START {                     \
    RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \
} STMT_END

/* RExC_parse_inc()
 *
 * Increment RExC_parse to point at the next codepoint,
 * doing the right thing depending on whether we are parsing
 * UTF-8 strings or not.
 *
 * WARNING: Does NOT take into account RExC_end, it is the caller's
 * responsibility to make sure there are enough octets left in
 * RExC_parse to ensure that when processing UTF-8 we would not read
 * past the end of the string.
 *
 * NOTE: whether we are parsing UTF-8 or not is determined by the
 * UTF macro which is defined as cBOOL(RExC_utf8), thus this
 * macro operates on the pRExC_state structure only.
 */
#define RExC_parse_inc() RExC_parse_incf(UTF)

/* RExC_parse_inc_safe()
 *
 * Safely increment RExC_parse to point at the next codepoint,
 * doing the right thing depending on whether we are parsing
 * UTF-8 strings or not and NOT reading past the end of the buffer.
 *
 * NOTE: whether we are parsing UTF-8 or not is determined by the
 * UTF macro which is defined as cBOOL(RExC_utf8), thus this
 * macro operates on the pRExC_state structure only.
 */
#define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF)

/* RExC_parse_inc_utf8()
 *
 * Increment RExC_parse to point at the next utf8 codepoint,
 * assumes content is UTF-8.
 *
 * WARNING: Does NOT take into account RExC_end; it is the caller's
 * responsibility to make sure there are enough octets left in RExC_parse
 * to ensure that when processing UTF-8 we would not read past the end
 * of the string.
 */
#define RExC_parse_inc_utf8() STMT_START {  \
    RExC_parse += UTF8SKIP(RExC_parse);     \
} STMT_END

/* RExC_parse_inc_if_char()
 *
 * Increment RExC_parse to point at the next codepoint, if and only
 * if the current parse point is NOT a NULL, while doing the right thing
 * depending on whether we are parsing UTF-8 strings or not.
 *
 * WARNING: Does NOT take into account RExC_end, it is the caller's
 * responsibility to make sure there are enough octets left in RExC_parse
 * to ensure that when processing UTF-8 we would not read past the end
 * of the string.
 *
 * NOTE(review): the macro does pass RExC_end to SKIP_IF_CHAR(); whether
 * that bounds the read depends on SKIP_IF_CHAR, which is defined
 * elsewhere -- confirm, and reconcile with the WARNING above.
 *
 * NOTE: whether we are parsing UTF-8 or not is determined by the
 * UTF macro which is defined as cBOOL(RExC_utf8), thus this
 * macro operates on the pRExC_state structure only.
 */
#define RExC_parse_inc_if_char() STMT_START {         \
    RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end);  \
} STMT_END

/* RExC_parse_inc_by(n_octets)
 *
 * Increment the parse cursor by the number of octets specified by
 * the 'n_octets' argument.
 *
 * NOTE: Does NOT check ANY constraints. It is the caller's responsibility
 * that this will not move past the end of the string, or leave the
 * pointer in the middle of a UTF-8 sequence.
386 * 387 * Typically used to advanced past previously analyzed content. 388 */ 389 #define RExC_parse_inc_by(n_octets) STMT_START { \ 390 RExC_parse += (n_octets); \ 391 } STMT_END 392 393 /* RExC_parse_set(to_ptr) 394 * 395 * Sets the RExC_parse pointer to the pointer specified by the 'to' 396 * argument. No validation whatsoever is performed on the to pointer. 397 */ 398 #define RExC_parse_set(to_ptr) STMT_START { \ 399 RExC_parse = (to_ptr); \ 400 } STMT_END 401 402 /**********************************************************************/ 403 404 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set 405 * a flag to disable back-off on the fixed/floating substrings - if it's 406 * a high complexity pattern we assume the benefit of avoiding a full match 407 * is worth the cost of checking for the substrings even if they rarely help. 408 */ 409 #define RExC_naughty (pRExC_state->naughty) 410 #define TOO_NAUGHTY (10) 411 #define MARK_NAUGHTY(add) \ 412 if (RExC_naughty < TOO_NAUGHTY) \ 413 RExC_naughty += (add) 414 #define MARK_NAUGHTY_EXP(exp, add) \ 415 if (RExC_naughty < TOO_NAUGHTY) \ 416 RExC_naughty += RExC_naughty / (exp) + (add) 417 418 #define isNON_BRACE_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?') 419 #define isQUANTIFIER(s,e) ( isNON_BRACE_QUANTIFIER(*s) \ 420 || ((*s) == '{' && regcurly(s, e, NULL))) 421 422 /* 423 * Flags to be passed up. 424 */ 425 #define HASWIDTH 0x01 /* Known to not match null strings, could match 426 non-null ones. */ 427 #define SIMPLE 0x02 /* Exactly one character wide */ 428 /* (or LNBREAK as a special case) */ 429 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ 430 #define TRYAGAIN 0x10 /* Weeded out a declaration. */ 431 #define RESTART_PARSE 0x20 /* Need to redo the parse */ 432 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to 433 calcuate sizes as UTF-8 */ 434 435 #define REG_NODE_NUM(x) ((x) ? 
(int)((x)-RExC_emit_start) : -1) 436 437 /* whether trie related optimizations are enabled */ 438 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 439 #define TRIE_STUDY_OPT 440 #define FULL_TRIE_STUDY 441 #define TRIE_STCLASS 442 #endif 443 444 /* About the term "restudy" and the var "restudied" and the defines 445 * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to 446 * doing multiple study_chunk() calls over the same set of opcodes for* the 447 * purpose of enhanced TRIE optimizations. 448 * 449 * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal 450 * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled 451 * which then causes the Perl_re_op_compile() to then call the optimizer 452 * S_study_chunk() a second time to perform additional optimizations, 453 * including the aho_corasick startclass optimization. 454 * This additional pass will only happen once, which is managed by the 455 * 'restudied' variable in Perl_re_op_compile(). 456 * 457 * When this second pass is under way the flags passed into study_chunk() will 458 * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down 459 * to any recursive calls to S_study_chunk(). 460 * 461 * IMPORTANT: Any logic in study_chunk() that emits warnings should check that 462 * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may 463 * be produced twice. 464 * 465 * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and 466 * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details. 
467 */ 468 469 470 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] 471 #define PBITVAL(paren) (1 << ((paren) & 7)) 472 #define PAREN_OFFSET(depth) \ 473 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes) 474 #define PAREN_TEST(depth, paren) \ 475 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren)) 476 #define PAREN_SET(depth, paren) \ 477 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren)) 478 #define PAREN_UNSET(depth, paren) \ 479 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren)) 480 481 #define REQUIRE_UTF8(flagp) STMT_START { \ 482 if (!UTF) { \ 483 *flagp = RESTART_PARSE|NEED_UTF8; \ 484 return 0; \ 485 } \ 486 } STMT_END 487 488 /* /u is to be chosen if we are supposed to use Unicode rules, or if the 489 * pattern is in UTF-8. This latter condition is in case the outermost rules 490 * are locale. See GH #17278 */ 491 #define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF) 492 493 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is 494 * a flag that indicates we need to override /d with /u as a result of 495 * something in the pattern. It should only be used in regards to calling 496 * set_regex_charset() or get_regex_charset() */ 497 #define REQUIRE_UNI_RULES(flagp, restart_retval) \ 498 STMT_START { \ 499 if (DEPENDS_SEMANTICS) { \ 500 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ 501 RExC_uni_semantics = 1; \ 502 if (RExC_seen_d_op && LIKELY(! 
IN_PARENS_PASS)) { \ 503 /* No need to restart the parse if we haven't seen \ 504 * anything that differs between /u and /d, and no need \ 505 * to restart immediately if we're going to reparse \ 506 * anyway to count parens */ \ 507 *flagp |= RESTART_PARSE; \ 508 return restart_retval; \ 509 } \ 510 } \ 511 } STMT_END 512 513 #define REQUIRE_BRANCHJ(flagp, restart_retval) \ 514 STMT_START { \ 515 RExC_use_BRANCHJ = 1; \ 516 *flagp |= RESTART_PARSE; \ 517 return restart_retval; \ 518 } STMT_END 519 520 /* Until we have completed the parse, we leave RExC_total_parens at 0 or 521 * less. After that, it must always be positive, because the whole re is 522 * considered to be surrounded by virtual parens. Setting it to negative 523 * indicates there is some construct that needs to know the actual number of 524 * parens to be properly handled. And that means an extra pass will be 525 * required after we've counted them all */ 526 #define ALL_PARENS_COUNTED (RExC_total_parens > 0) 527 #define REQUIRE_PARENS_PASS \ 528 STMT_START { /* No-op if have completed a pass */ \ 529 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \ 530 } STMT_END 531 #define IN_PARENS_PASS (RExC_total_parens < 0) 532 533 534 /* This is used to return failure (zero) early from the calling function if 535 * various flags in 'flags' are set. Two flags always cause a return: 536 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any 537 * additional flags that should cause a return; 0 if none. If the return will 538 * be done, '*flagp' is first set to be all of the flags that caused the 539 * return. 
*/ 540 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \ 541 STMT_START { \ 542 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \ 543 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \ 544 return 0; \ 545 } \ 546 } STMT_END 547 548 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE)) 549 550 #define RETURN_FAIL_ON_RESTART(flags,flagp) \ 551 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0) 552 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \ 553 if (MUST_RESTART(*(flagp))) return 0 554 555 /* This converts the named class defined in regcomp.h to its equivalent class 556 * number defined in handy.h. */ 557 #define namedclass_to_classnum(class) ((int) ((class) / 2)) 558 #define classnum_to_namedclass(classnum) ((classnum) * 2) 559 560 #define _invlist_union_complement_2nd(a, b, output) \ 561 _invlist_union_maybe_complement_2nd(a, b, TRUE, output) 562 #define _invlist_intersection_complement_2nd(a, b, output) \ 563 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) 564 565 /* We add a marker if we are deferring expansion of a property that is both 566 * 1) potentiallly user-defined; and 567 * 2) could also be an official Unicode property. 568 * 569 * Without this marker, any deferred expansion can only be for a user-defined 570 * one. This marker shouldn't conflict with any that could be in a legal name, 571 * and is appended to its name to indicate this. There is a string and 572 * character form */ 573 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~" 574 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~' 575 576 /* What is infinity for optimization purposes */ 577 #define OPTIMIZE_INFTY SSize_t_MAX 578 579 /* About scan_data_t. 580 581 During optimisation we recurse through the regexp program performing 582 various inplace (keyhole style) optimisations. In addition study_chunk 583 and scan_commit populate this data structure with information about 584 what strings MUST appear in the pattern. 
We look for the longest 585 string that must appear at a fixed location, and we look for the 586 longest string that may appear at a floating location. So for instance 587 in the pattern: 588 589 /FOO[xX]A.*B[xX]BAR/ 590 591 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating 592 strings (because they follow a .* construct). study_chunk will identify 593 both FOO and BAR as being the longest fixed and floating strings respectively. 594 595 The strings can be composites, for instance 596 597 /(f)(o)(o)/ 598 599 will result in a composite fixed substring 'foo'. 600 601 For each string some basic information is maintained: 602 603 - min_offset 604 This is the position the string must appear at, or not before. 605 It also implicitly (when combined with minlenp) tells us how many 606 characters must match before the string we are searching for. 607 Likewise when combined with minlenp and the length of the string it 608 tells us how many characters must appear after the string we have 609 found. 610 611 - max_offset 612 Only used for floating strings. This is the rightmost point that 613 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the 614 string can occur infinitely far to the right. 615 For fixed strings, it is equal to min_offset. 616 617 - minlenp 618 A pointer to the minimum number of characters of the pattern that the 619 string was found inside. This is important as in the case of positive 620 lookahead or positive lookbehind we can have multiple patterns 621 involved. Consider 622 623 /(?=FOO).*F/ 624 625 The minimum length of the pattern overall is 3, the minimum length 626 of the lookahead part is 3, but the minimum length of the part that 627 will actually match is 1. So 'FOO's minimum length is 3, but the 628 minimum length for the F is 1. This is important as the minimum length 629 is used to determine offsets in front of and behind the string being 630 looked for. 
Since strings can be composites this is the length of the
    pattern at the time it was committed with a scan_commit. Note that
    the length is calculated by study_chunk, so that the minimum lengths
    are not known until the full pattern has been compiled, thus the
    pointer to the value.

  - lookbehind

    In the case of lookbehind the string being searched for can be
    offset past the start point of the final matching string.
    If this value was just blithely removed from the min_offset it would
    invalidate some of the calculations for how many chars must match
    before or after (as they are derived from min_offset and minlen and
    the length of the string being searched for).
    When the final pattern is compiled and the data is moved from the
    scan_data_t structure into the regexp structure the information
    about lookbehind is factored in, with the information that would
    have been lost precalculated in the end_shift field for the
    associated string.

  The fields pos_min and pos_delta are used to store the minimum offset
  and the delta to the maximum offset at the current point in the pattern.

*/

/* Per-substring data; scan_data_t holds two of these, one for the longest
 * fixed substring and one for the longest floating substring. */
struct scan_data_substrs {
    SV      *str;       /* longest substring found in pattern */
    SSize_t min_offset; /* earliest point in string it can appear */
    SSize_t max_offset; /* latest point in string it can appear */
    SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
    SSize_t lookbehind; /* is the pos of the string modified by LB */
    I32 flags;          /* per substring SF_* and SCF_* flags */
};

/* this is typedef'ed in perl.h */
struct scan_data_t {
    /*I32 len_min;      unused */
    /*I32 len_delta;    unused */
    SSize_t pos_min;    /* minimum offset at the current point in the
                           pattern */
    SSize_t pos_delta;  /* delta to the maximum offset at the current point
                           in the pattern */
    SV *last_found;
    SSize_t last_end;       /* min value, <0 unless valid. */
    SSize_t last_start_min;
    SSize_t last_start_max;
    U8      cur_is_floating; /* whether the last_* values should be set as
                              * the next fixed (0) or floating (1)
                              * substring */

    /* [0] is longest fixed substring so far, [1] is longest float so far */
    struct scan_data_substrs  substrs[2];

    I32 flags;             /* common SF_* and SCF_* flags */
    I32 whilem_c;
    SSize_t *last_closep;
    regnode **last_close_opp; /* pointer to pointer to last CLOSE regop
                                 seen. DO NOT DEREFERENCE the regnode
                                 pointer - the op may have been optimized
                                 away */
    regnode_ssc *start_class;
};

/*
 * Forward declarations for pregcomp()'s friends.
 */

/* A scan_data_t with every member initialized to 0 or NULL.  The
 * initializer is positional, so it must be kept in step with the member
 * order of scan_data_t and scan_data_substrs above.
 *
 * NOTE(review): the "Forward declarations" heading above does not describe
 * this object, which is a definition.  Being 'static const', each file that
 * includes this header gets its own copy -- confirm that is intended. */
static const scan_data_t zero_scan_data = {
    0, 0, NULL, 0, 0, 0, 0,
    {
        { NULL, 0, 0, 0, 0, 0 },
        { NULL, 0, 0, 0, 0, 0 },
    },
    0, 0, NULL, NULL, NULL
};

/* study flags */

#define SF_BEFORE_SEOL          0x0001
#define SF_BEFORE_MEOL          0x0002
#define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)

#define SF_IS_INF               0x0040
#define SF_HAS_PAR              0x0080
#define SF_IN_PAR               0x0100
#define SF_HAS_EVAL             0x0200


/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
 * longest substring in the pattern. When it is not set the optimiser keeps
 * track of position, but does not keep track of the actual strings seen,
 *
 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
 * /foo/i will not.
 *
 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
 * turned off because of the alternation (BRANCH).
*/ 726 #define SCF_DO_SUBSTR 0x0400 727 728 #define SCF_DO_STCLASS_AND 0x0800 729 #define SCF_DO_STCLASS_OR 0x1000 730 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) 731 #define SCF_WHILEM_VISITED_POS 0x2000 732 733 #define SCF_TRIE_RESTUDY 0x4000 /* Need to do restudy in study_chunk()? 734 Search for "restudy" in this file 735 to find a detailed explanation.*/ 736 #define SCF_SEEN_ACCEPT 0x8000 737 #define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now? 738 Search for "restudy" in this file 739 to find a detailed explanation. */ 740 #define SCF_IN_DEFINE 0x20000 741 742 743 744 #define UTF cBOOL(RExC_utf8) 745 746 /* The enums for all these are ordered so things work out correctly */ 747 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) 748 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ 749 == REGEX_DEPENDS_CHARSET) 750 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) 751 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ 752 >= REGEX_UNICODE_CHARSET) 753 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ 754 == REGEX_ASCII_RESTRICTED_CHARSET) 755 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ 756 >= REGEX_ASCII_RESTRICTED_CHARSET) 757 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ 758 == REGEX_ASCII_MORE_RESTRICTED_CHARSET) 759 760 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) 761 762 /* For programs that want to be strictly Unicode compatible by dying if any 763 * attempt is made to match a non-Unicode code point against a Unicode 764 * property. */ 765 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) 766 767 #define OOB_NAMEDCLASS -1 768 769 /* There is no code point that is out-of-bounds, so this is problematic. But 770 * its only current use is to initialize a variable that is always set before 771 * looked at. */ 772 #define OOB_UNICODE 0xDEADBEEF 773 774 #define CHR_SVLEN(sv) (UTF ? 
sv_len_utf8(sv) : SvCUR(sv)) 775 776 777 /* length of regex to show in messages that don't mark a position within */ 778 #define RegexLengthToShowInErrorMessages 127 779 780 /* 781 * If MARKER[12] are adjusted, be sure to adjust the constants at the top 782 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in 783 * op/pragma/warn/regcomp. 784 */ 785 #define MARKER1 "<-- HERE" /* marker as it appears in the description */ 786 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ 787 788 #define REPORT_LOCATION " in regex; marked by " MARKER1 \ 789 " in m/%" UTF8f MARKER2 "%" UTF8f "/" 790 791 /* The code in this file in places uses one level of recursion with parsing 792 * rebased to an alternate string constructed by us in memory. This can take 793 * the form of something that is completely different from the input, or 794 * something that uses the input as part of the alternate. In the first case, 795 * there should be no possibility of an error, as we are in complete control of 796 * the alternate string. But in the second case we don't completely control 797 * the input portion, so there may be errors in that. Here's an example: 798 * /[abc\x{DF}def]/ui 799 * is handled specially because \x{df} folds to a sequence of more than one 800 * character: 'ss'. What is done is to create and parse an alternate string, 801 * which looks like this: 802 * /(?:\x{DF}|[abc\x{DF}def])/ui 803 * where it uses the input unchanged in the middle of something it constructs, 804 * which is a branch for the DF outside the character class, and clustering 805 * parens around the whole thing. (It knows enough to skip the DF inside the 806 * class while in this substitute parse.) 'abc' and 'def' may have errors that 807 * need to be reported. 
The general situation looks like this: 808 * 809 * |<------- identical ------>| 810 * sI tI xI eI 811 * Input: --------------------------------------------------------------- 812 * Constructed: --------------------------------------------------- 813 * sC tC xC eC EC 814 * |<------- identical ------>| 815 * 816 * sI..eI is the portion of the input pattern we are concerned with here. 817 * sC..EC is the constructed substitute parse string. 818 * sC..tC is constructed by us 819 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI. 820 * In the diagram, these are vertically aligned. 821 * eC..EC is also constructed by us. 822 * xC is the position in the substitute parse string where we found a 823 * problem. 824 * xI is the position in the original pattern corresponding to xC. 825 * 826 * We want to display a message showing the real input string. Thus we need to 827 * translate from xC to xI. We know that xC >= tC, since the portion of the 828 * string sC..tC has been constructed by us, and so shouldn't have errors. We 829 * get: 830 * xI = tI + (xC - tC) 831 * 832 * When the substitute parse is constructed, the code needs to set: 833 * RExC_start (sC) 834 * RExC_end (eC) 835 * RExC_copy_start_in_input (tI) 836 * RExC_copy_start_in_constructed (tC) 837 * and restore them when done. 838 * 839 * During normal processing of the input pattern, both 840 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to 841 * sI, so that xC equals xI. 842 */ 843 844 #define sI RExC_precomp 845 #define eI RExC_precomp_end 846 #define sC RExC_start 847 #define eC RExC_end 848 #define tI RExC_copy_start_in_input 849 #define tC RExC_copy_start_in_constructed 850 #define xI(xC) (tI + (xC - tC)) 851 #define xI_offset(xC) (xI(xC) - sI) 852 853 #define REPORT_LOCATION_ARGS(xC) \ 854 UTF8fARG(UTF, \ 855 (xI(xC) > eI) /* Don't run off end */ \ 856 ? eI - sI /* Length before the <--HERE */ \ 857 : ((xI_offset(xC) >= 0) \ 858 ? 
xI_offset(xC) \ 859 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \ 860 IVdf " trying to output message for " \ 861 " pattern %.*s", \ 862 __FILE__, __LINE__, (IV) xI_offset(xC), \ 863 ((int) (eC - sC)), sC), 0)), \ 864 sI), /* The input pattern printed up to the <--HERE */ \ 865 UTF8fARG(UTF, \ 866 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \ 867 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */ 868 869 /* Used to point after bad bytes for an error message, but avoid skipping 870 * past a nul byte. */ 871 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1) 872 873 /* Set up to clean up after our imminent demise */ 874 #define PREPARE_TO_DIE \ 875 STMT_START { \ 876 if (RExC_rx_sv) \ 877 SAVEFREESV(RExC_rx_sv); \ 878 if (RExC_open_parens) \ 879 SAVEFREEPV(RExC_open_parens); \ 880 if (RExC_close_parens) \ 881 SAVEFREEPV(RExC_close_parens); \ 882 if (RExC_logical_to_parno) \ 883 SAVEFREEPV(RExC_logical_to_parno); \ 884 if (RExC_parno_to_logical) \ 885 SAVEFREEPV(RExC_parno_to_logical); \ 886 } STMT_END 887 888 /* 889 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 890 * arg. Show regex, up to a maximum length. If it's too long, chop and add 891 * "...". 892 */ 893 #define _FAIL(code) STMT_START { \ 894 const char *ellipses = ""; \ 895 IV len = RExC_precomp_end - RExC_precomp; \ 896 \ 897 if (len > RegexLengthToShowInErrorMessages) { \ 898 /* chop 10 shorter than the max, to ensure meaning of "..." 
*/ \ 899 len = RegexLengthToShowInErrorMessages - 10; \ 900 ellipses = "..."; \ 901 } \ 902 code; \ 903 } STMT_END 904 905 #define FAIL(msg) _FAIL( \ 906 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \ 907 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) 908 909 #define FAIL2(msg,arg) _FAIL( \ 910 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ 911 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) 912 913 #define FAIL3(msg,arg1,arg2) _FAIL( \ 914 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ 915 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses)) 916 917 /* 918 * Simple_vFAIL -- like FAIL, but marks the current location in the scan 919 */ 920 #define Simple_vFAIL(m) STMT_START { \ 921 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ 922 m, REPORT_LOCATION_ARGS(RExC_parse)); \ 923 } STMT_END 924 925 /* 926 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() 927 */ 928 #define vFAIL(m) STMT_START { \ 929 Simple_vFAIL(m); \ 930 } STMT_END 931 932 /* 933 * Like Simple_vFAIL(), but accepts two arguments. 934 */ 935 #define Simple_vFAIL2(m,a1) STMT_START { \ 936 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ 937 REPORT_LOCATION_ARGS(RExC_parse)); \ 938 } STMT_END 939 940 /* 941 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). 942 */ 943 #define vFAIL2(m,a1) STMT_START { \ 944 Simple_vFAIL2(m, a1); \ 945 } STMT_END 946 947 948 /* 949 * Like Simple_vFAIL(), but accepts three arguments. 950 */ 951 #define Simple_vFAIL3(m, a1, a2) STMT_START { \ 952 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ 953 REPORT_LOCATION_ARGS(RExC_parse)); \ 954 } STMT_END 955 956 /* 957 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). 958 */ 959 #define vFAIL3(m,a1,a2) STMT_START { \ 960 Simple_vFAIL3(m, a1, a2); \ 961 } STMT_END 962 963 /* 964 * Like Simple_vFAIL(), but accepts four arguments. 
965 */ 966 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ 967 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \ 968 REPORT_LOCATION_ARGS(RExC_parse)); \ 969 } STMT_END 970 971 #define vFAIL4(m,a1,a2,a3) STMT_START { \ 972 Simple_vFAIL4(m, a1, a2, a3); \ 973 } STMT_END 974 975 /* A specialized version of vFAIL2 that works with UTF8f */ 976 #define vFAIL2utf8f(m, a1) STMT_START { \ 977 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ 978 REPORT_LOCATION_ARGS(RExC_parse)); \ 979 } STMT_END 980 981 #define vFAIL3utf8f(m, a1, a2) STMT_START { \ 982 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ 983 REPORT_LOCATION_ARGS(RExC_parse)); \ 984 } STMT_END 985 986 /* Setting this to NULL is a signal to not output warnings */ 987 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \ 988 STMT_START { \ 989 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\ 990 RExC_copy_start_in_constructed = NULL; \ 991 } STMT_END 992 #define RESTORE_WARNINGS \ 993 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed 994 995 /* Since a warning can be generated multiple times as the input is reparsed, we 996 * output it the first time we come to that point in the parse, but suppress it 997 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not 998 * generate any warnings */ 999 #define TO_OUTPUT_WARNINGS(loc) \ 1000 ( RExC_copy_start_in_constructed \ 1001 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset) 1002 1003 /* After we've emitted a warning, we save the position in the input so we don't 1004 * output it again */ 1005 #define UPDATE_WARNINGS_LOC(loc) \ 1006 STMT_START { \ 1007 if (TO_OUTPUT_WARNINGS(loc)) { \ 1008 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \ 1009 - RExC_precomp; \ 1010 } \ 1011 } STMT_END 1012 1013 /* 'warns' is the output of the packWARNx macro used in 'code' */ 1014 #define _WARN_HELPER(loc, warns, code) \ 1015 STMT_START { \ 1016 if (! 
RExC_copy_start_in_constructed) { \ 1017 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \ 1018 " expected at '%s'", \ 1019 __FILE__, __LINE__, loc); \ 1020 } \ 1021 if (TO_OUTPUT_WARNINGS(loc)) { \ 1022 code; \ 1023 UPDATE_WARNINGS_LOC(loc); \ 1024 } \ 1025 } STMT_END 1026 1027 /* m is not necessarily a "literal string", in this macro */ 1028 #define warn_non_literal_string(loc, packed_warn, m) \ 1029 _WARN_HELPER(loc, packed_warn, \ 1030 Perl_warner(aTHX_ packed_warn, \ 1031 "%s" REPORT_LOCATION, \ 1032 m, REPORT_LOCATION_ARGS(loc))) 1033 #define reg_warn_non_literal_string(loc, m) \ 1034 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m) 1035 1036 #define ckWARN2_non_literal_string(loc, packwarn, m, a1) \ 1037 STMT_START { \ 1038 char * format; \ 1039 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\ 1040 Newx(format, format_size, char); \ 1041 my_strlcpy(format, m, format_size); \ 1042 my_strlcat(format, REPORT_LOCATION, format_size); \ 1043 SAVEFREEPV(format); \ 1044 _WARN_HELPER(loc, packwarn, \ 1045 Perl_ck_warner(aTHX_ packwarn, \ 1046 format, \ 1047 a1, REPORT_LOCATION_ARGS(loc))); \ 1048 } STMT_END 1049 1050 #define ckWARNreg(loc,m) \ 1051 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1052 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 1053 m REPORT_LOCATION, \ 1054 REPORT_LOCATION_ARGS(loc))) 1055 1056 #define vWARN(loc, m) \ 1057 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1058 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 1059 m REPORT_LOCATION, \ 1060 REPORT_LOCATION_ARGS(loc))) \ 1061 1062 #define vWARN_dep(loc,category,m) \ 1063 _WARN_HELPER(loc, packWARN(category), \ 1064 Perl_warner(aTHX_ packWARN(category), \ 1065 m REPORT_LOCATION, \ 1066 REPORT_LOCATION_ARGS(loc))) 1067 1068 #define ckWARNdep(loc,category,m) \ 1069 _WARN_HELPER(loc, packWARN(category), \ 1070 Perl_ck_warner_d(aTHX_ packWARN(category), \ 1071 m REPORT_LOCATION, \ 1072 REPORT_LOCATION_ARGS(loc))) 1073 1074 #define ckWARNregdep(loc,category,m) \ 1075 
_WARN_HELPER(loc, packWARN2(category, WARN_REGEXP), \ 1076 Perl_ck_warner_d(aTHX_ packWARN2(category, \ 1077 WARN_REGEXP), \ 1078 m REPORT_LOCATION, \ 1079 REPORT_LOCATION_ARGS(loc))) 1080 1081 #define ckWARN2reg_d(loc,m, a1) \ 1082 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1083 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ 1084 m REPORT_LOCATION, \ 1085 a1, REPORT_LOCATION_ARGS(loc))) 1086 1087 #define ckWARN2reg(loc, m, a1) \ 1088 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1089 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 1090 m REPORT_LOCATION, \ 1091 a1, REPORT_LOCATION_ARGS(loc))) 1092 1093 #define vWARN3(loc, m, a1, a2) \ 1094 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1095 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 1096 m REPORT_LOCATION, \ 1097 a1, a2, REPORT_LOCATION_ARGS(loc))) 1098 1099 #define ckWARN3reg(loc, m, a1, a2) \ 1100 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1101 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 1102 m REPORT_LOCATION, \ 1103 a1, a2, \ 1104 REPORT_LOCATION_ARGS(loc))) 1105 1106 #define vWARN4(loc, m, a1, a2, a3) \ 1107 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1108 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 1109 m REPORT_LOCATION, \ 1110 a1, a2, a3, \ 1111 REPORT_LOCATION_ARGS(loc))) 1112 1113 #define ckWARN4reg(loc, m, a1, a2, a3) \ 1114 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1115 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 1116 m REPORT_LOCATION, \ 1117 a1, a2, a3, \ 1118 REPORT_LOCATION_ARGS(loc))) 1119 1120 #define vWARN5(loc, m, a1, a2, a3, a4) \ 1121 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1122 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 1123 m REPORT_LOCATION, \ 1124 a1, a2, a3, a4, \ 1125 REPORT_LOCATION_ARGS(loc))) 1126 1127 #define ckWARNexperimental(loc, class, m) \ 1128 STMT_START { \ 1129 if (! 
RExC_warned_ ## class) { /* warn once per compilation */ \ 1130 RExC_warned_ ## class = 1; \ 1131 _WARN_HELPER(loc, packWARN(class), \ 1132 Perl_ck_warner_d(aTHX_ packWARN(class), \ 1133 m REPORT_LOCATION, \ 1134 REPORT_LOCATION_ARGS(loc)));\ 1135 } \ 1136 } STMT_END 1137 1138 #define ckWARNexperimental_with_arg(loc, class, m, arg) \ 1139 STMT_START { \ 1140 if (! RExC_warned_ ## class) { /* warn once per compilation */ \ 1141 RExC_warned_ ## class = 1; \ 1142 _WARN_HELPER(loc, packWARN(class), \ 1143 Perl_ck_warner_d(aTHX_ packWARN(class), \ 1144 m REPORT_LOCATION, \ 1145 arg, REPORT_LOCATION_ARGS(loc)));\ 1146 } \ 1147 } STMT_END 1148 1149 /* Convert between a pointer to a node and its offset from the beginning of the 1150 * program */ 1151 #define REGNODE_p(offset) (RExC_emit_start + (offset)) 1152 #define REGNODE_OFFSET(node) (__ASSERT_((node) >= RExC_emit_start) \ 1153 (SSize_t) ((node) - RExC_emit_start)) 1154 1155 #define ProgLen(ri) ri->proglen 1156 #define SetProgLen(ri,x) ri->proglen = x 1157 1158 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 1159 #define EXPERIMENTAL_INPLACESCAN 1160 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ 1161 1162 #define DEBUG_RExC_seen() \ 1163 DEBUG_OPTIMISE_MORE_r({ \ 1164 Perl_re_printf( aTHX_ "RExC_seen: "); \ 1165 \ 1166 if (RExC_seen & REG_ZERO_LEN_SEEN) \ 1167 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \ 1168 \ 1169 if (RExC_seen & REG_LOOKBEHIND_SEEN) \ 1170 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \ 1171 \ 1172 if (RExC_seen & REG_GPOS_SEEN) \ 1173 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \ 1174 \ 1175 if (RExC_seen & REG_RECURSE_SEEN) \ 1176 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \ 1177 \ 1178 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ 1179 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \ 1180 \ 1181 if (RExC_seen & REG_VERBARG_SEEN) \ 1182 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \ 1183 \ 1184 if (RExC_seen & REG_CUTGROUP_SEEN) \ 1185 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN 
"); \ 1186 \ 1187 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ 1188 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \ 1189 \ 1190 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ 1191 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \ 1192 \ 1193 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ 1194 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \ 1195 \ 1196 if (RExC_seen & REG_PESSIMIZE_SEEN) \ 1197 Perl_re_printf( aTHX_ "REG_PESSIMIZE_SEEN "); \ 1198 \ 1199 Perl_re_printf( aTHX_ "\n"); \ 1200 }); 1201 1202 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \ 1203 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag) 1204 1205 1206 #ifdef DEBUGGING 1207 # define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \ 1208 debug_studydata(where, data, depth, is_inf, min, stopmin, delta) 1209 1210 # define DEBUG_PEEP(str, scan, depth, flags) \ 1211 debug_peep(str, pRExC_state, scan, depth, flags) 1212 #else 1213 # define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP 1214 # define DEBUG_PEEP(str, scan, depth, flags) NOOP 1215 #endif 1216 1217 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) 1218 #ifdef DEBUGGING 1219 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) 1220 #else 1221 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) 1222 #endif 1223 1224 #define MADE_TRIE 1 1225 #define MADE_JUMP_TRIE 2 1226 #define MADE_EXACT_TRIE 4 1227 1228 #define INVLIST_INDEX 0 1229 #define ONLY_LOCALE_MATCHES_INDEX 1 1230 #define DEFERRED_USER_DEFINED_INDEX 2 1231 1232 /* These two functions currently do the exact same thing */ 1233 #define ssc_init_zero ssc_init 1234 1235 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) 1236 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) 1237 1238 #ifdef DEBUGGING 1239 #define REGNODE_GUTS(state,op,extra_size) \ 1240 regnode_guts_debug(state,op,extra_size) 1241 #else 1242 #define REGNODE_GUTS(state,op,extra_size) \ 1243 regnode_guts(state,extra_size) 1244 
#endif

/* If 'optstart' is set, report (under DEBUG_OPTIMISE_r) how many nodes lie
 * between it and 'node', then reset it to NULL.  Both names are taken from
 * the calling scope.  The test sits inside STMT_START/STMT_END so that an
 * 'else' following the call site cannot attach to it. */
#define CLEAR_OPTSTART                                                       \
    STMT_START {                                                             \
        if (optstart) {                                                      \
            DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                           \
                          " (%" IVdf " nodes)\n", (IV)(node - optstart)));   \
            optstart=NULL;                                                   \
        }                                                                    \
    } STMT_END

/* Run CLEAR_OPTSTART, then continue the dump over the nodes from 'b' to 'e',
 * storing the result of dumpuntil() in 'node'.  The other names used (r,
 * start, last, sv, indent, depth) come from the calling scope.
 *
 * The two statements are grouped in a block so that both are governed by any
 * enclosing unbraced 'if'.  Plain braces are used rather than
 * STMT_START/STMT_END so that the expansion stays valid whether or not a
 * call site supplies its own ';', as was the case before. */
#define DUMPUNTIL(b,e)                                                       \
    {                                                                        \
        CLEAR_OPTSTART;                                                      \
        node = dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);          \
    }

/* Set the NEXT_OFF of the node at offset 'ret' to the combined size of the
 * types 't1' and 't2', measured in units of sizeof(regnode) */
#define REGNODE_STEP_OVER(ret,t1,t2) \
    NEXT_OFF(REGNODE_p(ret)) = ((sizeof(t1)+sizeof(t2))/sizeof(regnode))

#endif /* REGCOMP_INTERNAL_H */
