1*f2a19305Safresh1 #ifndef REGCOMP_INTERNAL_H 2*f2a19305Safresh1 #define REGCOMP_INTERNAL_H 3*f2a19305Safresh1 #ifndef STATIC 4*f2a19305Safresh1 #define STATIC static 5*f2a19305Safresh1 #endif 6*f2a19305Safresh1 #ifndef RE_OPTIMIZE_CURLYX_TO_CURLYM 7*f2a19305Safresh1 #define RE_OPTIMIZE_CURLYX_TO_CURLYM 1 8*f2a19305Safresh1 #endif 9*f2a19305Safresh1 #ifndef RE_OPTIMIZE_CURLYX_TO_CURLYN 10*f2a19305Safresh1 #define RE_OPTIMIZE_CURLYX_TO_CURLYN 1 11*f2a19305Safresh1 #endif 12*f2a19305Safresh1 13*f2a19305Safresh1 /* this is a chain of data about sub patterns we are processing that 14*f2a19305Safresh1 need to be handled separately/specially in study_chunk. Its so 15*f2a19305Safresh1 we can simulate recursion without losing state. */ 16*f2a19305Safresh1 struct scan_frame; 17*f2a19305Safresh1 typedef struct scan_frame { 18*f2a19305Safresh1 regnode *last_regnode; /* last node to process in this frame */ 19*f2a19305Safresh1 regnode *next_regnode; /* next node to process when last is reached */ 20*f2a19305Safresh1 U32 prev_recursed_depth; 21*f2a19305Safresh1 I32 stopparen; /* what stopparen do we use */ 22*f2a19305Safresh1 bool in_gosub; /* this or an outer frame is for GOSUB */ 23*f2a19305Safresh1 24*f2a19305Safresh1 struct scan_frame *this_prev_frame; /* this previous frame */ 25*f2a19305Safresh1 struct scan_frame *prev_frame; /* previous frame */ 26*f2a19305Safresh1 struct scan_frame *next_frame; /* next frame */ 27*f2a19305Safresh1 } scan_frame; 28*f2a19305Safresh1 29*f2a19305Safresh1 /* Certain characters are output as a sequence with the first being a 30*f2a19305Safresh1 * backslash. */ 31*f2a19305Safresh1 #define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c) 32*f2a19305Safresh1 33*f2a19305Safresh1 34*f2a19305Safresh1 struct RExC_state_t { 35*f2a19305Safresh1 U32 flags; /* RXf_* are we folding, multilining? */ 36*f2a19305Safresh1 U32 pm_flags; /* PMf_* stuff from the calling PMOP */ 37*f2a19305Safresh1 char *precomp; /* uncompiled string. 
*/ 38*f2a19305Safresh1 char *precomp_end; /* pointer to end of uncompiled string. */ 39*f2a19305Safresh1 REGEXP *rx_sv; /* The SV that is the regexp. */ 40*f2a19305Safresh1 regexp *rx; /* perl core regexp structure */ 41*f2a19305Safresh1 regexp_internal *rxi; /* internal data for regexp object 42*f2a19305Safresh1 pprivate field */ 43*f2a19305Safresh1 char *start; /* Start of input for compile */ 44*f2a19305Safresh1 char *end; /* End of input for compile */ 45*f2a19305Safresh1 char *parse; /* Input-scan pointer. */ 46*f2a19305Safresh1 char *copy_start; /* start of copy of input within 47*f2a19305Safresh1 constructed parse string */ 48*f2a19305Safresh1 char *save_copy_start; /* Provides one level of saving 49*f2a19305Safresh1 and restoring 'copy_start' */ 50*f2a19305Safresh1 char *copy_start_in_input; /* Position in input string 51*f2a19305Safresh1 corresponding to copy_start */ 52*f2a19305Safresh1 SSize_t whilem_seen; /* number of WHILEM in this expr */ 53*f2a19305Safresh1 regnode *emit_start; /* Start of emitted-code area */ 54*f2a19305Safresh1 regnode_offset emit; /* Code-emit pointer */ 55*f2a19305Safresh1 I32 naughty; /* How bad is this pattern? */ 56*f2a19305Safresh1 I32 sawback; /* Did we see \1, ...? */ 57*f2a19305Safresh1 SSize_t size; /* Number of regnode equivalents in 58*f2a19305Safresh1 pattern */ 59*f2a19305Safresh1 Size_t sets_depth; /* Counts recursion depth of already- 60*f2a19305Safresh1 compiled regex set patterns */ 61*f2a19305Safresh1 U32 seen; 62*f2a19305Safresh1 63*f2a19305Safresh1 I32 parens_buf_size; /* #slots malloced open/close_parens */ 64*f2a19305Safresh1 regnode_offset *open_parens; /* offsets to open parens */ 65*f2a19305Safresh1 regnode_offset *close_parens; /* offsets to close parens */ 66*f2a19305Safresh1 HV *paren_names; /* Paren names */ 67*f2a19305Safresh1 68*f2a19305Safresh1 /* position beyond 'precomp' of the warning message furthest away from 69*f2a19305Safresh1 * 'precomp'. 
During the parse, no warnings are raised for any problems 70*f2a19305Safresh1 * earlier in the parse than this position. This works if warnings are 71*f2a19305Safresh1 * raised the first time a given spot is parsed, and if only one 72*f2a19305Safresh1 * independent warning is raised for any given spot */ 73*f2a19305Safresh1 Size_t latest_warn_offset; 74*f2a19305Safresh1 75*f2a19305Safresh1 /* Branch reset /(?|...|...)/ gives us two concepts of capture buffer id. 76*f2a19305Safresh1 * "Logical Parno" is the user visible view with branch reset taken into 77*f2a19305Safresh1 * account. "Parno" (or physical parno) is the actual capture buffers in 78*f2a19305Safresh1 * the pattern *NOT* taking into account branch reset. We also maintain 79*f2a19305Safresh1 * a map of "next" pointers which allow us to skip to the next physical 80*f2a19305Safresh1 * capture buffer with the same logical id, with 0 representing "none". 81*f2a19305Safresh1 * 82*f2a19305Safresh1 * As we compile we keep track of the two different counts using the 83*f2a19305Safresh1 * 'logical_npar' and 'npar' members, and we keep track of the upper bound 84*f2a19305Safresh1 * of both in 'total_par' and 'logical_total_par', we also populate 85*f2a19305Safresh1 * the 'logical_to_parno' map, which gives us the first physical parno 86*f2a19305Safresh1 * for a given logical parno, and the `parno_to_logical` array which gives 87*f2a19305Safresh1 * us the logical id for each physical parno. When compilation is 88*f2a19305Safresh1 * completed we construct the 'parno_to_logical_next' array from the 89*f2a19305Safresh1 * 'parno_to_logical' array. (We do not bother constructing it during 90*f2a19305Safresh1 * compilation as we do not need it, and we can construct it in O(N) time 91*f2a19305Safresh1 * once we are done, but would need more complicated logic during the 92*f2a19305Safresh1 * compile, because we want the next pointers to go from smallest to 93*f2a19305Safresh1 * largest, eg, left to right.) 
94*f2a19305Safresh1 * 95*f2a19305Safresh1 * Logical: $1 $2 $3 $4 $2 $3 $2 $5 96*f2a19305Safresh1 * Physical: 1 2 3 4 5 6 7 8 97*f2a19305Safresh1 * Next: 0 5 6 0 7 0 0 0 98*f2a19305Safresh1 * Pattern /(a) (?| (b) (c) (d) | (e) (f) | (g) ) (h)/ 99*f2a19305Safresh1 * 100*f2a19305Safresh1 * As much as possible the internals use and store the physical id of 101*f2a19305Safresh1 * of capture buffers. We decode the physical to the logical only when 102*f2a19305Safresh1 * we need to, for instance when someone use $2. 103*f2a19305Safresh1 * 104*f2a19305Safresh1 * Note that when branch reset is not used logical and physical are the 105*f2a19305Safresh1 * same and the next data would be all zero. So when branch reset is not 106*f2a19305Safresh1 * used we do not need to populate this data into the final regexp. 107*f2a19305Safresh1 * 108*f2a19305Safresh1 */ 109*f2a19305Safresh1 I32 *logical_to_parno; /* logical_parno to parno */ 110*f2a19305Safresh1 I32 *parno_to_logical; /* parno to logical_parno */ 111*f2a19305Safresh1 I32 *parno_to_logical_next; /* parno to next (greater value) 112*f2a19305Safresh1 parno with the same 113*f2a19305Safresh1 logical_parno as parno.*/ 114*f2a19305Safresh1 115*f2a19305Safresh1 I32 npar; /* Capture buffer count so far in the 116*f2a19305Safresh1 parse, (OPEN) plus one. ("par" 0 is 117*f2a19305Safresh1 the whole pattern)*/ 118*f2a19305Safresh1 I32 logical_npar; /* Logical version of npar */ 119*f2a19305Safresh1 I32 total_par; /* During initial parse, is either 0, 120*f2a19305Safresh1 or -1; the latter indicating a 121*f2a19305Safresh1 reparse is needed. After that pass, 122*f2a19305Safresh1 it is what 'npar' became after the 123*f2a19305Safresh1 pass. 
Hence, it being > 0 indicates 124*f2a19305Safresh1 we are in a reparse situation */ 125*f2a19305Safresh1 I32 logical_total_par; /* Logical version to total par */ 126*f2a19305Safresh1 I32 nestroot; /* root parens we are in - used by 127*f2a19305Safresh1 accept */ 128*f2a19305Safresh1 I32 seen_zerolen; 129*f2a19305Safresh1 regnode *end_op; /* END node in program */ 130*f2a19305Safresh1 I32 utf8; /* whether the pattern is utf8 or not */ 131*f2a19305Safresh1 I32 orig_utf8; /* whether the pattern was originally in utf8 */ 132*f2a19305Safresh1 /* XXX use this for future optimisation of case 133*f2a19305Safresh1 * where pattern must be upgraded to utf8. */ 134*f2a19305Safresh1 I32 uni_semantics; /* If a d charset modifier should use unicode 135*f2a19305Safresh1 rules, even if the pattern is not in 136*f2a19305Safresh1 utf8 */ 137*f2a19305Safresh1 138*f2a19305Safresh1 I32 recurse_count; /* Number of recurse regops we have generated */ 139*f2a19305Safresh1 regnode **recurse; /* Recurse regops */ 140*f2a19305Safresh1 U8 *study_chunk_recursed; /* bitmap of which subs we have moved 141*f2a19305Safresh1 through */ 142*f2a19305Safresh1 U32 study_chunk_recursed_bytes; /* bytes in bitmap */ 143*f2a19305Safresh1 I32 in_lookaround; 144*f2a19305Safresh1 I32 contains_locale; 145*f2a19305Safresh1 I32 override_recoding; 146*f2a19305Safresh1 I32 recode_x_to_native; 147*f2a19305Safresh1 I32 in_multi_char_class; 148*f2a19305Safresh1 int code_index; /* next code_blocks[] slot */ 149*f2a19305Safresh1 struct reg_code_blocks *code_blocks;/* positions of literal (?{}) 150*f2a19305Safresh1 within pattern */ 151*f2a19305Safresh1 SSize_t maxlen; /* mininum possible number of chars in string to match */ 152*f2a19305Safresh1 scan_frame *frame_head; 153*f2a19305Safresh1 scan_frame *frame_last; 154*f2a19305Safresh1 U32 frame_count; 155*f2a19305Safresh1 AV *warn_text; 156*f2a19305Safresh1 HV *unlexed_names; 157*f2a19305Safresh1 SV *runtime_code_qr; /* qr with the runtime code blocks */ 
158*f2a19305Safresh1 #ifdef DEBUGGING 159*f2a19305Safresh1 const char *lastparse; 160*f2a19305Safresh1 I32 lastnum; 161*f2a19305Safresh1 U32 study_chunk_recursed_count; 162*f2a19305Safresh1 AV *paren_name_list; /* idx -> name */ 163*f2a19305Safresh1 SV *mysv1; 164*f2a19305Safresh1 SV *mysv2; 165*f2a19305Safresh1 #endif 166*f2a19305Safresh1 bool seen_d_op; 167*f2a19305Safresh1 bool strict; 168*f2a19305Safresh1 bool study_started; 169*f2a19305Safresh1 bool in_script_run; 170*f2a19305Safresh1 bool use_BRANCHJ; 171*f2a19305Safresh1 bool sWARN_EXPERIMENTAL__VLB; 172*f2a19305Safresh1 bool sWARN_EXPERIMENTAL__REGEX_SETS; 173*f2a19305Safresh1 }; 174*f2a19305Safresh1 175*f2a19305Safresh1 #ifdef DEBUGGING 176*f2a19305Safresh1 #define RExC_lastparse (pRExC_state->lastparse) 177*f2a19305Safresh1 #define RExC_lastnum (pRExC_state->lastnum) 178*f2a19305Safresh1 #define RExC_paren_name_list (pRExC_state->paren_name_list) 179*f2a19305Safresh1 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count) 180*f2a19305Safresh1 #define RExC_mysv (pRExC_state->mysv1) 181*f2a19305Safresh1 #define RExC_mysv1 (pRExC_state->mysv1) 182*f2a19305Safresh1 #define RExC_mysv2 (pRExC_state->mysv2) 183*f2a19305Safresh1 #endif 184*f2a19305Safresh1 185*f2a19305Safresh1 #define RExC_flags (pRExC_state->flags) 186*f2a19305Safresh1 #define RExC_pm_flags (pRExC_state->pm_flags) 187*f2a19305Safresh1 #define RExC_precomp (pRExC_state->precomp) 188*f2a19305Safresh1 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input) 189*f2a19305Safresh1 #define RExC_copy_start_in_constructed (pRExC_state->copy_start) 190*f2a19305Safresh1 #define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start) 191*f2a19305Safresh1 #define RExC_precomp_end (pRExC_state->precomp_end) 192*f2a19305Safresh1 #define RExC_rx_sv (pRExC_state->rx_sv) 193*f2a19305Safresh1 #define RExC_rx (pRExC_state->rx) 194*f2a19305Safresh1 #define RExC_rxi (pRExC_state->rxi) 195*f2a19305Safresh1 #define 
RExC_start (pRExC_state->start) 196*f2a19305Safresh1 #define RExC_end (pRExC_state->end) 197*f2a19305Safresh1 #define RExC_parse (pRExC_state->parse) 198*f2a19305Safresh1 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset ) 199*f2a19305Safresh1 #define RExC_whilem_seen (pRExC_state->whilem_seen) 200*f2a19305Safresh1 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs 201*f2a19305Safresh1 under /d from /u ? */ 202*f2a19305Safresh1 203*f2a19305Safresh1 #define RExC_emit (pRExC_state->emit) 204*f2a19305Safresh1 #define RExC_emit_start (pRExC_state->emit_start) 205*f2a19305Safresh1 #define RExC_sawback (pRExC_state->sawback) 206*f2a19305Safresh1 #define RExC_seen (pRExC_state->seen) 207*f2a19305Safresh1 #define RExC_size (pRExC_state->size) 208*f2a19305Safresh1 #define RExC_maxlen (pRExC_state->maxlen) 209*f2a19305Safresh1 #define RExC_logical_npar (pRExC_state->logical_npar) 210*f2a19305Safresh1 #define RExC_logical_total_parens (pRExC_state->logical_total_par) 211*f2a19305Safresh1 #define RExC_logical_to_parno (pRExC_state->logical_to_parno) 212*f2a19305Safresh1 #define RExC_parno_to_logical (pRExC_state->parno_to_logical) 213*f2a19305Safresh1 #define RExC_parno_to_logical_next (pRExC_state->parno_to_logical_next) 214*f2a19305Safresh1 #define RExC_npar (pRExC_state->npar) 215*f2a19305Safresh1 #define RExC_total_parens (pRExC_state->total_par) 216*f2a19305Safresh1 #define RExC_parens_buf_size (pRExC_state->parens_buf_size) 217*f2a19305Safresh1 #define RExC_nestroot (pRExC_state->nestroot) 218*f2a19305Safresh1 #define RExC_seen_zerolen (pRExC_state->seen_zerolen) 219*f2a19305Safresh1 #define RExC_utf8 (pRExC_state->utf8) 220*f2a19305Safresh1 #define RExC_uni_semantics (pRExC_state->uni_semantics) 221*f2a19305Safresh1 #define RExC_orig_utf8 (pRExC_state->orig_utf8) 222*f2a19305Safresh1 #define RExC_open_parens (pRExC_state->open_parens) 223*f2a19305Safresh1 #define RExC_close_parens (pRExC_state->close_parens) 
224*f2a19305Safresh1 #define RExC_end_op (pRExC_state->end_op) 225*f2a19305Safresh1 #define RExC_paren_names (pRExC_state->paren_names) 226*f2a19305Safresh1 #define RExC_recurse (pRExC_state->recurse) 227*f2a19305Safresh1 #define RExC_recurse_count (pRExC_state->recurse_count) 228*f2a19305Safresh1 #define RExC_sets_depth (pRExC_state->sets_depth) 229*f2a19305Safresh1 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) 230*f2a19305Safresh1 #define RExC_study_chunk_recursed_bytes \ 231*f2a19305Safresh1 (pRExC_state->study_chunk_recursed_bytes) 232*f2a19305Safresh1 #define RExC_in_lookaround (pRExC_state->in_lookaround) 233*f2a19305Safresh1 #define RExC_contains_locale (pRExC_state->contains_locale) 234*f2a19305Safresh1 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) 235*f2a19305Safresh1 236*f2a19305Safresh1 #ifdef EBCDIC 237*f2a19305Safresh1 # define SET_recode_x_to_native(x) \ 238*f2a19305Safresh1 STMT_START { RExC_recode_x_to_native = (x); } STMT_END 239*f2a19305Safresh1 #else 240*f2a19305Safresh1 # define SET_recode_x_to_native(x) NOOP 241*f2a19305Safresh1 #endif 242*f2a19305Safresh1 243*f2a19305Safresh1 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) 244*f2a19305Safresh1 #define RExC_frame_head (pRExC_state->frame_head) 245*f2a19305Safresh1 #define RExC_frame_last (pRExC_state->frame_last) 246*f2a19305Safresh1 #define RExC_frame_count (pRExC_state->frame_count) 247*f2a19305Safresh1 #define RExC_strict (pRExC_state->strict) 248*f2a19305Safresh1 #define RExC_study_started (pRExC_state->study_started) 249*f2a19305Safresh1 #define RExC_warn_text (pRExC_state->warn_text) 250*f2a19305Safresh1 #define RExC_in_script_run (pRExC_state->in_script_run) 251*f2a19305Safresh1 #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ) 252*f2a19305Safresh1 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB) 253*f2a19305Safresh1 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS 
(pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS) 254*f2a19305Safresh1 #define RExC_unlexed_names (pRExC_state->unlexed_names) 255*f2a19305Safresh1 256*f2a19305Safresh1 257*f2a19305Safresh1 /***********************************************************************/ 258*f2a19305Safresh1 /* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse 259*f2a19305Safresh1 * 260*f2a19305Safresh1 * All of these macros depend on the above RExC_ accessor macros, which 261*f2a19305Safresh1 * in turns depend on a variable pRExC_state being in scope where they 262*f2a19305Safresh1 * are used. This is the standard regexp parser context variable which is 263*f2a19305Safresh1 * passed into every non-trivial parse function in this file. 264*f2a19305Safresh1 * 265*f2a19305Safresh1 * Note that the UTF macro is itself a wrapper around RExC_utf8, so all 266*f2a19305Safresh1 * of the macros which do not take an argument will operate on the 267*f2a19305Safresh1 * pRExC_state structure *only*. 268*f2a19305Safresh1 * 269*f2a19305Safresh1 * Please do NOT modify RExC_parse without using these macros. In the 270*f2a19305Safresh1 * future these macros will be extended for enhanced debugging and trace 271*f2a19305Safresh1 * output during the parse process. 272*f2a19305Safresh1 */ 273*f2a19305Safresh1 274*f2a19305Safresh1 /* RExC_parse_incf(flag) 275*f2a19305Safresh1 * 276*f2a19305Safresh1 * Increment RExC_parse to point at the next codepoint, while doing 277*f2a19305Safresh1 * the right thing depending on whether we are parsing UTF-8 strings 278*f2a19305Safresh1 * or not. The 'flag' argument determines if content is UTF-8 or not, 279*f2a19305Safresh1 * intended for cases where this is NOT governed by the UTF macro. 280*f2a19305Safresh1 * 281*f2a19305Safresh1 * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro. 
282*f2a19305Safresh1 * 283*f2a19305Safresh1 * WARNING: Does NOT take into account RExC_end; it is the callers 284*f2a19305Safresh1 * responsibility to make sure there are enough octets left in 285*f2a19305Safresh1 * RExC_parse to ensure that when processing UTF-8 we would not read 286*f2a19305Safresh1 * past the end of the string. 287*f2a19305Safresh1 */ 288*f2a19305Safresh1 #define RExC_parse_incf(flag) STMT_START { \ 289*f2a19305Safresh1 RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1; \ 290*f2a19305Safresh1 } STMT_END 291*f2a19305Safresh1 292*f2a19305Safresh1 /* RExC_parse_inc_safef(flag) 293*f2a19305Safresh1 * 294*f2a19305Safresh1 * Safely increment RExC_parse to point at the next codepoint, 295*f2a19305Safresh1 * doing the right thing depending on whether we are parsing 296*f2a19305Safresh1 * UTF-8 strings or not and NOT reading past the end of the buffer. 297*f2a19305Safresh1 * The 'flag' argument determines if content is UTF-8 or not, 298*f2a19305Safresh1 * intended for cases where this is NOT governed by the UTF macro. 299*f2a19305Safresh1 * 300*f2a19305Safresh1 * Use RExC_parse_safe() if UTF-8ness is controlled by the UTF macro. 301*f2a19305Safresh1 * 302*f2a19305Safresh1 * NOTE: Will NOT read past RExC_end when content is UTF-8. 303*f2a19305Safresh1 */ 304*f2a19305Safresh1 #define RExC_parse_inc_safef(flag) STMT_START { \ 305*f2a19305Safresh1 RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \ 306*f2a19305Safresh1 } STMT_END 307*f2a19305Safresh1 308*f2a19305Safresh1 /* RExC_parse_inc() 309*f2a19305Safresh1 * 310*f2a19305Safresh1 * Increment RExC_parse to point at the next codepoint, 311*f2a19305Safresh1 * doing the right thing depending on whether we are parsing 312*f2a19305Safresh1 * UTF-8 strings or not. 
313*f2a19305Safresh1 * 314*f2a19305Safresh1 * WARNING: Does NOT take into account RExC_end, it is the callers 315*f2a19305Safresh1 * responsibility to make sure there are enough octets left in 316*f2a19305Safresh1 * RExC_parse to ensure that when processing UTF-8 we would not read 317*f2a19305Safresh1 * past the end of the string. 318*f2a19305Safresh1 * 319*f2a19305Safresh1 * NOTE: whether we are parsing UTF-8 or not is determined by the 320*f2a19305Safresh1 * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this 321*f2a19305Safresh1 * macro operates on the pRExC_state structure only. 322*f2a19305Safresh1 */ 323*f2a19305Safresh1 #define RExC_parse_inc() RExC_parse_incf(UTF) 324*f2a19305Safresh1 325*f2a19305Safresh1 /* RExC_parse_inc_safe() 326*f2a19305Safresh1 * 327*f2a19305Safresh1 * Safely increment RExC_parse to point at the next codepoint, 328*f2a19305Safresh1 * doing the right thing depending on whether we are parsing 329*f2a19305Safresh1 * UTF-8 strings or not and NOT reading past the end of the buffer. 330*f2a19305Safresh1 * 331*f2a19305Safresh1 * NOTE: whether we are parsing UTF-8 or not is determined by the 332*f2a19305Safresh1 * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this 333*f2a19305Safresh1 * macro operates on the pRExC_state structure only. 334*f2a19305Safresh1 */ 335*f2a19305Safresh1 #define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF) 336*f2a19305Safresh1 337*f2a19305Safresh1 /* RExC_parse_inc_utf8() 338*f2a19305Safresh1 * 339*f2a19305Safresh1 * Increment RExC_parse to point at the next utf8 codepoint, 340*f2a19305Safresh1 * assumes content is UTF-8. 341*f2a19305Safresh1 * 342*f2a19305Safresh1 * WARNING: Does NOT take into account RExC_end; it is the callers 343*f2a19305Safresh1 * responsibility to make sure there are enough octets left in RExC_parse 344*f2a19305Safresh1 * to ensure that when processing UTF-8 we would not read past the end 345*f2a19305Safresh1 * of the string. 
346*f2a19305Safresh1 */ 347*f2a19305Safresh1 #define RExC_parse_inc_utf8() STMT_START { \ 348*f2a19305Safresh1 RExC_parse += UTF8SKIP(RExC_parse); \ 349*f2a19305Safresh1 } STMT_END 350*f2a19305Safresh1 351*f2a19305Safresh1 /* RExC_parse_inc_if_char() 352*f2a19305Safresh1 * 353*f2a19305Safresh1 * Increment RExC_parse to point at the next codepoint, if and only 354*f2a19305Safresh1 * if the current parse point is NOT a NULL, while doing the right thing 355*f2a19305Safresh1 * depending on whether we are parsing UTF-8 strings or not. 356*f2a19305Safresh1 * 357*f2a19305Safresh1 * WARNING: Does NOT take into account RExC_end, it is the callers 358*f2a19305Safresh1 * responsibility to make sure there are enough octets left in RExC_parse 359*f2a19305Safresh1 * to ensure that when processing UTF-8 we would not read past the end 360*f2a19305Safresh1 * of the string. 361*f2a19305Safresh1 * 362*f2a19305Safresh1 * NOTE: whether we are parsing UTF-8 or not is determined by the 363*f2a19305Safresh1 * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this 364*f2a19305Safresh1 * macro operates on the pRExC_state structure only. 365*f2a19305Safresh1 */ 366*f2a19305Safresh1 #define RExC_parse_inc_if_char() STMT_START { \ 367*f2a19305Safresh1 RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end); \ 368*f2a19305Safresh1 } STMT_END 369*f2a19305Safresh1 370*f2a19305Safresh1 /* RExC_parse_inc_by(n_octets) 371*f2a19305Safresh1 * 372*f2a19305Safresh1 * Increment the parse cursor by the number of octets specified by 373*f2a19305Safresh1 * the 'n_octets' argument. 374*f2a19305Safresh1 * 375*f2a19305Safresh1 * NOTE: Does NOT check ANY constraints. It is the callers responsibility 376*f2a19305Safresh1 * that this will not move past the end of the string, or leave the 377*f2a19305Safresh1 * pointer in the middle of a UTF-8 sequence. 378*f2a19305Safresh1 * 379*f2a19305Safresh1 * Typically used to advanced past previously analyzed content. 
380*f2a19305Safresh1 */ 381*f2a19305Safresh1 #define RExC_parse_inc_by(n_octets) STMT_START { \ 382*f2a19305Safresh1 RExC_parse += (n_octets); \ 383*f2a19305Safresh1 } STMT_END 384*f2a19305Safresh1 385*f2a19305Safresh1 /* RExC_parse_set(to_ptr) 386*f2a19305Safresh1 * 387*f2a19305Safresh1 * Sets the RExC_parse pointer to the pointer specified by the 'to' 388*f2a19305Safresh1 * argument. No validation whatsoever is performed on the to pointer. 389*f2a19305Safresh1 */ 390*f2a19305Safresh1 #define RExC_parse_set(to_ptr) STMT_START { \ 391*f2a19305Safresh1 RExC_parse = (to_ptr); \ 392*f2a19305Safresh1 } STMT_END 393*f2a19305Safresh1 394*f2a19305Safresh1 /**********************************************************************/ 395*f2a19305Safresh1 396*f2a19305Safresh1 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set 397*f2a19305Safresh1 * a flag to disable back-off on the fixed/floating substrings - if it's 398*f2a19305Safresh1 * a high complexity pattern we assume the benefit of avoiding a full match 399*f2a19305Safresh1 * is worth the cost of checking for the substrings even if they rarely help. 400*f2a19305Safresh1 */ 401*f2a19305Safresh1 #define RExC_naughty (pRExC_state->naughty) 402*f2a19305Safresh1 #define TOO_NAUGHTY (10) 403*f2a19305Safresh1 #define MARK_NAUGHTY(add) \ 404*f2a19305Safresh1 if (RExC_naughty < TOO_NAUGHTY) \ 405*f2a19305Safresh1 RExC_naughty += (add) 406*f2a19305Safresh1 #define MARK_NAUGHTY_EXP(exp, add) \ 407*f2a19305Safresh1 if (RExC_naughty < TOO_NAUGHTY) \ 408*f2a19305Safresh1 RExC_naughty += RExC_naughty / (exp) + (add) 409*f2a19305Safresh1 410*f2a19305Safresh1 #define isNON_BRACE_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?') 411*f2a19305Safresh1 #define isQUANTIFIER(s,e) ( isNON_BRACE_QUANTIFIER(*s) \ 412*f2a19305Safresh1 || ((*s) == '{' && regcurly(s, e, NULL))) 413*f2a19305Safresh1 414*f2a19305Safresh1 /* 415*f2a19305Safresh1 * Flags to be passed up. 
416*f2a19305Safresh1 */ 417*f2a19305Safresh1 #define HASWIDTH 0x01 /* Known to not match null strings, could match 418*f2a19305Safresh1 non-null ones. */ 419*f2a19305Safresh1 #define SIMPLE 0x02 /* Exactly one character wide */ 420*f2a19305Safresh1 /* (or LNBREAK as a special case) */ 421*f2a19305Safresh1 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ 422*f2a19305Safresh1 #define TRYAGAIN 0x10 /* Weeded out a declaration. */ 423*f2a19305Safresh1 #define RESTART_PARSE 0x20 /* Need to redo the parse */ 424*f2a19305Safresh1 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to 425*f2a19305Safresh1 calcuate sizes as UTF-8 */ 426*f2a19305Safresh1 427*f2a19305Safresh1 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) 428*f2a19305Safresh1 429*f2a19305Safresh1 /* whether trie related optimizations are enabled */ 430*f2a19305Safresh1 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 431*f2a19305Safresh1 #define TRIE_STUDY_OPT 432*f2a19305Safresh1 #define FULL_TRIE_STUDY 433*f2a19305Safresh1 #define TRIE_STCLASS 434*f2a19305Safresh1 #endif 435*f2a19305Safresh1 436*f2a19305Safresh1 /* About the term "restudy" and the var "restudied" and the defines 437*f2a19305Safresh1 * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to 438*f2a19305Safresh1 * doing multiple study_chunk() calls over the same set of opcodes for* the 439*f2a19305Safresh1 * purpose of enhanced TRIE optimizations. 440*f2a19305Safresh1 * 441*f2a19305Safresh1 * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal 442*f2a19305Safresh1 * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled 443*f2a19305Safresh1 * which then causes the Perl_re_op_compile() to then call the optimizer 444*f2a19305Safresh1 * S_study_chunk() a second time to perform additional optimizations, 445*f2a19305Safresh1 * including the aho_corasick startclass optimization. 
446*f2a19305Safresh1 * This additional pass will only happen once, which is managed by the 447*f2a19305Safresh1 * 'restudied' variable in Perl_re_op_compile(). 448*f2a19305Safresh1 * 449*f2a19305Safresh1 * When this second pass is under way the flags passed into study_chunk() will 450*f2a19305Safresh1 * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down 451*f2a19305Safresh1 * to any recursive calls to S_study_chunk(). 452*f2a19305Safresh1 * 453*f2a19305Safresh1 * IMPORTANT: Any logic in study_chunk() that emits warnings should check that 454*f2a19305Safresh1 * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may 455*f2a19305Safresh1 * be produced twice. 456*f2a19305Safresh1 * 457*f2a19305Safresh1 * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and 458*f2a19305Safresh1 * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details. 459*f2a19305Safresh1 */ 460*f2a19305Safresh1 461*f2a19305Safresh1 462*f2a19305Safresh1 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] 463*f2a19305Safresh1 #define PBITVAL(paren) (1 << ((paren) & 7)) 464*f2a19305Safresh1 #define PAREN_OFFSET(depth) \ 465*f2a19305Safresh1 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes) 466*f2a19305Safresh1 #define PAREN_TEST(depth, paren) \ 467*f2a19305Safresh1 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren)) 468*f2a19305Safresh1 #define PAREN_SET(depth, paren) \ 469*f2a19305Safresh1 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren)) 470*f2a19305Safresh1 #define PAREN_UNSET(depth, paren) \ 471*f2a19305Safresh1 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren)) 472*f2a19305Safresh1 473*f2a19305Safresh1 #define REQUIRE_UTF8(flagp) STMT_START { \ 474*f2a19305Safresh1 if (!UTF) { \ 475*f2a19305Safresh1 *flagp = RESTART_PARSE|NEED_UTF8; \ 476*f2a19305Safresh1 return 0; \ 477*f2a19305Safresh1 } \ 478*f2a19305Safresh1 } STMT_END 479*f2a19305Safresh1 480*f2a19305Safresh1 /* /u is to be chosen if we are supposed to use Unicode 
rules, or if the 481*f2a19305Safresh1 * pattern is in UTF-8. This latter condition is in case the outermost rules 482*f2a19305Safresh1 * are locale. See GH #17278 */ 483*f2a19305Safresh1 #define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF) 484*f2a19305Safresh1 485*f2a19305Safresh1 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is 486*f2a19305Safresh1 * a flag that indicates we need to override /d with /u as a result of 487*f2a19305Safresh1 * something in the pattern. It should only be used in regards to calling 488*f2a19305Safresh1 * set_regex_charset() or get_regex_charset() */ 489*f2a19305Safresh1 #define REQUIRE_UNI_RULES(flagp, restart_retval) \ 490*f2a19305Safresh1 STMT_START { \ 491*f2a19305Safresh1 if (DEPENDS_SEMANTICS) { \ 492*f2a19305Safresh1 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ 493*f2a19305Safresh1 RExC_uni_semantics = 1; \ 494*f2a19305Safresh1 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \ 495*f2a19305Safresh1 /* No need to restart the parse if we haven't seen \ 496*f2a19305Safresh1 * anything that differs between /u and /d, and no need \ 497*f2a19305Safresh1 * to restart immediately if we're going to reparse \ 498*f2a19305Safresh1 * anyway to count parens */ \ 499*f2a19305Safresh1 *flagp |= RESTART_PARSE; \ 500*f2a19305Safresh1 return restart_retval; \ 501*f2a19305Safresh1 } \ 502*f2a19305Safresh1 } \ 503*f2a19305Safresh1 } STMT_END 504*f2a19305Safresh1 505*f2a19305Safresh1 #define REQUIRE_BRANCHJ(flagp, restart_retval) \ 506*f2a19305Safresh1 STMT_START { \ 507*f2a19305Safresh1 RExC_use_BRANCHJ = 1; \ 508*f2a19305Safresh1 *flagp |= RESTART_PARSE; \ 509*f2a19305Safresh1 return restart_retval; \ 510*f2a19305Safresh1 } STMT_END 511*f2a19305Safresh1 512*f2a19305Safresh1 /* Until we have completed the parse, we leave RExC_total_parens at 0 or 513*f2a19305Safresh1 * less. 
After that, it must always be positive, because the whole re is 514*f2a19305Safresh1 * considered to be surrounded by virtual parens. Setting it to negative 515*f2a19305Safresh1 * indicates there is some construct that needs to know the actual number of 516*f2a19305Safresh1 * parens to be properly handled. And that means an extra pass will be 517*f2a19305Safresh1 * required after we've counted them all */ 518*f2a19305Safresh1 #define ALL_PARENS_COUNTED (RExC_total_parens > 0) 519*f2a19305Safresh1 #define REQUIRE_PARENS_PASS \ 520*f2a19305Safresh1 STMT_START { /* No-op if have completed a pass */ \ 521*f2a19305Safresh1 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \ 522*f2a19305Safresh1 } STMT_END 523*f2a19305Safresh1 #define IN_PARENS_PASS (RExC_total_parens < 0) 524*f2a19305Safresh1 525*f2a19305Safresh1 526*f2a19305Safresh1 /* This is used to return failure (zero) early from the calling function if 527*f2a19305Safresh1 * various flags in 'flags' are set. Two flags always cause a return: 528*f2a19305Safresh1 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any 529*f2a19305Safresh1 * additional flags that should cause a return; 0 if none. If the return will 530*f2a19305Safresh1 * be done, '*flagp' is first set to be all of the flags that caused the 531*f2a19305Safresh1 * return. 
*/ 532*f2a19305Safresh1 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \ 533*f2a19305Safresh1 STMT_START { \ 534*f2a19305Safresh1 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \ 535*f2a19305Safresh1 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \ 536*f2a19305Safresh1 return 0; \ 537*f2a19305Safresh1 } \ 538*f2a19305Safresh1 } STMT_END 539*f2a19305Safresh1 540*f2a19305Safresh1 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE)) 541*f2a19305Safresh1 542*f2a19305Safresh1 #define RETURN_FAIL_ON_RESTART(flags,flagp) \ 543*f2a19305Safresh1 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0) 544*f2a19305Safresh1 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) /* return 0 from the caller if '*flagp' has RESTART_PARSE set; wrapped so it stays a single statement under an unbraced if/else */ \ 545*f2a19305Safresh1 STMT_START { if (MUST_RESTART(*(flagp))) return 0; } STMT_END 546*f2a19305Safresh1 547*f2a19305Safresh1 /* This converts the named class defined in regcomp.h to its equivalent class 548*f2a19305Safresh1 * number defined in handy.h. */ 549*f2a19305Safresh1 #define namedclass_to_classnum(class) ((int) ((class) / 2)) 550*f2a19305Safresh1 #define classnum_to_namedclass(classnum) ((classnum) * 2) 551*f2a19305Safresh1 552*f2a19305Safresh1 #define _invlist_union_complement_2nd(a, b, output) \ 553*f2a19305Safresh1 _invlist_union_maybe_complement_2nd(a, b, TRUE, output) 554*f2a19305Safresh1 #define _invlist_intersection_complement_2nd(a, b, output) \ 555*f2a19305Safresh1 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) 556*f2a19305Safresh1 557*f2a19305Safresh1 /* We add a marker if we are deferring expansion of a property that is both 558*f2a19305Safresh1 * 1) potentially user-defined; and 559*f2a19305Safresh1 * 2) could also be an official Unicode property. 560*f2a19305Safresh1 * 561*f2a19305Safresh1 * Without this marker, any deferred expansion can only be for a user-defined 562*f2a19305Safresh1 * one. This marker shouldn't conflict with any that could be in a legal name, 563*f2a19305Safresh1 * and is appended to its name to indicate this. 
There is a string and 564*f2a19305Safresh1 * character form */ 565*f2a19305Safresh1 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~" 566*f2a19305Safresh1 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~' 567*f2a19305Safresh1 568*f2a19305Safresh1 /* What is infinity for optimization purposes */ 569*f2a19305Safresh1 #define OPTIMIZE_INFTY SSize_t_MAX 570*f2a19305Safresh1 571*f2a19305Safresh1 /* About scan_data_t. 572*f2a19305Safresh1 573*f2a19305Safresh1 During optimisation we recurse through the regexp program performing 574*f2a19305Safresh1 various inplace (keyhole style) optimisations. In addition study_chunk 575*f2a19305Safresh1 and scan_commit populate this data structure with information about 576*f2a19305Safresh1 what strings MUST appear in the pattern. We look for the longest 577*f2a19305Safresh1 string that must appear at a fixed location, and we look for the 578*f2a19305Safresh1 longest string that may appear at a floating location. So for instance 579*f2a19305Safresh1 in the pattern: 580*f2a19305Safresh1 581*f2a19305Safresh1 /FOO[xX]A.*B[xX]BAR/ 582*f2a19305Safresh1 583*f2a19305Safresh1 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating 584*f2a19305Safresh1 strings (because they follow a .* construct). study_chunk will identify 585*f2a19305Safresh1 both FOO and BAR as being the longest fixed and floating strings respectively. 586*f2a19305Safresh1 587*f2a19305Safresh1 The strings can be composites, for instance 588*f2a19305Safresh1 589*f2a19305Safresh1 /(f)(o)(o)/ 590*f2a19305Safresh1 591*f2a19305Safresh1 will result in a composite fixed substring 'foo'. 592*f2a19305Safresh1 593*f2a19305Safresh1 For each string some basic information is maintained: 594*f2a19305Safresh1 595*f2a19305Safresh1 - min_offset 596*f2a19305Safresh1 This is the position the string must appear at, or not before. 
597*f2a19305Safresh1 It also implicitly (when combined with minlenp) tells us how many 598*f2a19305Safresh1 characters must match before the string we are searching for. 599*f2a19305Safresh1 Likewise when combined with minlenp and the length of the string it 600*f2a19305Safresh1 tells us how many characters must appear after the string we have 601*f2a19305Safresh1 found. 602*f2a19305Safresh1 603*f2a19305Safresh1 - max_offset 604*f2a19305Safresh1 Only used for floating strings. This is the rightmost point that 605*f2a19305Safresh1 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the 606*f2a19305Safresh1 string can occur infinitely far to the right. 607*f2a19305Safresh1 For fixed strings, it is equal to min_offset. 608*f2a19305Safresh1 609*f2a19305Safresh1 - minlenp 610*f2a19305Safresh1 A pointer to the minimum number of characters of the pattern that the 611*f2a19305Safresh1 string was found inside. This is important as in the case of positive 612*f2a19305Safresh1 lookahead or positive lookbehind we can have multiple patterns 613*f2a19305Safresh1 involved. Consider 614*f2a19305Safresh1 615*f2a19305Safresh1 /(?=FOO).*F/ 616*f2a19305Safresh1 617*f2a19305Safresh1 The minimum length of the pattern overall is 3, the minimum length 618*f2a19305Safresh1 of the lookahead part is 3, but the minimum length of the part that 619*f2a19305Safresh1 will actually match is 1. So 'FOO's minimum length is 3, but the 620*f2a19305Safresh1 minimum length for the F is 1. This is important as the minimum length 621*f2a19305Safresh1 is used to determine offsets in front of and behind the string being 622*f2a19305Safresh1 looked for. Since strings can be composites this is the length of the 623*f2a19305Safresh1 pattern at the time it was committed with a scan_commit. 
Note that 624*f2a19305Safresh1 the length is calculated by study_chunk, so that the minimum lengths 625*f2a19305Safresh1 are not known until the full pattern has been compiled, thus the 626*f2a19305Safresh1 pointer to the value. 627*f2a19305Safresh1 628*f2a19305Safresh1 - lookbehind 629*f2a19305Safresh1 630*f2a19305Safresh1 In the case of lookbehind the string being searched for can be 631*f2a19305Safresh1 offset past the start point of the final matching string. 632*f2a19305Safresh1 If this value was just blithely removed from the min_offset it would 633*f2a19305Safresh1 invalidate some of the calculations for how many chars must match 634*f2a19305Safresh1 before or after (as they are derived from min_offset and minlen and 635*f2a19305Safresh1 the length of the string being searched for). 636*f2a19305Safresh1 When the final pattern is compiled and the data is moved from the 637*f2a19305Safresh1 scan_data_t structure into the regexp structure the information 638*f2a19305Safresh1 about lookbehind is factored in, with the information that would 639*f2a19305Safresh1 have been lost precalculated in the end_shift field for the 640*f2a19305Safresh1 associated string. 641*f2a19305Safresh1 642*f2a19305Safresh1 The fields pos_min and pos_delta are used to store the minimum offset 643*f2a19305Safresh1 and the delta to the maximum offset at the current point in the pattern. 
644*f2a19305Safresh1 645*f2a19305Safresh1 */ 646*f2a19305Safresh1 647*f2a19305Safresh1 struct scan_data_substrs { 648*f2a19305Safresh1 SV *str; /* longest substring found in pattern */ 649*f2a19305Safresh1 SSize_t min_offset; /* earliest point in string it can appear */ 650*f2a19305Safresh1 SSize_t max_offset; /* latest point in string it can appear */ 651*f2a19305Safresh1 SSize_t *minlenp; /* pointer to the minlen relevant to the string */ 652*f2a19305Safresh1 SSize_t lookbehind; /* is the pos of the string modified by LB */ 653*f2a19305Safresh1 I32 flags; /* per substring SF_* and SCF_* flags */ 654*f2a19305Safresh1 }; 655*f2a19305Safresh1 656*f2a19305Safresh1 /* this is typedef'ed in perl.h */ 657*f2a19305Safresh1 struct scan_data_t { 658*f2a19305Safresh1 /*I32 len_min; unused */ 659*f2a19305Safresh1 /*I32 len_delta; unused */ 660*f2a19305Safresh1 SSize_t pos_min; 661*f2a19305Safresh1 SSize_t pos_delta; 662*f2a19305Safresh1 SV *last_found; 663*f2a19305Safresh1 SSize_t last_end; /* min value, <0 unless valid. */ 664*f2a19305Safresh1 SSize_t last_start_min; 665*f2a19305Safresh1 SSize_t last_start_max; 666*f2a19305Safresh1 U8 cur_is_floating; /* whether the last_* values should be set as 667*f2a19305Safresh1 * the next fixed (0) or floating (1) 668*f2a19305Safresh1 * substring */ 669*f2a19305Safresh1 670*f2a19305Safresh1 /* [0] is longest fixed substring so far, [1] is longest float so far */ 671*f2a19305Safresh1 struct scan_data_substrs substrs[2]; 672*f2a19305Safresh1 673*f2a19305Safresh1 I32 flags; /* common SF_* and SCF_* flags */ 674*f2a19305Safresh1 I32 whilem_c; 675*f2a19305Safresh1 SSize_t *last_closep; 676*f2a19305Safresh1 regnode **last_close_opp; /* pointer to pointer to last CLOSE regop 677*f2a19305Safresh1 seen. 
DO NOT DEREFERENCE the regnode 678*f2a19305Safresh1 pointer - the op may have been optimized 679*f2a19305Safresh1 away */ 680*f2a19305Safresh1 regnode_ssc *start_class; 681*f2a19305Safresh1 }; 682*f2a19305Safresh1 683*f2a19305Safresh1 /* 684*f2a19305Safresh1 * Forward declarations for pregcomp()'s friends. 685*f2a19305Safresh1 */ 686*f2a19305Safresh1 687*f2a19305Safresh1 static const scan_data_t zero_scan_data = { 688*f2a19305Safresh1 0, 0, NULL, 0, 0, 0, 0, 689*f2a19305Safresh1 { 690*f2a19305Safresh1 { NULL, 0, 0, 0, 0, 0 }, 691*f2a19305Safresh1 { NULL, 0, 0, 0, 0, 0 }, 692*f2a19305Safresh1 }, 693*f2a19305Safresh1 0, 0, NULL, NULL, NULL 694*f2a19305Safresh1 }; 695*f2a19305Safresh1 696*f2a19305Safresh1 /* study flags */ 697*f2a19305Safresh1 698*f2a19305Safresh1 #define SF_BEFORE_SEOL 0x0001 699*f2a19305Safresh1 #define SF_BEFORE_MEOL 0x0002 700*f2a19305Safresh1 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) 701*f2a19305Safresh1 702*f2a19305Safresh1 #define SF_IS_INF 0x0040 703*f2a19305Safresh1 #define SF_HAS_PAR 0x0080 704*f2a19305Safresh1 #define SF_IN_PAR 0x0100 705*f2a19305Safresh1 #define SF_HAS_EVAL 0x0200 706*f2a19305Safresh1 707*f2a19305Safresh1 708*f2a19305Safresh1 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the 709*f2a19305Safresh1 * longest substring in the pattern. When it is not set the optimiser keeps 710*f2a19305Safresh1 * track of position, but does not keep track of the actual strings seen, 711*f2a19305Safresh1 * 712*f2a19305Safresh1 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but 713*f2a19305Safresh1 * /foo/i will not. 714*f2a19305Safresh1 * 715*f2a19305Safresh1 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble" 716*f2a19305Safresh1 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be 717*f2a19305Safresh1 * turned off because of the alternation (BRANCH). 
*/ 718*f2a19305Safresh1 #define SCF_DO_SUBSTR 0x0400 719*f2a19305Safresh1 720*f2a19305Safresh1 #define SCF_DO_STCLASS_AND 0x0800 721*f2a19305Safresh1 #define SCF_DO_STCLASS_OR 0x1000 722*f2a19305Safresh1 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) 723*f2a19305Safresh1 #define SCF_WHILEM_VISITED_POS 0x2000 724*f2a19305Safresh1 725*f2a19305Safresh1 #define SCF_TRIE_RESTUDY 0x4000 /* Need to do restudy in study_chunk()? 726*f2a19305Safresh1 Search for "restudy" in this file 727*f2a19305Safresh1 to find a detailed explanation.*/ 728*f2a19305Safresh1 #define SCF_SEEN_ACCEPT 0x8000 729*f2a19305Safresh1 #define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now? 730*f2a19305Safresh1 Search for "restudy" in this file 731*f2a19305Safresh1 to find a detailed explanation. */ 732*f2a19305Safresh1 #define SCF_IN_DEFINE 0x20000 733*f2a19305Safresh1 734*f2a19305Safresh1 735*f2a19305Safresh1 736*f2a19305Safresh1 #define UTF cBOOL(RExC_utf8) 737*f2a19305Safresh1 738*f2a19305Safresh1 /* The enums for all these are ordered so things work out correctly */ 739*f2a19305Safresh1 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) 740*f2a19305Safresh1 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ 741*f2a19305Safresh1 == REGEX_DEPENDS_CHARSET) 742*f2a19305Safresh1 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) 743*f2a19305Safresh1 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ 744*f2a19305Safresh1 >= REGEX_UNICODE_CHARSET) 745*f2a19305Safresh1 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ 746*f2a19305Safresh1 == REGEX_ASCII_RESTRICTED_CHARSET) 747*f2a19305Safresh1 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ 748*f2a19305Safresh1 >= REGEX_ASCII_RESTRICTED_CHARSET) 749*f2a19305Safresh1 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ 750*f2a19305Safresh1 == REGEX_ASCII_MORE_RESTRICTED_CHARSET) 751*f2a19305Safresh1 752*f2a19305Safresh1 
#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) 753*f2a19305Safresh1 754*f2a19305Safresh1 /* For programs that want to be strictly Unicode compatible by dying if any 755*f2a19305Safresh1 * attempt is made to match a non-Unicode code point against a Unicode 756*f2a19305Safresh1 * property. */ 757*f2a19305Safresh1 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) 758*f2a19305Safresh1 759*f2a19305Safresh1 #define OOB_NAMEDCLASS -1 760*f2a19305Safresh1 761*f2a19305Safresh1 /* There is no code point that is out-of-bounds, so this is problematic. But 762*f2a19305Safresh1 * its only current use is to initialize a variable that is always set before 763*f2a19305Safresh1 * looked at. */ 764*f2a19305Safresh1 #define OOB_UNICODE 0xDEADBEEF 765*f2a19305Safresh1 766*f2a19305Safresh1 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) 767*f2a19305Safresh1 768*f2a19305Safresh1 769*f2a19305Safresh1 /* length of regex to show in messages that don't mark a position within */ 770*f2a19305Safresh1 #define RegexLengthToShowInErrorMessages 127 771*f2a19305Safresh1 772*f2a19305Safresh1 /* 773*f2a19305Safresh1 * If MARKER[12] are adjusted, be sure to adjust the constants at the top 774*f2a19305Safresh1 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in 775*f2a19305Safresh1 * op/pragma/warn/regcomp. 776*f2a19305Safresh1 */ 777*f2a19305Safresh1 #define MARKER1 "<-- HERE" /* marker as it appears in the description */ 778*f2a19305Safresh1 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ 779*f2a19305Safresh1 780*f2a19305Safresh1 #define REPORT_LOCATION " in regex; marked by " MARKER1 \ 781*f2a19305Safresh1 " in m/%" UTF8f MARKER2 "%" UTF8f "/" 782*f2a19305Safresh1 783*f2a19305Safresh1 /* The code in this file in places uses one level of recursion with parsing 784*f2a19305Safresh1 * rebased to an alternate string constructed by us in memory. 
This can take 785*f2a19305Safresh1 * the form of something that is completely different from the input, or 786*f2a19305Safresh1 * something that uses the input as part of the alternate. In the first case, 787*f2a19305Safresh1 * there should be no possibility of an error, as we are in complete control of 788*f2a19305Safresh1 * the alternate string. But in the second case we don't completely control 789*f2a19305Safresh1 * the input portion, so there may be errors in that. Here's an example: 790*f2a19305Safresh1 * /[abc\x{DF}def]/ui 791*f2a19305Safresh1 * is handled specially because \x{df} folds to a sequence of more than one 792*f2a19305Safresh1 * character: 'ss'. What is done is to create and parse an alternate string, 793*f2a19305Safresh1 * which looks like this: 794*f2a19305Safresh1 * /(?:\x{DF}|[abc\x{DF}def])/ui 795*f2a19305Safresh1 * where it uses the input unchanged in the middle of something it constructs, 796*f2a19305Safresh1 * which is a branch for the DF outside the character class, and clustering 797*f2a19305Safresh1 * parens around the whole thing. (It knows enough to skip the DF inside the 798*f2a19305Safresh1 * class while in this substitute parse.) 'abc' and 'def' may have errors that 799*f2a19305Safresh1 * need to be reported. The general situation looks like this: 800*f2a19305Safresh1 * 801*f2a19305Safresh1 * |<------- identical ------>| 802*f2a19305Safresh1 * sI tI xI eI 803*f2a19305Safresh1 * Input: --------------------------------------------------------------- 804*f2a19305Safresh1 * Constructed: --------------------------------------------------- 805*f2a19305Safresh1 * sC tC xC eC EC 806*f2a19305Safresh1 * |<------- identical ------>| 807*f2a19305Safresh1 * 808*f2a19305Safresh1 * sI..eI is the portion of the input pattern we are concerned with here. 809*f2a19305Safresh1 * sC..EC is the constructed substitute parse string. 
810*f2a19305Safresh1 * sC..tC is constructed by us 811*f2a19305Safresh1 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI. 812*f2a19305Safresh1 * In the diagram, these are vertically aligned. 813*f2a19305Safresh1 * eC..EC is also constructed by us. 814*f2a19305Safresh1 * xC is the position in the substitute parse string where we found a 815*f2a19305Safresh1 * problem. 816*f2a19305Safresh1 * xI is the position in the original pattern corresponding to xC. 817*f2a19305Safresh1 * 818*f2a19305Safresh1 * We want to display a message showing the real input string. Thus we need to 819*f2a19305Safresh1 * translate from xC to xI. We know that xC >= tC, since the portion of the 820*f2a19305Safresh1 * string sC..tC has been constructed by us, and so shouldn't have errors. We 821*f2a19305Safresh1 * get: 822*f2a19305Safresh1 * xI = tI + (xC - tC) 823*f2a19305Safresh1 * 824*f2a19305Safresh1 * When the substitute parse is constructed, the code needs to set: 825*f2a19305Safresh1 * RExC_start (sC) 826*f2a19305Safresh1 * RExC_end (eC) 827*f2a19305Safresh1 * RExC_copy_start_in_input (tI) 828*f2a19305Safresh1 * RExC_copy_start_in_constructed (tC) 829*f2a19305Safresh1 * and restore them when done. 830*f2a19305Safresh1 * 831*f2a19305Safresh1 * During normal processing of the input pattern, both 832*f2a19305Safresh1 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to 833*f2a19305Safresh1 * sI, so that xC equals xI. 
834*f2a19305Safresh1 */ 835*f2a19305Safresh1 836*f2a19305Safresh1 #define sI RExC_precomp 837*f2a19305Safresh1 #define eI RExC_precomp_end 838*f2a19305Safresh1 #define sC RExC_start 839*f2a19305Safresh1 #define eC RExC_end 840*f2a19305Safresh1 #define tI RExC_copy_start_in_input 841*f2a19305Safresh1 #define tC RExC_copy_start_in_constructed 842*f2a19305Safresh1 #define xI(xC) (tI + (xC - tC)) 843*f2a19305Safresh1 #define xI_offset(xC) (xI(xC) - sI) 844*f2a19305Safresh1 845*f2a19305Safresh1 #define REPORT_LOCATION_ARGS(xC) \ 846*f2a19305Safresh1 UTF8fARG(UTF, \ 847*f2a19305Safresh1 (xI(xC) > eI) /* Don't run off end */ \ 848*f2a19305Safresh1 ? eI - sI /* Length before the <--HERE */ \ 849*f2a19305Safresh1 : ((xI_offset(xC) >= 0) \ 850*f2a19305Safresh1 ? xI_offset(xC) \ 851*f2a19305Safresh1 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \ 852*f2a19305Safresh1 IVdf " trying to output message for " \ 853*f2a19305Safresh1 " pattern %.*s", \ 854*f2a19305Safresh1 __FILE__, __LINE__, (IV) xI_offset(xC), \ 855*f2a19305Safresh1 ((int) (eC - sC)), sC), 0)), \ 856*f2a19305Safresh1 sI), /* The input pattern printed up to the <--HERE */ \ 857*f2a19305Safresh1 UTF8fARG(UTF, \ 858*f2a19305Safresh1 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \ 859*f2a19305Safresh1 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */ 860*f2a19305Safresh1 861*f2a19305Safresh1 /* Used to point after bad bytes for an error message, but avoid skipping 862*f2a19305Safresh1 * past a nul byte. */ 863*f2a19305Safresh1 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? 
UTF8_SAFE_SKIP(s, e) : 1) 864*f2a19305Safresh1 865*f2a19305Safresh1 /* Set up to clean up after our imminent demise */ 866*f2a19305Safresh1 #define PREPARE_TO_DIE \ 867*f2a19305Safresh1 STMT_START { \ 868*f2a19305Safresh1 if (RExC_rx_sv) \ 869*f2a19305Safresh1 SAVEFREESV(RExC_rx_sv); \ 870*f2a19305Safresh1 if (RExC_open_parens) \ 871*f2a19305Safresh1 SAVEFREEPV(RExC_open_parens); \ 872*f2a19305Safresh1 if (RExC_close_parens) \ 873*f2a19305Safresh1 SAVEFREEPV(RExC_close_parens); \ 874*f2a19305Safresh1 if (RExC_logical_to_parno) \ 875*f2a19305Safresh1 SAVEFREEPV(RExC_logical_to_parno); \ 876*f2a19305Safresh1 if (RExC_parno_to_logical) \ 877*f2a19305Safresh1 SAVEFREEPV(RExC_parno_to_logical); \ 878*f2a19305Safresh1 } STMT_END 879*f2a19305Safresh1 880*f2a19305Safresh1 /* 881*f2a19305Safresh1 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 882*f2a19305Safresh1 * arg. Show regex, up to a maximum length. If it's too long, chop and add 883*f2a19305Safresh1 * "...". 884*f2a19305Safresh1 */ 885*f2a19305Safresh1 #define _FAIL(code) STMT_START { \ 886*f2a19305Safresh1 const char *ellipses = ""; \ 887*f2a19305Safresh1 IV len = RExC_precomp_end - RExC_precomp; \ 888*f2a19305Safresh1 \ 889*f2a19305Safresh1 PREPARE_TO_DIE; \ 890*f2a19305Safresh1 if (len > RegexLengthToShowInErrorMessages) { \ 891*f2a19305Safresh1 /* chop 10 shorter than the max, to ensure meaning of "..." 
*/ \ 892*f2a19305Safresh1 len = RegexLengthToShowInErrorMessages - 10; \ 893*f2a19305Safresh1 ellipses = "..."; \ 894*f2a19305Safresh1 } \ 895*f2a19305Safresh1 code; \ 896*f2a19305Safresh1 } STMT_END 897*f2a19305Safresh1 898*f2a19305Safresh1 #define FAIL(msg) _FAIL( \ 899*f2a19305Safresh1 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \ 900*f2a19305Safresh1 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) 901*f2a19305Safresh1 902*f2a19305Safresh1 #define FAIL2(msg,arg) _FAIL( \ 903*f2a19305Safresh1 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ 904*f2a19305Safresh1 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) 905*f2a19305Safresh1 906*f2a19305Safresh1 #define FAIL3(msg,arg1,arg2) _FAIL( \ 907*f2a19305Safresh1 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ 908*f2a19305Safresh1 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses)) 909*f2a19305Safresh1 910*f2a19305Safresh1 /* 911*f2a19305Safresh1 * Simple_vFAIL -- like FAIL, but marks the current location in the scan 912*f2a19305Safresh1 */ 913*f2a19305Safresh1 #define Simple_vFAIL(m) STMT_START { \ 914*f2a19305Safresh1 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ 915*f2a19305Safresh1 m, REPORT_LOCATION_ARGS(RExC_parse)); \ 916*f2a19305Safresh1 } STMT_END 917*f2a19305Safresh1 918*f2a19305Safresh1 /* 919*f2a19305Safresh1 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() 920*f2a19305Safresh1 */ 921*f2a19305Safresh1 #define vFAIL(m) STMT_START { \ 922*f2a19305Safresh1 PREPARE_TO_DIE; \ 923*f2a19305Safresh1 Simple_vFAIL(m); \ 924*f2a19305Safresh1 } STMT_END 925*f2a19305Safresh1 926*f2a19305Safresh1 /* 927*f2a19305Safresh1 * Like Simple_vFAIL(), but accepts two arguments. 
928*f2a19305Safresh1 */ 929*f2a19305Safresh1 #define Simple_vFAIL2(m,a1) STMT_START { \ 930*f2a19305Safresh1 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ 931*f2a19305Safresh1 REPORT_LOCATION_ARGS(RExC_parse)); \ 932*f2a19305Safresh1 } STMT_END 933*f2a19305Safresh1 934*f2a19305Safresh1 /* 935*f2a19305Safresh1 * Runs PREPARE_TO_DIE, then Simple_vFAIL2(). 936*f2a19305Safresh1 */ 937*f2a19305Safresh1 #define vFAIL2(m,a1) STMT_START { \ 938*f2a19305Safresh1 PREPARE_TO_DIE; \ 939*f2a19305Safresh1 Simple_vFAIL2(m, a1); \ 940*f2a19305Safresh1 } STMT_END 941*f2a19305Safresh1 942*f2a19305Safresh1 943*f2a19305Safresh1 /* 944*f2a19305Safresh1 * Like Simple_vFAIL(), but accepts three arguments. 945*f2a19305Safresh1 */ 946*f2a19305Safresh1 #define Simple_vFAIL3(m, a1, a2) STMT_START { \ 947*f2a19305Safresh1 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ 948*f2a19305Safresh1 REPORT_LOCATION_ARGS(RExC_parse)); \ 949*f2a19305Safresh1 } STMT_END 950*f2a19305Safresh1 951*f2a19305Safresh1 /* 952*f2a19305Safresh1 * Runs PREPARE_TO_DIE, then Simple_vFAIL3(). 953*f2a19305Safresh1 */ 954*f2a19305Safresh1 #define vFAIL3(m,a1,a2) STMT_START { \ 955*f2a19305Safresh1 PREPARE_TO_DIE; \ 956*f2a19305Safresh1 Simple_vFAIL3(m, a1, a2); \ 957*f2a19305Safresh1 } STMT_END 958*f2a19305Safresh1 959*f2a19305Safresh1 /* 960*f2a19305Safresh1 * Like Simple_vFAIL(), but accepts four arguments. 
961*f2a19305Safresh1 */ 962*f2a19305Safresh1 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ 963*f2a19305Safresh1 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \ 964*f2a19305Safresh1 REPORT_LOCATION_ARGS(RExC_parse)); \ 965*f2a19305Safresh1 } STMT_END 966*f2a19305Safresh1 967*f2a19305Safresh1 #define vFAIL4(m,a1,a2,a3) STMT_START { \ 968*f2a19305Safresh1 PREPARE_TO_DIE; \ 969*f2a19305Safresh1 Simple_vFAIL4(m, a1, a2, a3); \ 970*f2a19305Safresh1 } STMT_END 971*f2a19305Safresh1 972*f2a19305Safresh1 /* A specialized version of vFAIL2 that works with UTF8f */ 973*f2a19305Safresh1 #define vFAIL2utf8f(m, a1) STMT_START { \ 974*f2a19305Safresh1 PREPARE_TO_DIE; \ 975*f2a19305Safresh1 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ 976*f2a19305Safresh1 REPORT_LOCATION_ARGS(RExC_parse)); \ 977*f2a19305Safresh1 } STMT_END 978*f2a19305Safresh1 979*f2a19305Safresh1 #define vFAIL3utf8f(m, a1, a2) STMT_START { \ 980*f2a19305Safresh1 PREPARE_TO_DIE; \ 981*f2a19305Safresh1 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ 982*f2a19305Safresh1 REPORT_LOCATION_ARGS(RExC_parse)); \ 983*f2a19305Safresh1 } STMT_END 984*f2a19305Safresh1 985*f2a19305Safresh1 /* Setting this to NULL is a signal to not output warnings */ 986*f2a19305Safresh1 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \ 987*f2a19305Safresh1 STMT_START { \ 988*f2a19305Safresh1 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\ 989*f2a19305Safresh1 RExC_copy_start_in_constructed = NULL; \ 990*f2a19305Safresh1 } STMT_END 991*f2a19305Safresh1 #define RESTORE_WARNINGS \ 992*f2a19305Safresh1 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed 993*f2a19305Safresh1 994*f2a19305Safresh1 /* Since a warning can be generated multiple times as the input is reparsed, we 995*f2a19305Safresh1 * output it the first time we come to that point in the parse, but suppress it 996*f2a19305Safresh1 * otherwise. 
'RExC_copy_start_in_constructed' being NULL is a flag to not 997*f2a19305Safresh1 * generate any warnings */ 998*f2a19305Safresh1 #define TO_OUTPUT_WARNINGS(loc) \ 999*f2a19305Safresh1 ( RExC_copy_start_in_constructed \ 1000*f2a19305Safresh1 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset) 1001*f2a19305Safresh1 1002*f2a19305Safresh1 /* After we've emitted a warning, we save the position in the input so we don't 1003*f2a19305Safresh1 * output it again */ 1004*f2a19305Safresh1 #define UPDATE_WARNINGS_LOC(loc) \ 1005*f2a19305Safresh1 STMT_START { \ 1006*f2a19305Safresh1 if (TO_OUTPUT_WARNINGS(loc)) { \ 1007*f2a19305Safresh1 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \ 1008*f2a19305Safresh1 - RExC_precomp; \ 1009*f2a19305Safresh1 } \ 1010*f2a19305Safresh1 } STMT_END 1011*f2a19305Safresh1 1012*f2a19305Safresh1 /* 'warns' is the output of the packWARNx macro used in 'code' */ 1013*f2a19305Safresh1 #define _WARN_HELPER(loc, warns, code) \ 1014*f2a19305Safresh1 STMT_START { \ 1015*f2a19305Safresh1 if (! RExC_copy_start_in_constructed) { \ 1016*f2a19305Safresh1 Perl_croak( aTHX_ "panic! 
%s: %d: Tried to warn when none" \ 1017*f2a19305Safresh1 " expected at '%s'", \ 1018*f2a19305Safresh1 __FILE__, __LINE__, loc); \ 1019*f2a19305Safresh1 } \ 1020*f2a19305Safresh1 if (TO_OUTPUT_WARNINGS(loc)) { \ 1021*f2a19305Safresh1 if (ckDEAD(warns)) \ 1022*f2a19305Safresh1 PREPARE_TO_DIE; \ 1023*f2a19305Safresh1 code; \ 1024*f2a19305Safresh1 UPDATE_WARNINGS_LOC(loc); \ 1025*f2a19305Safresh1 } \ 1026*f2a19305Safresh1 } STMT_END 1027*f2a19305Safresh1 1028*f2a19305Safresh1 /* m is not necessarily a "literal string", in this macro */ 1029*f2a19305Safresh1 #define warn_non_literal_string(loc, packed_warn, m) \ 1030*f2a19305Safresh1 _WARN_HELPER(loc, packed_warn, \ 1031*f2a19305Safresh1 Perl_warner(aTHX_ packed_warn, \ 1032*f2a19305Safresh1 "%s" REPORT_LOCATION, \ 1033*f2a19305Safresh1 m, REPORT_LOCATION_ARGS(loc))) 1034*f2a19305Safresh1 #define reg_warn_non_literal_string(loc, m) \ 1035*f2a19305Safresh1 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m) 1036*f2a19305Safresh1 1037*f2a19305Safresh1 #define ckWARN2_non_literal_string(loc, packwarn, m, a1) \ 1038*f2a19305Safresh1 STMT_START { \ 1039*f2a19305Safresh1 char * format; \ 1040*f2a19305Safresh1 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\ 1041*f2a19305Safresh1 Newx(format, format_size, char); \ 1042*f2a19305Safresh1 my_strlcpy(format, m, format_size); \ 1043*f2a19305Safresh1 my_strlcat(format, REPORT_LOCATION, format_size); \ 1044*f2a19305Safresh1 SAVEFREEPV(format); \ 1045*f2a19305Safresh1 _WARN_HELPER(loc, packwarn, \ 1046*f2a19305Safresh1 Perl_ck_warner(aTHX_ packwarn, \ 1047*f2a19305Safresh1 format, \ 1048*f2a19305Safresh1 a1, REPORT_LOCATION_ARGS(loc))); \ 1049*f2a19305Safresh1 } STMT_END 1050*f2a19305Safresh1 1051*f2a19305Safresh1 #define ckWARNreg(loc,m) \ 1052*f2a19305Safresh1 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1053*f2a19305Safresh1 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 1054*f2a19305Safresh1 m REPORT_LOCATION, \ 1055*f2a19305Safresh1 REPORT_LOCATION_ARGS(loc))) 
1056*f2a19305Safresh1 /* Emit a WARN_REGEXP warning through Perl_warner, with the location marker for 'loc' appended */ 1057*f2a19305Safresh1 #define vWARN(loc, m) \ 1058*f2a19305Safresh1 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1059*f2a19305Safresh1 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 1060*f2a19305Safresh1 m REPORT_LOCATION, \ 1061*f2a19305Safresh1 REPORT_LOCATION_ARGS(loc))) 1062*f2a19305Safresh1 1063*f2a19305Safresh1 #define vWARN_dep(loc,category,m) \ 1064*f2a19305Safresh1 _WARN_HELPER(loc, packWARN(category), \ 1065*f2a19305Safresh1 Perl_warner(aTHX_ packWARN(category), \ 1066*f2a19305Safresh1 m REPORT_LOCATION, \ 1067*f2a19305Safresh1 REPORT_LOCATION_ARGS(loc))) 1068*f2a19305Safresh1 1069*f2a19305Safresh1 #define ckWARNdep(loc,category,m) \ 1070*f2a19305Safresh1 _WARN_HELPER(loc, packWARN(category), \ 1071*f2a19305Safresh1 Perl_ck_warner_d(aTHX_ packWARN(category), \ 1072*f2a19305Safresh1 m REPORT_LOCATION, \ 1073*f2a19305Safresh1 REPORT_LOCATION_ARGS(loc))) 1074*f2a19305Safresh1 1075*f2a19305Safresh1 #define ckWARNregdep(loc,category,m) \ 1076*f2a19305Safresh1 _WARN_HELPER(loc, packWARN2(category, WARN_REGEXP), \ 1077*f2a19305Safresh1 Perl_ck_warner_d(aTHX_ packWARN2(category, \ 1078*f2a19305Safresh1 WARN_REGEXP), \ 1079*f2a19305Safresh1 m REPORT_LOCATION, \ 1080*f2a19305Safresh1 REPORT_LOCATION_ARGS(loc))) 1081*f2a19305Safresh1 1082*f2a19305Safresh1 #define ckWARN2reg_d(loc,m, a1) \ 1083*f2a19305Safresh1 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1084*f2a19305Safresh1 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ 1085*f2a19305Safresh1 m REPORT_LOCATION, \ 1086*f2a19305Safresh1 a1, REPORT_LOCATION_ARGS(loc))) 1087*f2a19305Safresh1 1088*f2a19305Safresh1 #define ckWARN2reg(loc, m, a1) \ 1089*f2a19305Safresh1 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1090*f2a19305Safresh1 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 1091*f2a19305Safresh1 m REPORT_LOCATION, \ 1092*f2a19305Safresh1 a1, REPORT_LOCATION_ARGS(loc))) 1093*f2a19305Safresh1 1094*f2a19305Safresh1 #define vWARN3(loc, m, a1, a2) \ 1095*f2a19305Safresh1 _WARN_HELPER(loc, 
packWARN(WARN_REGEXP), \ 1096*f2a19305Safresh1 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 1097*f2a19305Safresh1 m REPORT_LOCATION, \ 1098*f2a19305Safresh1 a1, a2, REPORT_LOCATION_ARGS(loc))) 1099*f2a19305Safresh1 1100*f2a19305Safresh1 #define ckWARN3reg(loc, m, a1, a2) \ 1101*f2a19305Safresh1 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1102*f2a19305Safresh1 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 1103*f2a19305Safresh1 m REPORT_LOCATION, \ 1104*f2a19305Safresh1 a1, a2, \ 1105*f2a19305Safresh1 REPORT_LOCATION_ARGS(loc))) 1106*f2a19305Safresh1 1107*f2a19305Safresh1 #define vWARN4(loc, m, a1, a2, a3) \ 1108*f2a19305Safresh1 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1109*f2a19305Safresh1 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 1110*f2a19305Safresh1 m REPORT_LOCATION, \ 1111*f2a19305Safresh1 a1, a2, a3, \ 1112*f2a19305Safresh1 REPORT_LOCATION_ARGS(loc))) 1113*f2a19305Safresh1 1114*f2a19305Safresh1 #define ckWARN4reg(loc, m, a1, a2, a3) \ 1115*f2a19305Safresh1 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1116*f2a19305Safresh1 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 1117*f2a19305Safresh1 m REPORT_LOCATION, \ 1118*f2a19305Safresh1 a1, a2, a3, \ 1119*f2a19305Safresh1 REPORT_LOCATION_ARGS(loc))) 1120*f2a19305Safresh1 1121*f2a19305Safresh1 #define vWARN5(loc, m, a1, a2, a3, a4) \ 1122*f2a19305Safresh1 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ 1123*f2a19305Safresh1 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 1124*f2a19305Safresh1 m REPORT_LOCATION, \ 1125*f2a19305Safresh1 a1, a2, a3, a4, \ 1126*f2a19305Safresh1 REPORT_LOCATION_ARGS(loc))) 1127*f2a19305Safresh1 1128*f2a19305Safresh1 #define ckWARNexperimental(loc, class, m) \ 1129*f2a19305Safresh1 STMT_START { \ 1130*f2a19305Safresh1 if (! 
RExC_warned_ ## class) { /* warn once per compilation */ \ 1131*f2a19305Safresh1 RExC_warned_ ## class = 1; \ 1132*f2a19305Safresh1 _WARN_HELPER(loc, packWARN(class), \ 1133*f2a19305Safresh1 Perl_ck_warner_d(aTHX_ packWARN(class), \ 1134*f2a19305Safresh1 m REPORT_LOCATION, \ 1135*f2a19305Safresh1 REPORT_LOCATION_ARGS(loc)));\ 1136*f2a19305Safresh1 } \ 1137*f2a19305Safresh1 } STMT_END 1138*f2a19305Safresh1 1139*f2a19305Safresh1 #define ckWARNexperimental_with_arg(loc, class, m, arg) \ 1140*f2a19305Safresh1 STMT_START { \ 1141*f2a19305Safresh1 if (! RExC_warned_ ## class) { /* warn once per compilation */ \ 1142*f2a19305Safresh1 RExC_warned_ ## class = 1; \ 1143*f2a19305Safresh1 _WARN_HELPER(loc, packWARN(class), \ 1144*f2a19305Safresh1 Perl_ck_warner_d(aTHX_ packWARN(class), \ 1145*f2a19305Safresh1 m REPORT_LOCATION, \ 1146*f2a19305Safresh1 arg, REPORT_LOCATION_ARGS(loc)));\ 1147*f2a19305Safresh1 } \ 1148*f2a19305Safresh1 } STMT_END 1149*f2a19305Safresh1 1150*f2a19305Safresh1 /* Convert between a pointer to a node and its offset from the beginning of the 1151*f2a19305Safresh1 * program */ 1152*f2a19305Safresh1 #define REGNODE_p(offset) (RExC_emit_start + (offset)) 1153*f2a19305Safresh1 #define REGNODE_OFFSET(node) (__ASSERT_((node) >= RExC_emit_start) \ 1154*f2a19305Safresh1 (SSize_t) ((node) - RExC_emit_start)) 1155*f2a19305Safresh1 1156*f2a19305Safresh1 #define ProgLen(ri) ri->proglen 1157*f2a19305Safresh1 #define SetProgLen(ri,x) ri->proglen = x 1158*f2a19305Safresh1 1159*f2a19305Safresh1 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 1160*f2a19305Safresh1 #define EXPERIMENTAL_INPLACESCAN 1161*f2a19305Safresh1 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ 1162*f2a19305Safresh1 1163*f2a19305Safresh1 #define DEBUG_RExC_seen() \ 1164*f2a19305Safresh1 DEBUG_OPTIMISE_MORE_r({ \ 1165*f2a19305Safresh1 Perl_re_printf( aTHX_ "RExC_seen: "); \ 1166*f2a19305Safresh1 \ 1167*f2a19305Safresh1 if (RExC_seen & REG_ZERO_LEN_SEEN) \ 1168*f2a19305Safresh1 
Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \ 1169*f2a19305Safresh1 \ 1170*f2a19305Safresh1 if (RExC_seen & REG_LOOKBEHIND_SEEN) \ 1171*f2a19305Safresh1 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \ 1172*f2a19305Safresh1 \ 1173*f2a19305Safresh1 if (RExC_seen & REG_GPOS_SEEN) \ 1174*f2a19305Safresh1 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \ 1175*f2a19305Safresh1 \ 1176*f2a19305Safresh1 if (RExC_seen & REG_RECURSE_SEEN) \ 1177*f2a19305Safresh1 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \ 1178*f2a19305Safresh1 \ 1179*f2a19305Safresh1 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ 1180*f2a19305Safresh1 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \ 1181*f2a19305Safresh1 \ 1182*f2a19305Safresh1 if (RExC_seen & REG_VERBARG_SEEN) \ 1183*f2a19305Safresh1 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \ 1184*f2a19305Safresh1 \ 1185*f2a19305Safresh1 if (RExC_seen & REG_CUTGROUP_SEEN) \ 1186*f2a19305Safresh1 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \ 1187*f2a19305Safresh1 \ 1188*f2a19305Safresh1 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ 1189*f2a19305Safresh1 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \ 1190*f2a19305Safresh1 \ 1191*f2a19305Safresh1 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ 1192*f2a19305Safresh1 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \ 1193*f2a19305Safresh1 \ 1194*f2a19305Safresh1 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ 1195*f2a19305Safresh1 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \ 1196*f2a19305Safresh1 \ 1197*f2a19305Safresh1 if (RExC_seen & REG_PESSIMIZE_SEEN) \ 1198*f2a19305Safresh1 Perl_re_printf( aTHX_ "REG_PESSIMIZE_SEEN "); \ 1199*f2a19305Safresh1 \ 1200*f2a19305Safresh1 Perl_re_printf( aTHX_ "\n"); \ 1201*f2a19305Safresh1 }); 1202*f2a19305Safresh1 1203*f2a19305Safresh1 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \ 1204*f2a19305Safresh1 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag) 1205*f2a19305Safresh1 1206*f2a19305Safresh1 1207*f2a19305Safresh1 #ifdef DEBUGGING 
1208*f2a19305Safresh1 # define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \ 1209*f2a19305Safresh1 debug_studydata(where, data, depth, is_inf, min, stopmin, delta) 1210*f2a19305Safresh1 1211*f2a19305Safresh1 # define DEBUG_PEEP(str, scan, depth, flags) \ 1212*f2a19305Safresh1 debug_peep(str, pRExC_state, scan, depth, flags) 1213*f2a19305Safresh1 #else 1214*f2a19305Safresh1 # define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP 1215*f2a19305Safresh1 # define DEBUG_PEEP(str, scan, depth, flags) NOOP 1216*f2a19305Safresh1 #endif 1217*f2a19305Safresh1 1218*f2a19305Safresh1 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) 1219*f2a19305Safresh1 #ifdef DEBUGGING 1220*f2a19305Safresh1 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) 1221*f2a19305Safresh1 #else 1222*f2a19305Safresh1 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) 1223*f2a19305Safresh1 #endif 1224*f2a19305Safresh1 1225*f2a19305Safresh1 #define MADE_TRIE 1 1226*f2a19305Safresh1 #define MADE_JUMP_TRIE 2 1227*f2a19305Safresh1 #define MADE_EXACT_TRIE 4 1228*f2a19305Safresh1 1229*f2a19305Safresh1 #define INVLIST_INDEX 0 1230*f2a19305Safresh1 #define ONLY_LOCALE_MATCHES_INDEX 1 1231*f2a19305Safresh1 #define DEFERRED_USER_DEFINED_INDEX 2 1232*f2a19305Safresh1 1233*f2a19305Safresh1 /* These two functions currently do the exact same thing */ 1234*f2a19305Safresh1 #define ssc_init_zero ssc_init 1235*f2a19305Safresh1 1236*f2a19305Safresh1 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) 1237*f2a19305Safresh1 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) 1238*f2a19305Safresh1 1239*f2a19305Safresh1 #ifdef DEBUGGING 1240*f2a19305Safresh1 #define REGNODE_GUTS(state,op,extra_size) \ 1241*f2a19305Safresh1 regnode_guts_debug(state,op,extra_size) 1242*f2a19305Safresh1 #else 1243*f2a19305Safresh1 #define REGNODE_GUTS(state,op,extra_size) \ 1244*f2a19305Safresh1 regnode_guts(state,extra_size) 1245*f2a19305Safresh1 #endif 
1246*f2a19305Safresh1 1247*f2a19305Safresh1 #define CLEAR_OPTSTART \ 1248*f2a19305Safresh1 if (optstart) STMT_START { \ 1249*f2a19305Safresh1 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \ 1250*f2a19305Safresh1 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \ 1251*f2a19305Safresh1 optstart=NULL; \ 1252*f2a19305Safresh1 } STMT_END 1253*f2a19305Safresh1 1254*f2a19305Safresh1 #define DUMPUNTIL(b,e) \ 1255*f2a19305Safresh1 CLEAR_OPTSTART; \ 1256*f2a19305Safresh1 node = dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); 1257*f2a19305Safresh1 1258*f2a19305Safresh1 #define REGNODE_STEP_OVER(ret,t1,t2) \ 1259*f2a19305Safresh1 NEXT_OFF(REGNODE_p(ret)) = ((sizeof(t1)+sizeof(t2))/sizeof(regnode)) 1260*f2a19305Safresh1 1261*f2a19305Safresh1 #endif /* REGCOMP_INTERNAL_H */ 1262