xref: /openbsd/gnu/usr.bin/perl/regcomp_internal.h (revision 5486feef)
1 #ifndef REGCOMP_INTERNAL_H
2 #define REGCOMP_INTERNAL_H
/* Storage-class keyword used for file-local helpers; a build may define
 * STATIC beforehand to override it. */
#if !defined(STATIC)
#  define STATIC  static
#endif

/* Both CURLYX optimisation switches default to 1 unless a value has
 * already been supplied. */
#if !defined(RE_OPTIMIZE_CURLYX_TO_CURLYM)
#  define RE_OPTIMIZE_CURLYX_TO_CURLYM 1
#endif
#if !defined(RE_OPTIMIZE_CURLYX_TO_CURLYN)
#  define RE_OPTIMIZE_CURLYX_TO_CURLYN 1
#endif
12 
13 /* this is a chain of data about sub patterns we are processing that
14    need to be handled separately/specially in study_chunk. Its so
15    we can simulate recursion without losing state.  */
16 struct scan_frame;
17 typedef struct scan_frame {
18     regnode *last_regnode;      /* last node to process in this frame */
19     regnode *next_regnode;      /* next node to process when last is reached */
20     U32 prev_recursed_depth;
21     I32 stopparen;              /* what stopparen do we use */
22     bool in_gosub;              /* this or an outer frame is for GOSUB */
23 
24     struct scan_frame *this_prev_frame; /* this previous frame */
25     struct scan_frame *prev_frame;      /* previous frame */
26     struct scan_frame *next_frame;      /* next frame */
27 } scan_frame;
28 
/* Tests whether 'c' is one of the punctuation characters  - [ ] \ ^  which
 * are output as a sequence whose first character is a backslash. */
#define isBACKSLASHED_PUNCT(c)  (memCHRs("-[]\\^", (c)))
32 
33 
/* State carried while a regular expression is being compiled.  The rest of
 * the code normally reaches these members through the RExC_* accessor
 * macros, which dereference a variable named pRExC_state. */
34 struct RExC_state_t {
35     U32         flags;                  /* RXf_* are we folding, multilining? */
36     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
37     char        *precomp;               /* uncompiled string. */
38     char        *precomp_end;           /* pointer to end of uncompiled string. */
39     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
40     regexp      *rx;                    /* perl core regexp structure */
41     regexp_internal     *rxi;           /* internal data for regexp object
42                                            pprivate field */
43     char        *start;                 /* Start of input for compile */
44     char        *end;                   /* End of input for compile */
45     char        *parse;                 /* Input-scan pointer. */
46     char        *copy_start;            /* start of copy of input within
47                                            constructed parse string */
48     char        *save_copy_start;       /* Provides one level of saving
49                                            and restoring 'copy_start' */
50     char        *copy_start_in_input;   /* Position in input string
51                                            corresponding to copy_start */
52     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
53     regnode     *emit_start;            /* Start of emitted-code area */
54     regnode_offset emit;                /* Code-emit pointer */
55     I32         naughty;                /* How bad is this pattern? */
56     I32         sawback;                /* Did we see \1, ...? */
57     SSize_t     size;                   /* Number of regnode equivalents in
58                                            pattern */
59     Size_t      sets_depth;              /* Counts recursion depth of already-
60                                            compiled regex set patterns */
61     U32         seen;
62 
63     I32      parens_buf_size;           /* #slots malloced open/close_parens */
64     regnode_offset *open_parens;        /* offsets to open parens */
65     regnode_offset *close_parens;       /* offsets to close parens */
66     HV          *paren_names;           /* Paren names */
67 
68     /* position beyond 'precomp' of the warning message furthest away from
69      * 'precomp'.  During the parse, no warnings are raised for any problems
70      * earlier in the parse than this position.  This works if warnings are
71      * raised the first time a given spot is parsed, and if only one
72      * independent warning is raised for any given spot */
73     Size_t      latest_warn_offset;
74 
75     /* Branch reset /(?|...|...)/ gives us two concepts of capture buffer id.
76      * "Logical Parno" is the user visible view with branch reset taken into
77      * account. "Parno" (or physical parno) is the actual capture buffers in
78      * the pattern *NOT* taking into account branch reset. We also maintain
79      * a map of "next" pointers which allow us to skip to the next physical
80      * capture buffer with the same logical id, with 0 representing "none".
81      *
82      * As we compile we keep track of the two different counts using the
83      * 'logical_npar' and 'npar' members, and we keep track of the upper bound
84      * of both in 'total_par' and 'logical_total_par', we also populate
85      * the 'logical_to_parno' map, which gives us the first physical parno
86      * for a given logical parno, and the `parno_to_logical` array which gives
87      * us the logical id for each physical parno. When compilation is
88      * completed we construct the 'parno_to_logical_next' array from the
89      * 'parno_to_logical' array. (We do not bother constructing it during
90      * compilation as we do not need it, and we can construct it in O(N) time
91      * once we are done, but would need more complicated logic during the
92      * compile, because we want the next pointers to go from smallest to
93      * largest, eg, left to right.)
94      *
95      * Logical: $1      $2  $3  $4    $2  $3    $2    $5
96      * Physical: 1       2   3   4     5   6     7     8
97      * Next:     0       5   6   0     7   0     0     0
98      * Pattern /(a) (?| (b) (c) (d) | (e) (f) | (g) ) (h)/
99      *
100      * As much as possible the internals use and store the physical id
101      * of capture buffers. We decode the physical to the logical only when
102      * we need to, for instance when someone uses $2.
103      *
104      * Note that when branch reset is not used logical and physical are the
105      * same and the next data would be all zero. So when branch reset is not
106      * used we do not need to populate this data into the final regexp.
107      *
108      */
109     I32         *logical_to_parno;        /* logical_parno to parno */
110     I32         *parno_to_logical;        /* parno to logical_parno */
111     I32         *parno_to_logical_next;   /* parno to next (greater value)
112                                              parno with the same
113                                              logical_parno as parno.*/
114 
115     I32         npar;                   /* Capture buffer count so far in the
116                                            parse, (OPEN) plus one. ("par" 0 is
117                                            the whole pattern)*/
118     I32         logical_npar;           /* Logical version of npar */
119     I32         total_par;              /* During initial parse, is either 0,
120                                            or -1; the latter indicating a
121                                            reparse is needed.  After that pass,
122                                            it is what 'npar' became after the
123                                            pass.  Hence, it being > 0 indicates
124                                            we are in a reparse situation */
125     I32         logical_total_par;      /* Logical version of total_par */
126     I32         nestroot;               /* root parens we are in - used by
127                                            accept */
128     I32         seen_zerolen;
129     regnode     *end_op;                /* END node in program */
130     I32         utf8;           /* whether the pattern is utf8 or not */
131     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
132                                 /* XXX use this for future optimisation of case
133                                  * where pattern must be upgraded to utf8. */
134     I32         uni_semantics;  /* If a d charset modifier should use unicode
135                                    rules, even if the pattern is not in
136                                    utf8 */
137 
138     I32         recurse_count;          /* Number of recurse regops we have generated */
139     regnode     **recurse;              /* Recurse regops */
140     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
141                                            through */
142     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
143     I32         in_lookaround;
144     I32         contains_locale;
145     I32         override_recoding;
146     I32         recode_x_to_native;
147     I32         in_multi_char_class;
148     int         code_index;             /* next code_blocks[] slot */
149     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
150                                             within pattern */
151     SSize_t     maxlen;                        /* NOTE(review): this comment used to read "minimum possible
                                                  number of chars in string to match"; the member name
                                                  suggests a maximum instead -- confirm against the code
                                                  that sets it */
152     scan_frame *frame_head;
153     scan_frame *frame_last;
154     U32         frame_count;
155     AV         *warn_text;
156     HV         *unlexed_names;
157     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
158     bool        seen_d_op;
159     bool        strict;
160     bool        study_started;
161     bool        in_script_run;
162     bool        use_BRANCHJ;
163     bool        sWARN_EXPERIMENTAL__VLB;
164     bool        sWARN_EXPERIMENTAL__REGEX_SETS;
165     /* DEBUGGING only fields, keep these LAST so that we do not
166      * have any weirdness with static builds.
167      *
168      * We include these if we are building a DEBUGGING perl OR if we
169      * are not using dynamic linking (USE_DYNAMIC_LOADING).
170      *
171      * See GH Issue #21558 and also ba6e2c38aafc23cf114f3ba0d0ff3baead34328b
172      */
173 #if defined(DEBUGGING) || !defined(USE_DYNAMIC_LOADING)
174     const char  *lastparse;
175     I32         lastnum;
176     U32         study_chunk_recursed_count;
177     AV          *paren_name_list;       /* idx -> name */
178     SV          *mysv1;
179     SV          *mysv2;
180 #endif
181 };
182 
/* Accessors for the debugging-only members of the compile state; they exist
 * only in DEBUGGING builds and need pRExC_state in scope. */
#if defined(DEBUGGING)
#  define RExC_lastparse                    (pRExC_state->lastparse)
#  define RExC_lastnum                      (pRExC_state->lastnum)
#  define RExC_paren_name_list              (pRExC_state->paren_name_list)
#  define RExC_study_chunk_recursed_count   (pRExC_state->study_chunk_recursed_count)
#  define RExC_mysv1                        (pRExC_state->mysv1)
#  define RExC_mysv2                        (pRExC_state->mysv2)
   /* RExC_mysv names the same member as RExC_mysv1 */
#  define RExC_mysv                         (pRExC_state->mysv1)
#endif
192 
/* Accessor macros for the members of the compile state.  Each one expands
 * to a member access through a pointer variable named pRExC_state, which
 * therefore has to be in scope wherever they are used. */

/* -- flags and the pattern text ---------------------------------------- */
#define RExC_flags                          (pRExC_state->flags)
#define RExC_pm_flags                       (pRExC_state->pm_flags)
#define RExC_precomp                        (pRExC_state->precomp)
#define RExC_precomp_end                    (pRExC_state->precomp_end)
#define RExC_start                          (pRExC_state->start)
#define RExC_end                            (pRExC_state->end)
#define RExC_parse                          (pRExC_state->parse)
#define RExC_copy_start_in_input            (pRExC_state->copy_start_in_input)
#define RExC_copy_start_in_constructed      (pRExC_state->copy_start)
#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
#define RExC_latest_warn_offset             (pRExC_state->latest_warn_offset)

/* -- the regexp objects ------------------------------------------------- */
#define RExC_rx_sv                          (pRExC_state->rx_sv)
#define RExC_rx                             (pRExC_state->rx)
#define RExC_rxi                            (pRExC_state->rxi)

/* -- emitted program and bookkeeping ------------------------------------ */
#define RExC_emit                           (pRExC_state->emit)
#define RExC_emit_start                     (pRExC_state->emit_start)
#define RExC_end_op                         (pRExC_state->end_op)
#define RExC_size                           (pRExC_state->size)
#define RExC_maxlen                         (pRExC_state->maxlen)
#define RExC_whilem_seen                    (pRExC_state->whilem_seen)
#define RExC_sawback                        (pRExC_state->sawback)
#define RExC_seen                           (pRExC_state->seen)
#define RExC_seen_zerolen                   (pRExC_state->seen_zerolen)
/* Seen something that differs under /d from /u ? */
#define RExC_seen_d_op                      (pRExC_state->seen_d_op)

/* -- capture groups (physical and logical) ------------------------------ */
#define RExC_npar                           (pRExC_state->npar)
#define RExC_total_parens                   (pRExC_state->total_par)
#define RExC_logical_npar                   (pRExC_state->logical_npar)
#define RExC_logical_total_parens           (pRExC_state->logical_total_par)
#define RExC_logical_to_parno               (pRExC_state->logical_to_parno)
#define RExC_parno_to_logical               (pRExC_state->parno_to_logical)
#define RExC_parno_to_logical_next          (pRExC_state->parno_to_logical_next)
#define RExC_parens_buf_size                (pRExC_state->parens_buf_size)
#define RExC_open_parens                    (pRExC_state->open_parens)
#define RExC_close_parens                   (pRExC_state->close_parens)
#define RExC_paren_names                    (pRExC_state->paren_names)
#define RExC_nestroot                       (pRExC_state->nestroot)

/* -- character set handling --------------------------------------------- */
#define RExC_utf8                           (pRExC_state->utf8)
#define RExC_orig_utf8                      (pRExC_state->orig_utf8)
#define RExC_uni_semantics                  (pRExC_state->uni_semantics)
#define RExC_contains_locale                (pRExC_state->contains_locale)
#define RExC_recode_x_to_native             (pRExC_state->recode_x_to_native)

/* Assigns RExC_recode_x_to_native on EBCDIC builds; does nothing elsewhere */
#if defined(EBCDIC)
#  define SET_recode_x_to_native(x)                                         \
                    STMT_START { RExC_recode_x_to_native = (x); } STMT_END
#else
#  define SET_recode_x_to_native(x) NOOP
#endif

/* -- recursion and study_chunk ------------------------------------------ */
#define RExC_recurse                        (pRExC_state->recurse)
#define RExC_recurse_count                  (pRExC_state->recurse_count)
#define RExC_sets_depth                     (pRExC_state->sets_depth)
#define RExC_study_chunk_recursed           (pRExC_state->study_chunk_recursed)
#define RExC_study_chunk_recursed_bytes     (pRExC_state->study_chunk_recursed_bytes)
#define RExC_study_started                  (pRExC_state->study_started)
#define RExC_frame_head                     (pRExC_state->frame_head)
#define RExC_frame_last                     (pRExC_state->frame_last)
#define RExC_frame_count                    (pRExC_state->frame_count)

/* -- parse context ------------------------------------------------------ */
#define RExC_in_lookaround                  (pRExC_state->in_lookaround)
#define RExC_in_multi_char_class            (pRExC_state->in_multi_char_class)
#define RExC_in_script_run                  (pRExC_state->in_script_run)
#define RExC_strict                         (pRExC_state->strict)
#define RExC_use_BRANCHJ                    (pRExC_state->use_BRANCHJ)
#define RExC_unlexed_names                  (pRExC_state->unlexed_names)

/* -- warnings ----------------------------------------------------------- */
#define RExC_warn_text                      (pRExC_state->warn_text)
#define RExC_warned_WARN_EXPERIMENTAL__VLB  (pRExC_state->sWARN_EXPERIMENTAL__VLB)
#define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS \
                            (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)

263 
264 
265 /***********************************************************************/
266 /* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse
267  *
268  * All of these macros depend on the above RExC_ accessor macros, which
269  * in turn depend on a variable pRExC_state being in scope where they
270  * are used. This is the standard regexp parser context variable which is
271  * passed into every non-trivial parse function in this file.
272  *
273  * Note that the UTF macro is itself a wrapper around RExC_utf8, so all
274  * of the macros which do not take an argument will operate on the
275  * pRExC_state structure *only*.
276  *
277  * Please do NOT modify RExC_parse without using these macros. In the
278  * future these macros will be extended for enhanced debugging and trace
279  * output during the parse process.
280  */
281 
/* RExC_parse_incf(flag)
 *
 * Advance RExC_parse to the next codepoint.  'flag' tells whether the
 * content is UTF-8: when it is true the step is UTF8SKIP(RExC_parse)
 * octets, otherwise it is one octet.  Intended for cases where UTF-8ness
 * is NOT governed by the UTF macro.
 *
 * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro.
 *
 * WARNING: RExC_end is not consulted; it is the caller's responsibility
 * to make sure enough octets are left at RExC_parse that processing UTF-8
 * would not read past the end of the string.
 */
#define RExC_parse_incf(flag) STMT_START {              \
    if (flag)                                           \
        RExC_parse += UTF8SKIP(RExC_parse);             \
    else                                                \
        RExC_parse += 1;                                \
} STMT_END
299 
/* RExC_parse_inc_safef(flag)
 *
 * Bounded counterpart of RExC_parse_incf(): advance RExC_parse to the next
 * codepoint.  'flag' tells whether the content is UTF-8: when it is true
 * the step is UTF8_SAFE_SKIP(RExC_parse, RExC_end), otherwise it is one
 * octet.  Intended for cases where UTF-8ness is NOT governed by the UTF
 * macro.
 *
 * Use RExC_parse_inc_safe() if UTF-8ness is controlled by the UTF macro.
 *
 * NOTE: Will NOT read past RExC_end when content is UTF-8.
 */
#define RExC_parse_inc_safef(flag) STMT_START {                     \
    if (flag)                                                       \
        RExC_parse += UTF8_SAFE_SKIP(RExC_parse, RExC_end);         \
    else                                                            \
        RExC_parse += 1;                                            \
} STMT_END
315 
316 /* RExC_parse_inc()
317  *
318  * Increment RExC_parse to point at the next codepoint,
319  * doing the right thing depending on whether we are parsing
320  * UTF-8 strings or not.  Expands to RExC_parse_incf(UTF).
321  *
322  * WARNING: Does NOT take into account RExC_end; it is the caller's
323  * responsibility to make sure there are enough octets left in
324  * RExC_parse to ensure that when processing UTF-8 we would not read
325  * past the end of the string.
326  *
327  * NOTE: whether we are parsing UTF-8 or not is determined by the
328  * UTF macro, which is a wrapper around RExC_utf8, thus this
329  * macro operates on the pRExC_state structure only.
330  */
331 #define RExC_parse_inc() RExC_parse_incf(UTF)
332 
333 /* RExC_parse_inc_safe()
334  *
335  * Safely increment RExC_parse to point at the next codepoint,
336  * doing the right thing depending on whether we are parsing
337  * UTF-8 strings or not and NOT reading past the end of the buffer.
338  * Expands to RExC_parse_inc_safef(UTF).
339  * NOTE: whether we are parsing UTF-8 or not is determined by the
340  * UTF macro, which is a wrapper around RExC_utf8, thus this
341  * macro operates on the pRExC_state structure only.
342  */
343 #define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF)
344 
/* RExC_parse_inc_utf8()
 *
 * Advance RExC_parse by UTF8SKIP(RExC_parse) octets, i.e. to the next
 * codepoint on the assumption that the content is UTF-8.
 *
 * WARNING: RExC_end is not consulted; it is the caller's responsibility
 * to make sure enough octets are left at RExC_parse that processing UTF-8
 * would not read past the end of the string.
 */
#define RExC_parse_inc_utf8() STMT_START {                  \
    RExC_parse = RExC_parse + UTF8SKIP(RExC_parse);         \
} STMT_END
358 
/* RExC_parse_inc_if_char()
 *
 * Advance RExC_parse by whatever SKIP_IF_CHAR(RExC_parse, RExC_end)
 * yields.  The intent is to move to the next codepoint if and only if the
 * current parse point is NOT a NUL, doing the right thing depending on
 * whether we are parsing UTF-8 strings or not.
 *
 * NOTE(review): earlier documentation warned that RExC_end is not taken
 * into account, yet RExC_end is handed to SKIP_IF_CHAR here.  Whether the
 * step is bounded by it depends on SKIP_IF_CHAR, which is defined
 * elsewhere -- confirm before relying on either behaviour.
 *
 * NOTE: no argument is taken; everything this macro uses comes from the
 * pRExC_state structure.
 */
#define RExC_parse_inc_if_char() STMT_START {                       \
    RExC_parse = RExC_parse + SKIP_IF_CHAR(RExC_parse, RExC_end);   \
} STMT_END
377 
/* RExC_parse_inc_by(n_octets)
 *
 * Move the parse cursor forward by exactly 'n_octets' octets.
 *
 * NOTE: Does NOT check ANY constraints.  It is the caller's responsibility
 * that this will not move past the end of the string, or leave the
 * pointer in the middle of a UTF-8 sequence.
 *
 * Typically used to advance past previously analyzed content.
 */
#define RExC_parse_inc_by(n_octets) STMT_START {    \
    RExC_parse = RExC_parse + (n_octets);           \
} STMT_END
392 
/* RExC_parse_set(to_ptr)
 *
 * Assigns 'to_ptr' to RExC_parse.  The pointer is not validated in any
 * way.
 */
#define RExC_parse_set(to_ptr)  STMT_START { RExC_parse = (to_ptr); } STMT_END
401 
402 /**********************************************************************/
403 
/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 * a flag to disable back-off on the fixed/floating substrings - if it's
 * a high complexity pattern we assume the benefit of avoiding a full match
 * is worth the cost of checking for the substrings even if they rarely help.
 */
#define RExC_naughty    (pRExC_state->naughty)
#define TOO_NAUGHTY (10)

/* MARK_NAUGHTY(add)
 *   Adds 'add' to RExC_naughty, but only while RExC_naughty is still below
 *   TOO_NAUGHTY.
 *
 * MARK_NAUGHTY_EXP(exp, add)
 *   Same guard; the increment is RExC_naughty / (exp) + (add).
 *
 * Both are wrapped in STMT_START/STMT_END so that each use is a single
 * statement.  As a bare 'if', a use such as
 *     if (cond) MARK_NAUGHTY(1); else other();
 * would attach the 'else' to the macro's own 'if'.  Callers must supply
 * the trailing semicolon.
 */
#define MARK_NAUGHTY(add) STMT_START {                      \
    if (RExC_naughty < TOO_NAUGHTY)                         \
        RExC_naughty += (add);                              \
} STMT_END
#define MARK_NAUGHTY_EXP(exp, add) STMT_START {             \
    if (RExC_naughty < TOO_NAUGHTY)                         \
        RExC_naughty += RExC_naughty / (exp) + (add);       \
} STMT_END
417 
/* isNON_BRACE_QUANTIFIER(c): true when the character 'c' is '*', '+' or '?'.
 *
 * isQUANTIFIER(s,e): true when the character at 's' is one of those, or is
 * a '{' for which regcurly(s, e, NULL) returns true.  's' is evaluated more
 * than once, so it must not have side effects; it is parenthesized so that
 * pointer expressions such as 'p + 1' are dereferenced as a whole. */
#define isNON_BRACE_QUANTIFIER(c)   ((c) == '*' || (c) == '+' || (c) == '?')
#define isQUANTIFIER(s,e)  (   isNON_BRACE_QUANTIFIER(*(s))                    \
                            || (*(s) == '{' && regcurly((s), (e), NULL)))
421 
/*
 * Flag bits to be passed up.  Each flag is a distinct single bit; the bit
 * with value 0x04 is not assigned here.
 */
#define HASWIDTH        (1 << 0)    /* 0x01: Known to not match null strings,
                                       could match non-null ones. */
#define SIMPLE          (1 << 1)    /* 0x02: Exactly one character wide
                                       (or LNBREAK as a special case) */
#define POSTPONED       (1 << 3)    /* 0x08: (?1),(?&name), (??{...}) or
                                       similar */
#define TRYAGAIN        (1 << 4)    /* 0x10: Weeded out a declaration. */
#define RESTART_PARSE   (1 << 5)    /* 0x20: Need to redo the parse */
#define NEED_UTF8       (1 << 6)    /* 0x40: In conjunction with
                                       RESTART_PARSE, need to calculate
                                       sizes as UTF-8 */
434 
/* Offset of node pointer 'x' from RExC_emit_start, as an int, or -1 when
 * 'x' is a null pointer.  'x' is evaluated twice. */
#define REG_NODE_NUM(x) (!(x) ? -1 : (int)((x) - RExC_emit_start))
436 
/* Whether trie related optimizations are enabled: the three switches below
 * are defined together when PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION is
 * non-zero. */
#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
#  define TRIE_STUDY_OPT
#  define FULL_TRIE_STUDY
#  define TRIE_STCLASS
#endif
443 
444 /* About the term "restudy" and the var "restudied" and the defines
445  * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to
446  * doing multiple study_chunk() calls over the same set of opcodes for the
447  * purpose of enhanced TRIE optimizations.
448  *
449  * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal
450  * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled
451  * which then causes the Perl_re_op_compile() to then call the optimizer
452  * S_study_chunk() a second time to perform additional optimizations,
453  * including the aho_corasick startclass optimization.
454  * This additional pass will only happen once, which is managed by the
455  * 'restudied' variable in Perl_re_op_compile().
456  *
457  * When this second pass is under way the flags passed into study_chunk() will
458  * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down
459  * to any recursive calls to S_study_chunk().
460  *
461  * IMPORTANT: Any logic in study_chunk() that emits warnings should check that
462  * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may
463  * be produced twice.
464  *
465  * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and
466  * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details.
467  */
468 
469 
/* Bit helpers for the RExC_study_chunk_recursed bitmap.
 *
 * PAREN_OFFSET(depth) is the start of the row for 'depth'; rows are
 * RExC_study_chunk_recursed_bytes bytes apart.  Within a row, the bit for
 * 'paren' lives in byte (paren >> 3) at bit position (paren & 7).
 *
 * PAREN_TEST yields non-zero when the bit is set; PAREN_SET and PAREN_UNSET
 * turn it on and off.  'paren' is evaluated twice by each of the three. */
#define PBYTE(u8str,paren)      (((U8*)(u8str))[(paren) >> 3])
#define PBITVAL(paren)          (1 << ((paren) & 7))
#define PAREN_OFFSET(depth)                                                 \
    (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
#define PAREN_TEST(depth, paren)                                            \
    (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
#define PAREN_SET(depth, paren)                                             \
    (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
#define PAREN_UNSET(depth, paren)                                           \
    (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
480 
/* REQUIRE_UTF8(flagp)
 *
 * If the pattern is not being treated as UTF-8 (per the UTF macro), store
 * RESTART_PARSE|NEED_UTF8 through 'flagp' and return 0 from the ENCLOSING
 * function.  Otherwise does nothing.  'flagp' is parenthesized so that any
 * pointer expression may be passed. */
#define REQUIRE_UTF8(flagp) STMT_START {                                   \
                                     if (!UTF) {                           \
                                         *(flagp) = RESTART_PARSE|NEED_UTF8; \
                                         return 0;                         \
                                     }                                     \
                             } STMT_END

/* /u is to be chosen if we are supposed to use Unicode rules, or if the
 * pattern is in UTF-8.  This latter condition is in case the outermost rules
 * are locale.  See GH #17278 */
#define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)

/* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
 * a flag that indicates we need to override /d with /u as a result of
 * something in the pattern.  It should only be used in regards to calling
 * set_regex_charset() or get_regex_charset()
 *
 * Nothing happens unless DEPENDS_SEMANTICS is true.  When it is, the charset
 * in RExC_flags is switched and RExC_uni_semantics is set.  The enclosing
 * function then returns 'restart_retval', with RESTART_PARSE or-ed into
 * '*flagp', only if RExC_seen_d_op is set and we are not IN_PARENS_PASS:
 * there is no need to restart the parse if we haven't seen anything that
 * differs between /u and /d, and no need to restart immediately if we're
 * going to reparse anyway to count parens. */
#define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
    STMT_START {                                                            \
            if (DEPENDS_SEMANTICS) {                                        \
                set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
                RExC_uni_semantics = 1;                                     \
                if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
                    *(flagp) |= RESTART_PARSE;                              \
                    return restart_retval;                                  \
                }                                                           \
            }                                                               \
    } STMT_END

/* REQUIRE_BRANCHJ(flagp, restart_retval)
 *
 * Unconditionally sets RExC_use_BRANCHJ, or-s RESTART_PARSE into '*flagp'
 * and returns 'restart_retval' from the enclosing function. */
#define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
    STMT_START {                                                            \
                RExC_use_BRANCHJ = 1;                                       \
                *(flagp) |= RESTART_PARSE;                                  \
                return restart_retval;                                      \
    } STMT_END
519 
/* RExC_total_parens is left at 0 or less until the parse has completed.
 * From then on it must always be positive, because the whole re is
 * considered to be surrounded by virtual parens.  A negative value means
 * that some construct needs to know the actual number of parens to be
 * handled properly, so an extra pass will be required once they have all
 * been counted.
 *
 *   ALL_PARENS_COUNTED   - a counting pass has completed (value > 0)
 *   IN_PARENS_PASS       - an extra pass has been requested (value < 0)
 *   REQUIRE_PARENS_PASS  - request that extra pass; a no-op once a pass has
 *                          completed
 */
#define IN_PARENS_PASS      (RExC_total_parens < 0)
#define ALL_PARENS_COUNTED  (RExC_total_parens > 0)
#define REQUIRE_PARENS_PASS                                                 \
    STMT_START {                                                            \
        if (! ALL_PARENS_COUNTED) {                                         \
            RExC_total_parens = -1;                                         \
        }                                                                   \
    } STMT_END
532 
533 
/* This is used to return failure (zero) early from the calling function if
 * various flags in 'flags' are set.  Two flags always cause a return:
 * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
 * additional flags that should cause a return; 0 if none.  If the return will
 * be done, '*flagp' is first set to be all of the flags that caused the
 * return.  Note that 'flags' and 'extra' are each evaluated twice. */
#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
    STMT_START {                                                            \
            if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
                *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
                return 0;                                                   \
            }                                                               \
    } STMT_END

/* Non-zero when RESTART_PARSE is set in 'flags' */
#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))

/* As RETURN_FAIL_ON_RESTART_OR_FLAGS, with no extra flags */
#define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
                        RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)

/* Returns 0 from the calling function when RESTART_PARSE is set in
 * '*flagp'; '*flagp' itself is left untouched.  Wrapped in
 * STMT_START/STMT_END so that it is a single statement and a following
 * 'else' cannot bind to the macro's own 'if'. */
#define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
    STMT_START {                                                            \
            if (MUST_RESTART(*(flagp))) return 0;                           \
    } STMT_END
554 
/* namedclass_to_classnum() converts a named class, as defined in regcomp.h,
 * to its equivalent class number, as defined in handy.h: the value is
 * divided by 2 (integer division) and cast to int.
 * classnum_to_namedclass() goes the other way by multiplying by 2. */
#define namedclass_to_classnum(named)   ((int) ((named) / 2))
#define classnum_to_namedclass(num)     ((num) * 2)
559 
/* Shorthands for the *_maybe_complement_2nd() inversion-list operations
 * with the third argument fixed to TRUE. */
#define _invlist_union_complement_2nd(a, b, output)                         \
    _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
#define _invlist_intersection_complement_2nd(a, b, output)                  \
    _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
564 
565 /* We add a marker if we are deferring expansion of a property that is both
566  * 1) potentially user-defined; and
567  * 2) could also be an official Unicode property.
568  *
569  * Without this marker, any deferred expansion can only be for a user-defined
570  * one.  This marker shouldn't conflict with any that could be in a legal name,
571  * and is appended to its name to indicate this.  There is a string form
572  * (suffix 's') and a character form (suffix 'c') of the same marker */
573 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs  "~"
574 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc  '~'
575 
576 /* What is infinity for optimization purposes; for example, a max_offset
     * set to this value indicates that the string can occur infinitely far
     * to the right (see the scan_data_t notes below) */
577 #define OPTIMIZE_INFTY  SSize_t_MAX
578 
579 /* About scan_data_t.
580 
581   During optimisation we recurse through the regexp program performing
582   various inplace (keyhole style) optimisations. In addition study_chunk
583   and scan_commit populate this data structure with information about
584   what strings MUST appear in the pattern. We look for the longest
585   string that must appear at a fixed location, and we look for the
586   longest string that may appear at a floating location. So for instance
587   in the pattern:
588 
589     /FOO[xX]A.*B[xX]BAR/
590 
591   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
592   strings (because they follow a .* construct). study_chunk will identify
593   both FOO and BAR as being the longest fixed and floating strings respectively.
594 
595   The strings can be composites, for instance
596 
597      /(f)(o)(o)/
598 
599   will result in a composite fixed substring 'foo'.
600 
601   For each string some basic information is maintained:
602 
603   - min_offset
604     This is the position the string must appear at, or not before.
605     It also implicitly (when combined with minlenp) tells us how many
606     characters must match before the string we are searching for.
607     Likewise when combined with minlenp and the length of the string it
608     tells us how many characters must appear after the string we have
609     found.
610 
611   - max_offset
612     Only used for floating strings. This is the rightmost point that
613     the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
614     string can occur infinitely far to the right.
615     For fixed strings, it is equal to min_offset.
616 
617   - minlenp
618     A pointer to the minimum number of characters of the pattern that the
619     string was found inside. This is important as in the case of positive
620     lookahead or positive lookbehind we can have multiple patterns
621     involved. Consider
622 
623     /(?=FOO).*F/
624 
625     The minimum length of the pattern overall is 3, the minimum length
626     of the lookahead part is 3, but the minimum length of the part that
627     will actually match is 1. So 'FOO's minimum length is 3, but the
628     minimum length for the F is 1. This is important as the minimum length
629     is used to determine offsets in front of and behind the string being
630     looked for.  Since strings can be composites this is the length of the
631     pattern at the time it was committed with a scan_commit. Note that
632     the length is calculated by study_chunk, so that the minimum lengths
633     are not known until the full pattern has been compiled, thus the
634     pointer to the value.
635 
636   - lookbehind
637 
638     In the case of lookbehind the string being searched for can be
639     offset past the start point of the final matching string.
640     If this value was just blithely removed from the min_offset it would
641     invalidate some of the calculations for how many chars must match
642     before or after (as they are derived from min_offset and minlen and
643     the length of the string being searched for).
644     When the final pattern is compiled and the data is moved from the
645     scan_data_t structure into the regexp structure the information
646     about lookbehind is factored in, with the information that would
647     have been lost precalculated in the end_shift field for the
648     associated string.
649 
650   The fields pos_min and pos_delta are used to store the minimum offset
651   and the delta to the maximum offset at the current point in the pattern.
652 
653 */
654 
/* Bookkeeping for one candidate substring found while studying a pattern.
 * scan_data_t holds two of these: [0] for the fixed and [1] for the floating
 * substring.  See the "About scan_data_t" comment above for the details of
 * min_offset, max_offset, minlenp and lookbehind. */
655 struct scan_data_substrs {
656     SV      *str;       /* longest substring found in pattern */
657     SSize_t min_offset; /* earliest point in string it can appear */
658     SSize_t max_offset; /* latest point in string it can appear;
                         * OPTIMIZE_INFTY if unbounded, and equal to
                         * min_offset for a fixed string */
659     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
660     SSize_t lookbehind; /* is the pos of the string modified by LB
                         * (lookbehind) */
661     I32 flags;          /* per substring SF_* and SCF_* flags */
662 };
663 
664 /* this is typedef'ed in perl.h
 *
 * Working state of the optimiser while it studies a pattern.  NOTE: the
 * order of the fields matters, because zero_scan_data below is initialized
 * positionally. */
665 struct scan_data_t {
666     /*I32 len_min;      unused */
667     /*I32 len_delta;    unused */
668     SSize_t pos_min;        /* minimum offset at the current point in the
                             * pattern */
669     SSize_t pos_delta;      /* delta from pos_min to the maximum offset at
                             * the current point in the pattern */
670     SV *last_found;
671     SSize_t last_end;       /* min value, <0 unless valid. */
672     SSize_t last_start_min;
673     SSize_t last_start_max;
674     U8      cur_is_floating; /* whether the last_* values should be set as
675                               * the next fixed (0) or floating (1)
676                               * substring */
677 
678     /* [0] is longest fixed substring so far, [1] is longest float so far */
679     struct scan_data_substrs  substrs[2];
680 
681     I32 flags;             /* common SF_* and SCF_* flags */
682     I32 whilem_c;
683     SSize_t *last_closep;
684     regnode **last_close_opp; /* pointer to pointer to last CLOSE regop
685                                  seen. DO NOT DEREFERENCE the regnode
686                                  pointer - the op may have been optimized
687                                  away */
688     regnode_ssc *start_class;
689 };
690 
691 /* An all-zero scan_data_t.  The initializer below is positional, so it
692  * must be kept in step with the field order of struct scan_data_t (and of
693  * struct scan_data_substrs for the two inner rows). */
694 
695 static const scan_data_t zero_scan_data = {
696     0, 0, NULL, 0, 0, 0, 0,         /* pos_min .. cur_is_floating */
697     {
698         { NULL, 0, 0, 0, 0, 0 },    /* substrs[0]: fixed */
699         { NULL, 0, 0, 0, 0, 0 },    /* substrs[1]: floating */
700     },
701     0, 0, NULL, NULL, NULL          /* flags .. start_class */
702 };
703 
704 /* study flags.  The SF_* bits here and the SCF_* bits defined further down
 * are stored together in the same 'flags' fields (see scan_data_t and
 * scan_data_substrs), and their values occupy disjoint bits. */
705 
706 #define SF_BEFORE_SEOL          0x0001
707 #define SF_BEFORE_MEOL          0x0002
708 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL) /* mask of both */
709 
710 #define SF_IS_INF               0x0040
711 #define SF_HAS_PAR              0x0080
712 #define SF_IN_PAR               0x0100
713 #define SF_HAS_EVAL             0x0200
714 
715 
716 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
717  * longest substring in the pattern. When it is not set the optimiser keeps
718  * track of position, but does not keep track of the actual strings seen.
719  *
720  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
721  * /foo/i will not.
722  *
723  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
724  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
725  * turned off because of the alternation (BRANCH). */
726 #define SCF_DO_SUBSTR           0x0400
727 
728 #define SCF_DO_STCLASS_AND      0x0800
729 #define SCF_DO_STCLASS_OR       0x1000
730 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
731 #define SCF_WHILEM_VISITED_POS  0x2000
732 
733 #define SCF_TRIE_RESTUDY        0x4000 /* Need to do restudy in study_chunk()?
734                                           Search for "restudy" in this file
735                                           to find a detailed explanation.*/
736 #define SCF_SEEN_ACCEPT         0x8000
737 #define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now?
738                                           Search for "restudy" in this file
739                                           to find a detailed explanation. */
740 #define SCF_IN_DEFINE          0x20000
741 
742 
743 
744 #define UTF cBOOL(RExC_utf8)    /* true if RExC_utf8 is set */
745 
746 /* Tests of the character set selected in RExC_flags.  The enums for all these
 * are ordered so things work out correctly; in particular the AT_LEAST_*
 * macros rely on that ordering for their '>=' comparisons. */
747 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
748 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
749                                                      == REGEX_DEPENDS_CHARSET)
750 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
751 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
752                                                      >= REGEX_UNICODE_CHARSET)
753 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
754                                             == REGEX_ASCII_RESTRICTED_CHARSET)
755 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
756                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
757 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
758                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
759 
/* true if the RXf_PMf_FOLD bit is set in RExC_flags */
760 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
761 
762 /* For programs that want to be strictly Unicode compatible by dying if any
763  * attempt is made to match a non-Unicode code point against a Unicode
764  * property.  */
765 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
766 
767 #define OOB_NAMEDCLASS          -1
768 
769 /* There is no code point that is out-of-bounds, so this is problematic.  But
770  * its only current use is to initialize a variable that is always set before
771  * looked at. */
772 #define OOB_UNICODE             0xDEADBEEF
773 
774 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
775 
776 
777 /* length of regex to show in messages that don't mark a position within */
778 #define RegexLengthToShowInErrorMessages 127
779 
780 /*
781  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
782  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
783  * op/pragma/warn/regcomp.
784  */
785 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
786 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
787 
/* Format-string tail appended to positional error and warning messages.  It
 * consumes two UTF8f arguments: the pattern text that goes before MARKER2 and
 * the pattern text that goes after it.  REPORT_LOCATION_ARGS() supplies
 * both. */
788 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
789                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
790 
791 /* The code in this file in places uses one level of recursion with parsing
792  * rebased to an alternate string constructed by us in memory.  This can take
793  * the form of something that is completely different from the input, or
794  * something that uses the input as part of the alternate.  In the first case,
795  * there should be no possibility of an error, as we are in complete control of
796  * the alternate string.  But in the second case we don't completely control
797  * the input portion, so there may be errors in that.  Here's an example:
798  *      /[abc\x{DF}def]/ui
799  * is handled specially because \x{df} folds to a sequence of more than one
800  * character: 'ss'.  What is done is to create and parse an alternate string,
801  * which looks like this:
802  *      /(?:\x{DF}|[abc\x{DF}def])/ui
803  * where it uses the input unchanged in the middle of something it constructs,
804  * which is a branch for the DF outside the character class, and clustering
805  * parens around the whole thing. (It knows enough to skip the DF inside the
806  * class while in this substitute parse.) 'abc' and 'def' may have errors that
807  * need to be reported.  The general situation looks like this:
808  *
809  *                                       |<------- identical ------>|
810  *              sI                       tI               xI       eI
811  * Input:       ---------------------------------------------------------------
812  * Constructed:         ---------------------------------------------------
813  *                      sC               tC               xC       eC     EC
814  *                                       |<------- identical ------>|
815  *
816  * sI..eI   is the portion of the input pattern we are concerned with here.
817  * sC..EC   is the constructed substitute parse string.
818  *  sC..tC  is constructed by us
819  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
820  *          In the diagram, these are vertically aligned.
821  *  eC..EC  is also constructed by us.
822  * xC       is the position in the substitute parse string where we found a
823  *          problem.
824  * xI       is the position in the original pattern corresponding to xC.
825  *
826  * We want to display a message showing the real input string.  Thus we need to
827  * translate from xC to xI.  We know that xC >= tC, since the portion of the
828  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
829  * get:
830  *      xI = tI + (xC - tC)
831  *
832  * When the substitute parse is constructed, the code needs to set:
833  *      RExC_start (sC)
834  *      RExC_end (eC)
835  *      RExC_copy_start_in_input  (tI)
836  *      RExC_copy_start_in_constructed (tC)
837  * and restore them when done.
838  *
839  * During normal processing of the input pattern, both
840  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
841  * sI, so that xC equals xI.
842  */
843 
844 #define sI              RExC_precomp
845 #define eI              RExC_precomp_end
846 #define sC              RExC_start
847 #define eC              RExC_end
848 #define tI              RExC_copy_start_in_input
849 #define tC              RExC_copy_start_in_constructed
/* Translate 'xC', a position in the string currently being parsed, into the
 * corresponding position in the original input pattern: tI + (xC - tC).  The
 * parameter is parenthesized so that any pointer-valued expression (for
 * example a conditional expression) can be passed without being re-parsed
 * against the surrounding '-'. */
850 #define xI(xC)          (tI + ((xC) - tC))
/* Offset of that translated position from the start of the input pattern */
851 #define xI_offset(xC)   (xI(xC) - sI)
852 
/* Expands to the two UTF8fARG() arguments that REPORT_LOCATION's format
 * consumes, for a problem found at 'xC' in the string being parsed:
 *   1) the input pattern from its start (sI) up to the translated position
 *      xI(xC), or the whole pattern if that position would lie beyond eI;
 *   2) the remainder of the input pattern from that position (empty if the
 *      position was beyond eI).
 * If the translated offset is negative, this croaks with a panic message that
 * shows the string being parsed (sC..eC) instead.  Note that 'xC' is
 * evaluated several times. */
853 #define REPORT_LOCATION_ARGS(xC)                                            \
854     UTF8fARG(UTF,                                                           \
855              (xI(xC) > eI) /* Don't run off end */                          \
856               ? eI - sI   /* Length before the <--HERE */                   \
857               : ((xI_offset(xC) >= 0)                                       \
858                  ? xI_offset(xC)                                            \
859                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
860                                     IVdf " trying to output message for "   \
861                                     " pattern %.*s",                        \
862                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
863                                     ((int) (eC - sC)), sC), 0)),            \
864              sI),         /* The input pattern printed up to the <--HERE */ \
865     UTF8fARG(UTF,                                                           \
866              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
867              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
868 
869 /* Used to point after bad bytes for an error message, but avoid skipping
870  * past a nul byte. */
871 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
872 
873 /* Set up to clean up after our imminent demise: each of the items below that
 * is non-NULL is handed to SAVEFREESV() (the regexp SV) or SAVEFREEPV() (the
 * paren bookkeeping arrays).  NOTE(review): presumably so that they are
 * released when the save stack is unwound by the croak -- confirm against
 * perlguts. */
874 #define PREPARE_TO_DIE                                                      \
875     STMT_START {                                                            \
876         if (RExC_rx_sv)                                                     \
877             SAVEFREESV(RExC_rx_sv);                                         \
878         if (RExC_open_parens)                                               \
879             SAVEFREEPV(RExC_open_parens);                                   \
880         if (RExC_close_parens)                                              \
881             SAVEFREEPV(RExC_close_parens);                                  \
882         if (RExC_logical_to_parno)                                          \
883             SAVEFREEPV(RExC_logical_to_parno);                              \
884         if (RExC_parno_to_logical)                                          \
885             SAVEFREEPV(RExC_parno_to_logical);                              \
886     } STMT_END
887 
888 /*
889  * Runs 'code' (in practice a Perl_croak call, see FAIL* below) with two
890  * locals in scope: 'len', how much of the regex to show, and 'ellipses',
891  * which is "..." if the regex was too long and got chopped, else "".
892  */
893 #define _FAIL(code) STMT_START {                                        \
894     const char *ellipses = "";                                          \
895     IV len = RExC_precomp_end - RExC_precomp;                           \
896                                                                         \
897     if (len > RegexLengthToShowInErrorMessages) {                       \
898         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
899         len = RegexLengthToShowInErrorMessages - 10;                    \
900         ellipses = "...";                                               \
901     }                                                                   \
902     code;                                                               \
903 } STMT_END
904 
/* Croak with 'msg' followed by the (possibly chopped) regex.  FAIL passes
 * 'msg' through "%s"; FAIL2 and FAIL3 paste 'msg' onto the format string, so
 * there it must be a string literal, with conversions for 'arg' or
 * 'arg1'/'arg2'. */
905 #define FAIL(msg) _FAIL(                            \
906     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
907             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
908 
909 #define FAIL2(msg,arg) _FAIL(                       \
910     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
911             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
912 
913 #define FAIL3(msg,arg1,arg2) _FAIL(                         \
914     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
915      arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
916 
917 /*
918  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 * (RExC_parse).  'm' is passed through "%s", so it need not be a literal.
919  */
920 #define Simple_vFAIL(m) STMT_START {                                    \
921     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
922             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
923 } STMT_END
924 
925 /*
926  * As written here, just a wrapper around Simple_vFAIL()
927  */
928 #define vFAIL(m) STMT_START {                           \
929     Simple_vFAIL(m);                                    \
930 } STMT_END
931 
932 /*
933  * Like Simple_vFAIL(), but accepts two arguments.  Here and in the
 * higher-numbered variants 'm' is pasted onto REPORT_LOCATION, so it must
 * be a string literal; it is used as the format for the remaining arguments.
934  */
935 #define Simple_vFAIL2(m,a1) STMT_START {                        \
936     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,                \
937                       REPORT_LOCATION_ARGS(RExC_parse));        \
938 } STMT_END
939 
940 /*
941  * As written here, just a wrapper around Simple_vFAIL2().
942  */
943 #define vFAIL2(m,a1) STMT_START {                       \
944     Simple_vFAIL2(m, a1);                               \
945 } STMT_END
946 
947 
948 /*
949  * Like Simple_vFAIL(), but accepts three arguments.
950  */
951 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
952     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,            \
953             REPORT_LOCATION_ARGS(RExC_parse));                  \
954 } STMT_END
955 
956 /*
957  * As written here, just a wrapper around Simple_vFAIL3().
958  */
959 #define vFAIL3(m,a1,a2) STMT_START {                    \
960     Simple_vFAIL3(m, a1, a2);                           \
961 } STMT_END
962 
963 /*
964  * Like Simple_vFAIL(), but accepts four arguments.
965  */
966 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
967     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3,        \
968             REPORT_LOCATION_ARGS(RExC_parse));                  \
969 } STMT_END
970 
/* As written here, just a wrapper around Simple_vFAIL4(). */
971 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
972     Simple_vFAIL4(m, a1, a2, a3);                       \
973 } STMT_END
974 
975 /* A specialized version of vFAIL2 that works with UTF8f.  As written here its
 * expansion is the same as that of Simple_vFAIL2(). */
976 #define vFAIL2utf8f(m, a1) STMT_START {             \
977     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,  \
978             REPORT_LOCATION_ARGS(RExC_parse));      \
979 } STMT_END
980 
/* Likewise for two arguments; same expansion as Simple_vFAIL3(). */
981 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
982     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,  \
983             REPORT_LOCATION_ARGS(RExC_parse));          \
984 } STMT_END
985 
986 /* Setting RExC_copy_start_in_constructed to NULL is a signal to not output
 * warnings.  The previous value is stashed in the single variable
 * RExC_save_copy_start_in_constructed, so a second TURN_OFF before the
 * matching RESTORE would overwrite the saved value. */
987 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
988     STMT_START {                                                            \
989       RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
990       RExC_copy_start_in_constructed = NULL;                                \
991     } STMT_END
/* Put back the value saved by TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE.  This
 * expands to a bare assignment expression (no STMT_START/STMT_END). */
992 #define RESTORE_WARNINGS                                                    \
993     RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
994 
995 /* Since a warning can be generated multiple times as the input is reparsed, we
996  * output it the first time we come to that point in the parse, but suppress it
997  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
998  * generate any warnings.  True only if 'loc', translated to an offset in the
 * input pattern, lies beyond RExC_latest_warn_offset. */
999 #define TO_OUTPUT_WARNINGS(loc)                                         \
1000   (   RExC_copy_start_in_constructed                                    \
1001    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
1002 
1003 /* After we've emitted a warning, we save its position in the input (clamped
1004  * to lie within the pattern, sI..eI) so we don't output it again */
1005 #define UPDATE_WARNINGS_LOC(loc)                                        \
1006     STMT_START {                                                        \
1007         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
1008             RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
1009                                                        - RExC_precomp;  \
1010         }                                                               \
1011     } STMT_END
1012 
1013 /* Common wrapper used by the warning macros below.  It croaks with a panic
 * if warnings are currently turned off (RExC_copy_start_in_constructed is
 * NULL); otherwise, if TO_OUTPUT_WARNINGS(loc) is true, it runs 'code' and
 * then records 'loc' as the latest warning position.
 * 'warns' is the output of the packWARNx macro used in 'code'; it is not
 * referenced in the expansion itself.  'loc' is printed with "%s" in the
 * panic message and is evaluated several times. */
1014 #define _WARN_HELPER(loc, warns, code)                                  \
1015     STMT_START {                                                        \
1016         if (! RExC_copy_start_in_constructed) {                         \
1017             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
1018                               " expected at '%s'",                      \
1019                               __FILE__, __LINE__, loc);                 \
1020         }                                                               \
1021         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
1022             code;                                                       \
1023             UPDATE_WARNINGS_LOC(loc);                                   \
1024         }                                                               \
1025     } STMT_END
1026 
1027 /* m is not necessarily a "literal string", in this macro: it is passed as
 * the argument of a "%s" conversion rather than pasted onto the format. */
1028 #define warn_non_literal_string(loc, packed_warn, m)                    \
1029     _WARN_HELPER(loc, packed_warn,                                      \
1030                       Perl_warner(aTHX_ packed_warn,                    \
1031                                        "%s" REPORT_LOCATION,            \
1032                                   m, REPORT_LOCATION_ARGS(loc)))
/* The same, for the WARN_REGEXP category */
1033 #define reg_warn_non_literal_string(loc, m)                             \
1034                 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
1035 
/* Here 'm' is a run-time format string taking one argument, 'a1'.  Because
 * 'm' may not be a literal it cannot be pasted onto REPORT_LOCATION at
 * compile time; instead the full format is built in a freshly allocated
 * buffer, which is registered with SAVEFREEPV for later release.  'm' is
 * evaluated more than once. */
1036 #define ckWARN2_non_literal_string(loc, packwarn, m, a1)                    \
1037     STMT_START {                                                            \
1038                 char * format;                                              \
1039                 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
1040                 Newx(format, format_size, char);                            \
1041                 my_strlcpy(format, m, format_size);                         \
1042                 my_strlcat(format, REPORT_LOCATION, format_size);           \
1043                 SAVEFREEPV(format);                                         \
1044                 _WARN_HELPER(loc, packwarn,                                 \
1045                       Perl_ck_warner(aTHX_ packwarn,                        \
1046                                         format,                             \
1047                                         a1, REPORT_LOCATION_ARGS(loc)));    \
1048     } STMT_END
1049 
1050 #define ckWARNreg(loc,m)                                                \
1051     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1052                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
1053                                           m REPORT_LOCATION,            \
1054                                           REPORT_LOCATION_ARGS(loc)))
1055 
/* Emit, via Perl_warner(), a WARN_REGEXP warning whose text is the string
 * literal 'm' followed by the marked-up pattern, with the marker at 'loc'.
 * (The last line of the definition deliberately has no trailing backslash,
 * so that the macro ends here and cannot absorb whatever follows it.) */
1056 #define vWARN(loc, m)                                                   \
1057     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1058                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1059                                        m REPORT_LOCATION,               \
1060                                        REPORT_LOCATION_ARGS(loc)))
1061 
/* The remaining positional warning macros all follow one pattern: they wrap
 * a warner call in _WARN_HELPER(), with the string literal 'm' pasted onto
 * REPORT_LOCATION as the format and the marker placed at 'loc'.
 *   - names starting with 'v' call Perl_warner();
 *   - names starting with 'ck' call Perl_ck_warner(), or Perl_ck_warner_d()
 *     for the 'dep', 'regdep' and '_d' variants;
 *   - a digit N in the name means that 'm' is followed by N-1 arguments for
 *     the conversions in 'm';
 *   - 'category' (where present) is the warning category; the others use
 *     WARN_REGEXP. */
1062 #define vWARN_dep(loc,category,m)                                           \
1063     _WARN_HELPER(loc, packWARN(category),                                   \
1064                       Perl_warner(aTHX_ packWARN(category),                 \
1065                                        m REPORT_LOCATION,                   \
1066                                        REPORT_LOCATION_ARGS(loc)))
1067 
1068 #define ckWARNdep(loc,category,m)                                           \
1069     _WARN_HELPER(loc, packWARN(category),                                   \
1070                       Perl_ck_warner_d(aTHX_ packWARN(category),            \
1071                                             m REPORT_LOCATION,              \
1072                                             REPORT_LOCATION_ARGS(loc)))
1073 
/* Uses both 'category' and WARN_REGEXP, combined with packWARN2() */
1074 #define ckWARNregdep(loc,category,m)                                        \
1075     _WARN_HELPER(loc, packWARN2(category, WARN_REGEXP),                     \
1076                       Perl_ck_warner_d(aTHX_ packWARN2(category,            \
1077                                                       WARN_REGEXP),         \
1078                                              m REPORT_LOCATION,             \
1079                                              REPORT_LOCATION_ARGS(loc)))
1080 
1081 #define ckWARN2reg_d(loc,m, a1)                                             \
1082     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1083                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
1084                                             m REPORT_LOCATION,              \
1085                                             a1, REPORT_LOCATION_ARGS(loc)))
1086 
1087 #define ckWARN2reg(loc, m, a1)                                              \
1088     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1089                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1090                                           m REPORT_LOCATION,                \
1091                                           a1, REPORT_LOCATION_ARGS(loc)))
1092 
1093 #define vWARN3(loc, m, a1, a2)                                              \
1094     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1095                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
1096                                        m REPORT_LOCATION,                   \
1097                                        a1, a2, REPORT_LOCATION_ARGS(loc)))
1098 
1099 #define ckWARN3reg(loc, m, a1, a2)                                          \
1100     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1101                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1102                                           m REPORT_LOCATION,                \
1103                                           a1, a2,                           \
1104                                           REPORT_LOCATION_ARGS(loc)))
1105 
1106 #define vWARN4(loc, m, a1, a2, a3)                                      \
1107     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1108                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1109                                        m REPORT_LOCATION,               \
1110                                        a1, a2, a3,                      \
1111                                        REPORT_LOCATION_ARGS(loc)))
1112 
1113 #define ckWARN4reg(loc, m, a1, a2, a3)                                  \
1114     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1115                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
1116                                           m REPORT_LOCATION,            \
1117                                           a1, a2, a3,                   \
1118                                           REPORT_LOCATION_ARGS(loc)))
1119 
1120 #define vWARN5(loc, m, a1, a2, a3, a4)                                  \
1121     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1122                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1123                                        m REPORT_LOCATION,               \
1124                                        a1, a2, a3, a4,                  \
1125                                        REPORT_LOCATION_ARGS(loc)))
1126 
/* Warn, at most once per compilation, in warning category 'class'.  'class'
 * is also token-pasted onto "RExC_warned_" to name the flag that remembers
 * that the warning has been dealt with.  The flag is set before
 * _WARN_HELPER() runs, so it is set even if the helper then decides not to
 * output anything for 'loc'. */
1127 #define ckWARNexperimental(loc, class, m)                               \
1128     STMT_START {                                                        \
1129         if (! RExC_warned_ ## class) { /* warn once per compilation */  \
1130             RExC_warned_ ## class = 1;                                  \
1131             _WARN_HELPER(loc, packWARN(class),                          \
1132                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
1133                                             m REPORT_LOCATION,          \
1134                                             REPORT_LOCATION_ARGS(loc)));\
1135         }                                                               \
1136     } STMT_END
1137 
/* The same, where the string literal 'm' takes one argument, 'arg' */
1138 #define ckWARNexperimental_with_arg(loc, class, m, arg)                 \
1139     STMT_START {                                                        \
1140         if (! RExC_warned_ ## class) { /* warn once per compilation */  \
1141             RExC_warned_ ## class = 1;                                  \
1142             _WARN_HELPER(loc, packWARN(class),                          \
1143                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
1144                                        m REPORT_LOCATION,               \
1145                                        arg, REPORT_LOCATION_ARGS(loc)));\
1146         }                                                               \
1147     } STMT_END
1148 
1149 /* Convert between a pointer to a node and its offset from the beginning of the
1150  * program */
1151 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
1152 #define REGNODE_OFFSET(node) (__ASSERT_((node) >= RExC_emit_start)      \
1153                               (SSize_t) ((node) - RExC_emit_start))
1154 
/* Read / set the 'proglen' member of the structure that 'ri' points to.
 * Both the parameters and the whole expansions are parenthesized, so that
 * 'ri' and 'x' may be arbitrary expressions and the macros can be used inside
 * larger expressions.  ProgLen(ri) is still an lvalue. */
1155 #define ProgLen(ri) ((ri)->proglen)
1156 #define SetProgLen(ri,x) ((ri)->proglen = (x))
1157 
1158 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1159 #define EXPERIMENTAL_INPLACESCAN
1160 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1161 
/* Debugging aid: inside DEBUG_OPTIMISE_MORE_r(), print "RExC_seen: " followed
 * by the name of every REG_*_SEEN bit tested below that is set in RExC_seen,
 * and then a newline.
 * NOTE(review): the definition itself ends in ';', so the usual call
 * "DEBUG_RExC_seen();" expands to an extra empty statement; take care when
 * using it as the unbraced body of an if/else. */
1162 #define DEBUG_RExC_seen()                                                   \
1163         DEBUG_OPTIMISE_MORE_r({                                             \
1164             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1165                                                                             \
1166             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1167                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1168                                                                             \
1169             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1170                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1171                                                                             \
1172             if (RExC_seen & REG_GPOS_SEEN)                                  \
1173                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1174                                                                             \
1175             if (RExC_seen & REG_RECURSE_SEEN)                               \
1176                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1177                                                                             \
1178             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1179                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1180                                                                             \
1181             if (RExC_seen & REG_VERBARG_SEEN)                               \
1182                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1183                                                                             \
1184             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1185                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1186                                                                             \
1187             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1188                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1189                                                                             \
1190             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1191                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1192                                                                             \
1193             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1194                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1195                                                                             \
1196             if (RExC_seen & REG_PESSIMIZE_SEEN)                             \
1197                 Perl_re_printf( aTHX_ "REG_PESSIMIZE_SEEN ");               \
1198                                                                             \
1199             Perl_re_printf( aTHX_ "\n");                                    \
1200         });
1201 
/* If any bit of `flag` is set in `flags`, print the source spelling of
 * `flag` (via stringification) followed by a single space.
 *
 * The body is wrapped in STMT_START/STMT_END so the macro behaves as one
 * statement: an `else` written after an invocation can no longer bind to
 * the macro's internal `if`.  `flag` is parenthesized so that a compound
 * argument such as A|B is tested as a unit.  Invoke with a trailing
 * semicolon. */
#define DEBUG_SHOW_STUDY_FLAG(flags,flag)                                   \
    STMT_START {                                                            \
        if ((flags) & (flag))                                               \
            Perl_re_printf( aTHX_  "%s ", #flag);                           \
    } STMT_END
1204 
1205 
/* Tracing hooks for the study phase.
 *
 * Without DEBUGGING both macros expand to NOOP and none of their arguments
 * are evaluated.  With DEBUGGING they forward to debug_studydata() and
 * debug_peep() respectively; DEBUG_PEEP additionally passes `pRExC_state`,
 * which must therefore be visible at the point of use. */
#ifndef DEBUGGING
#  define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP
#  define DEBUG_PEEP(str, scan, depth, flags)                              NOOP
#else
#  define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \
        debug_studydata(where, data, depth, is_inf, min, stopmin, delta)
#  define DEBUG_PEEP(str, scan, depth, flags) \
        debug_peep(str, pRExC_state, scan, depth, flags)
#endif
1216 
/* Shorthands for regtail()-style calls.  Both pass `depth+1` as the final
 * argument, so a variable named `depth` must be in scope wherever they are
 * used.
 *
 * REGTAIL_STUDY calls regtail_study() in DEBUGGING builds; otherwise it is
 * the same call as REGTAIL. */
#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)

#ifndef DEBUGGING
#  define REGTAIL_STUDY(x,y,z) REGTAIL(x,y,z)
#else
#  define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
#endif
1223 
/* Three distinct single-bit values, so they can be combined with `|` and
 * tested with `&`.
 * NOTE(review): presumably these report what kind of trie an optimisation
 * pass produced -- confirm against the code that returns them. */
#define MADE_TRIE       0x1
#define MADE_JUMP_TRIE  0x2
#define MADE_EXACT_TRIE 0x4
1227 
/* Consecutive zero-based slot numbers.
 * NOTE(review): presumably positions within a per-character-class data
 * array (inversion list, locale-only matches, deferred user-defined
 * properties) -- confirm against the code that indexes with them. */
#define INVLIST_INDEX                0
#define ONLY_LOCALE_MATCHES_INDEX    1
#define DEFERRED_USER_DEFINED_INDEX  2
1231 
/* ssc_init_zero is currently just another name for ssc_init: the two
 * operations do exactly the same thing. */
#define ssc_init_zero ssc_init

/* Add the single code point `cp` to `ssc`, expressed as the one-element
 * range cp..cp.  `cp` appears twice in the expansion, so do not pass an
 * expression with side effects. */
#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))

/* Add the whole range 0..UV_MAX to `ssc`. */
#define ssc_match_all_cp(ssc) ssc_add_range((ssc), 0, UV_MAX)
1237 
/* Front end for the regnode_guts*() helpers.
 *
 * In non-DEBUGGING builds the `op` argument is dropped (and so never
 * evaluated) and regnode_guts(state, extra_size) is called.  In DEBUGGING
 * builds all three arguments go to regnode_guts_debug(). */
#ifndef DEBUGGING
#  define REGNODE_GUTS(state,op,extra_size) \
        regnode_guts(state,extra_size)
#else
#  define REGNODE_GUTS(state,op,extra_size) \
        regnode_guts_debug(state,op,extra_size)
#endif
1245 
/* If `optstart` is non-NULL, emit (through DEBUG_OPTIMISE_r) the distance
 * from `optstart` to `node` as " (N nodes)\n", then reset `optstart` to
 * NULL.  Both `optstart` and `node` are taken from the invoking scope.
 *
 * The `if` lives inside the STMT_START/STMT_END wrapper so the whole macro
 * is a single statement; an `else` following an invocation cannot attach
 * to the internal test.  Invoke with a trailing semicolon. */
#define CLEAR_OPTSTART                                                          \
    STMT_START {                                                                \
        if (optstart) {                                                         \
            DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                              \
                              " (%" IVdf " nodes)\n", (IV)(node - optstart)));  \
            optstart=NULL;                                                      \
        }                                                                       \
    } STMT_END
1252 
/* Run CLEAR_OPTSTART, then call dumpuntil() on the span (b)..(e) and store
 * its result in `node`.  Besides `node`, the expansion uses `r`, `start`,
 * `last`, `sv`, `indent` and `depth` from the invoking scope; `indent` and
 * `depth` are each passed incremented by one.
 *
 * Wrapped in STMT_START/STMT_END so both actions form one statement: under
 * an unbraced `if` the dumpuntil() call is now guarded too.  The macro no
 * longer carries its own trailing semicolon, so write `DUMPUNTIL(b, e);`. */
#define DUMPUNTIL(b,e)                                              \
    STMT_START {                                                    \
        CLEAR_OPTSTART;                                             \
        node = dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); \
    } STMT_END
1256 
/* Store into NEXT_OFF(REGNODE_p(ret)) the combined size of types `t1` and
 * `t2`, measured in units of sizeof(regnode).  The division is integral,
 * so any remainder is discarded. */
#define REGNODE_STEP_OVER(ret,t1,t2)                        \
    NEXT_OFF(REGNODE_p(ret)) =                              \
        ((sizeof(t1) + sizeof(t2)) / sizeof(regnode))
1259 
1260 #endif /* REGCOMP_INTERNAL_H */
1261