xref: /openbsd/gnu/usr.bin/perl/regcomp_internal.h (revision f2a19305)
1*f2a19305Safresh1 #ifndef REGCOMP_INTERNAL_H
2*f2a19305Safresh1 #define REGCOMP_INTERNAL_H
/* Build-time defaults: each macro below is defined here only if it has
 * not already been defined before this header is included. */
3*f2a19305Safresh1 #ifndef STATIC
4*f2a19305Safresh1 #define STATIC  static
5*f2a19305Safresh1 #endif
/* NOTE(review): judging by the names, the next two switches presumably
 * enable the CURLYX-to-CURLYM and CURLYX-to-CURLYN optimizations; both
 * default to 1 -- confirm against the code that tests them. */
6*f2a19305Safresh1 #ifndef RE_OPTIMIZE_CURLYX_TO_CURLYM
7*f2a19305Safresh1 #define RE_OPTIMIZE_CURLYX_TO_CURLYM 1
8*f2a19305Safresh1 #endif
9*f2a19305Safresh1 #ifndef RE_OPTIMIZE_CURLYX_TO_CURLYN
10*f2a19305Safresh1 #define RE_OPTIMIZE_CURLYX_TO_CURLYN 1
11*f2a19305Safresh1 #endif
12*f2a19305Safresh1 
13*f2a19305Safresh1 /* This is a chain of data about sub patterns we are processing that
14*f2a19305Safresh1    need to be handled separately/specially in study_chunk. It exists so
15*f2a19305Safresh1    we can simulate recursion without losing state.  */
16*f2a19305Safresh1 struct scan_frame;
17*f2a19305Safresh1 typedef struct scan_frame {
18*f2a19305Safresh1     regnode *last_regnode;      /* last node to process in this frame */
19*f2a19305Safresh1     regnode *next_regnode;      /* next node to process when last is reached */
20*f2a19305Safresh1     U32 prev_recursed_depth;    /* NOTE(review): presumably the recursion
                                   depth in effect before this frame was
                                   entered -- confirm in study_chunk */
21*f2a19305Safresh1     I32 stopparen;              /* what stopparen do we use */
22*f2a19305Safresh1     bool in_gosub;              /* this or an outer frame is for GOSUB */
23*f2a19305Safresh1 
24*f2a19305Safresh1     struct scan_frame *this_prev_frame; /* this previous frame
                                           NOTE(review): how this differs from
                                           prev_frame is not evident from this
                                           header -- confirm in study_chunk */
25*f2a19305Safresh1     struct scan_frame *prev_frame;      /* previous frame */
26*f2a19305Safresh1     struct scan_frame *next_frame;      /* next frame */
27*f2a19305Safresh1 } scan_frame;
28*f2a19305Safresh1 
29*f2a19305Safresh1 /* Certain characters are output as a sequence with the first being a
30*f2a19305Safresh1  * backslash.  The set checked here, taken from the string literal below,
 * is: hyphen, '[', ']', backslash and '^'.  memCHRs is defined elsewhere;
 * presumably it is a membership test of c against that string. */
31*f2a19305Safresh1 #define isBACKSLASHED_PUNCT(c)  memCHRs("-[]\\^", c)
32*f2a19305Safresh1 
33*f2a19305Safresh1 
/* State carried through a single regex compilation.  The RExC_* accessor
 * macros in this header expand to members of this struct through a pointer
 * named pRExC_state. */
34*f2a19305Safresh1 struct RExC_state_t {
35*f2a19305Safresh1     U32         flags;                  /* RXf_* are we folding, multilining? */
36*f2a19305Safresh1     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
37*f2a19305Safresh1     char        *precomp;               /* uncompiled string. */
38*f2a19305Safresh1     char        *precomp_end;           /* pointer to end of uncompiled string. */
39*f2a19305Safresh1     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
40*f2a19305Safresh1     regexp      *rx;                    /* perl core regexp structure */
41*f2a19305Safresh1     regexp_internal     *rxi;           /* internal data for regexp object
42*f2a19305Safresh1                                            pprivate field */
43*f2a19305Safresh1     char        *start;                 /* Start of input for compile */
44*f2a19305Safresh1     char        *end;                   /* End of input for compile */
45*f2a19305Safresh1     char        *parse;                 /* Input-scan pointer. */
46*f2a19305Safresh1     char        *copy_start;            /* start of copy of input within
47*f2a19305Safresh1                                            constructed parse string */
48*f2a19305Safresh1     char        *save_copy_start;       /* Provides one level of saving
49*f2a19305Safresh1                                            and restoring 'copy_start' */
50*f2a19305Safresh1     char        *copy_start_in_input;   /* Position in input string
51*f2a19305Safresh1                                            corresponding to copy_start */
52*f2a19305Safresh1     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
53*f2a19305Safresh1     regnode     *emit_start;            /* Start of emitted-code area */
54*f2a19305Safresh1     regnode_offset emit;                /* Code-emit pointer */
55*f2a19305Safresh1     I32         naughty;                /* How bad is this pattern? */
56*f2a19305Safresh1     I32         sawback;                /* Did we see \1, ...? */
57*f2a19305Safresh1     SSize_t     size;                   /* Number of regnode equivalents in
58*f2a19305Safresh1                                            pattern */
59*f2a19305Safresh1     Size_t      sets_depth;              /* Counts recursion depth of already-
60*f2a19305Safresh1                                            compiled regex set patterns */
61*f2a19305Safresh1     U32         seen;
62*f2a19305Safresh1 
63*f2a19305Safresh1     I32      parens_buf_size;           /* #slots malloced open/close_parens */
64*f2a19305Safresh1     regnode_offset *open_parens;        /* offsets to open parens */
65*f2a19305Safresh1     regnode_offset *close_parens;       /* offsets to close parens */
66*f2a19305Safresh1     HV          *paren_names;           /* Paren names */
67*f2a19305Safresh1 
68*f2a19305Safresh1     /* position beyond 'precomp' of the warning message furthest away from
69*f2a19305Safresh1      * 'precomp'.  During the parse, no warnings are raised for any problems
70*f2a19305Safresh1      * earlier in the parse than this position.  This works if warnings are
71*f2a19305Safresh1      * raised the first time a given spot is parsed, and if only one
72*f2a19305Safresh1      * independent warning is raised for any given spot */
73*f2a19305Safresh1     Size_t      latest_warn_offset;
74*f2a19305Safresh1 
75*f2a19305Safresh1     /* Branch reset /(?|...|...)/ gives us two concepts of capture buffer id.
76*f2a19305Safresh1      * "Logical Parno" is the user visible view with branch reset taken into
77*f2a19305Safresh1      * account. "Parno" (or physical parno) is the actual capture buffers in
78*f2a19305Safresh1      * the pattern *NOT* taking into account branch reset. We also maintain
79*f2a19305Safresh1      * a map of "next" pointers which allow us to skip to the next physical
80*f2a19305Safresh1      * capture buffer with the same logical id, with 0 representing "none".
81*f2a19305Safresh1      *
82*f2a19305Safresh1      * As we compile we keep track of the two different counts using the
83*f2a19305Safresh1      * 'logical_npar' and 'npar' members, and we keep track of the upper bound
84*f2a19305Safresh1      * of both in 'total_par' and 'logical_total_par', we also populate
85*f2a19305Safresh1      * the 'logical_to_parno' map, which gives us the first physical parno
86*f2a19305Safresh1      * for a given logical parno, and the `parno_to_logical` array which gives
87*f2a19305Safresh1      * us the logical id for each physical parno. When compilation is
88*f2a19305Safresh1      * completed we construct the 'parno_to_logical_next' array from the
89*f2a19305Safresh1      * 'parno_to_logical' array. (We do not bother constructing it during
90*f2a19305Safresh1      * compilation as we do not need it, and we can construct it in O(N) time
91*f2a19305Safresh1      * once we are done, but would need more complicated logic during the
92*f2a19305Safresh1      * compile, because we want the next pointers to go from smallest to
93*f2a19305Safresh1      * largest, eg, left to right.)
94*f2a19305Safresh1      *
95*f2a19305Safresh1      * Logical: $1      $2  $3  $4    $2  $3    $2    $5
96*f2a19305Safresh1      * Physical: 1       2   3   4     5   6     7     8
97*f2a19305Safresh1      * Next:     0       5   6   0     7   0     0     0
98*f2a19305Safresh1      * Pattern /(a) (?| (b) (c) (d) | (e) (f) | (g) ) (h)/
99*f2a19305Safresh1      *
100*f2a19305Safresh1      * As much as possible the internals use and store the physical id of
101*f2a19305Safresh1      * capture buffers. We decode the physical to the logical only when
102*f2a19305Safresh1      * we need to, for instance when someone uses $2.
103*f2a19305Safresh1      *
104*f2a19305Safresh1      * Note that when branch reset is not used logical and physical are the
105*f2a19305Safresh1      * same and the next data would be all zero. So when branch reset is not
106*f2a19305Safresh1      * used we do not need to populate this data into the final regexp.
107*f2a19305Safresh1      *
108*f2a19305Safresh1      */
109*f2a19305Safresh1     I32         *logical_to_parno;        /* logical_parno to parno */
110*f2a19305Safresh1     I32         *parno_to_logical;        /* parno to logical_parno */
111*f2a19305Safresh1     I32         *parno_to_logical_next;   /* parno to next (greater value)
112*f2a19305Safresh1                                              parno with the same
113*f2a19305Safresh1                                              logical_parno as parno.*/
114*f2a19305Safresh1 
115*f2a19305Safresh1     I32         npar;                   /* Capture buffer count so far in the
116*f2a19305Safresh1                                            parse, (OPEN) plus one. ("par" 0 is
117*f2a19305Safresh1                                            the whole pattern)*/
118*f2a19305Safresh1     I32         logical_npar;           /* Logical version of npar */
119*f2a19305Safresh1     I32         total_par;              /* During initial parse, is either 0,
120*f2a19305Safresh1                                            or -1; the latter indicating a
121*f2a19305Safresh1                                            reparse is needed.  After that pass,
122*f2a19305Safresh1                                            it is what 'npar' became after the
123*f2a19305Safresh1                                            pass.  Hence, it being > 0 indicates
124*f2a19305Safresh1                                            we are in a reparse situation */
125*f2a19305Safresh1     I32         logical_total_par;      /* Logical version of total_par */
126*f2a19305Safresh1     I32         nestroot;               /* root parens we are in - used by
127*f2a19305Safresh1                                            accept */
128*f2a19305Safresh1     I32         seen_zerolen;
129*f2a19305Safresh1     regnode     *end_op;                /* END node in program */
130*f2a19305Safresh1     I32         utf8;           /* whether the pattern is utf8 or not */
131*f2a19305Safresh1     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
132*f2a19305Safresh1                                 /* XXX use this for future optimisation of case
133*f2a19305Safresh1                                  * where pattern must be upgraded to utf8. */
134*f2a19305Safresh1     I32         uni_semantics;  /* If a d charset modifier should use unicode
135*f2a19305Safresh1                                    rules, even if the pattern is not in
136*f2a19305Safresh1                                    utf8 */
137*f2a19305Safresh1 
138*f2a19305Safresh1     I32         recurse_count;          /* Number of recurse regops we have generated */
139*f2a19305Safresh1     regnode     **recurse;              /* Recurse regops */
140*f2a19305Safresh1     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
141*f2a19305Safresh1                                            through */
142*f2a19305Safresh1     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
143*f2a19305Safresh1     I32         in_lookaround;
144*f2a19305Safresh1     I32         contains_locale;
145*f2a19305Safresh1     I32         override_recoding;
146*f2a19305Safresh1     I32         recode_x_to_native;
147*f2a19305Safresh1     I32         in_multi_char_class;
148*f2a19305Safresh1     int         code_index;             /* next code_blocks[] slot */
149*f2a19305Safresh1     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
150*f2a19305Safresh1                                             within pattern */
151*f2a19305Safresh1     SSize_t     maxlen;                        /* minimum possible number of chars in string to match.
                                           NOTE(review): the field is named
                                           "maxlen", so "minimum" may be a
                                           copy/paste slip for "maximum" --
                                           confirm against the code that sets
                                           it */
152*f2a19305Safresh1     scan_frame *frame_head;
153*f2a19305Safresh1     scan_frame *frame_last;
154*f2a19305Safresh1     U32         frame_count;
155*f2a19305Safresh1     AV         *warn_text;
156*f2a19305Safresh1     HV         *unlexed_names;
157*f2a19305Safresh1     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
158*f2a19305Safresh1 #ifdef DEBUGGING
159*f2a19305Safresh1     const char  *lastparse;
160*f2a19305Safresh1     I32         lastnum;
161*f2a19305Safresh1     U32         study_chunk_recursed_count;
162*f2a19305Safresh1     AV          *paren_name_list;       /* idx -> name */
163*f2a19305Safresh1     SV          *mysv1;
164*f2a19305Safresh1     SV          *mysv2;
165*f2a19305Safresh1 #endif
166*f2a19305Safresh1     bool        seen_d_op;
167*f2a19305Safresh1     bool        strict;
168*f2a19305Safresh1     bool        study_started;
169*f2a19305Safresh1     bool        in_script_run;
170*f2a19305Safresh1     bool        use_BRANCHJ;
171*f2a19305Safresh1     bool        sWARN_EXPERIMENTAL__VLB;
172*f2a19305Safresh1     bool        sWARN_EXPERIMENTAL__REGEX_SETS;
173*f2a19305Safresh1 };
174*f2a19305Safresh1 
/* Accessors for the RExC_state_t members that exist only in DEBUGGING
 * builds.  Each needs a pointer named pRExC_state in scope.  Note that
 * RExC_mysv and RExC_mysv1 both expand to the 'mysv1' member. */
175*f2a19305Safresh1 #ifdef DEBUGGING
176*f2a19305Safresh1 #define RExC_lastparse  (pRExC_state->lastparse)
177*f2a19305Safresh1 #define RExC_lastnum    (pRExC_state->lastnum)
178*f2a19305Safresh1 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
179*f2a19305Safresh1 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
180*f2a19305Safresh1 #define RExC_mysv       (pRExC_state->mysv1)
181*f2a19305Safresh1 #define RExC_mysv1      (pRExC_state->mysv1)
182*f2a19305Safresh1 #define RExC_mysv2      (pRExC_state->mysv2)
183*f2a19305Safresh1 #endif
184*f2a19305Safresh1 
/* Accessor macros for RExC_state_t.  Each expands to a parenthesized
 * member access through a pointer named pRExC_state, which must therefore
 * be in scope wherever one of these is used.
 *
 * Most accessors are named after the member they reach.  The exceptions
 * in this group are:
 *   RExC_copy_start_in_constructed       -> copy_start
 *   RExC_save_copy_start_in_constructed  -> save_copy_start
 *   RExC_logical_total_parens            -> logical_total_par
 *   RExC_total_parens                    -> total_par
 */
185*f2a19305Safresh1 #define RExC_flags      (pRExC_state->flags)
186*f2a19305Safresh1 #define RExC_pm_flags   (pRExC_state->pm_flags)
187*f2a19305Safresh1 #define RExC_precomp    (pRExC_state->precomp)
188*f2a19305Safresh1 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
189*f2a19305Safresh1 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
190*f2a19305Safresh1 #define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
191*f2a19305Safresh1 #define RExC_precomp_end (pRExC_state->precomp_end)
192*f2a19305Safresh1 #define RExC_rx_sv      (pRExC_state->rx_sv)
193*f2a19305Safresh1 #define RExC_rx         (pRExC_state->rx)
194*f2a19305Safresh1 #define RExC_rxi        (pRExC_state->rxi)
195*f2a19305Safresh1 #define RExC_start      (pRExC_state->start)
196*f2a19305Safresh1 #define RExC_end        (pRExC_state->end)
197*f2a19305Safresh1 #define RExC_parse      (pRExC_state->parse)
198*f2a19305Safresh1 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
199*f2a19305Safresh1 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
200*f2a19305Safresh1 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
201*f2a19305Safresh1                                                    under /d from /u ? */
202*f2a19305Safresh1 
203*f2a19305Safresh1 #define RExC_emit       (pRExC_state->emit)
204*f2a19305Safresh1 #define RExC_emit_start (pRExC_state->emit_start)
205*f2a19305Safresh1 #define RExC_sawback    (pRExC_state->sawback)
206*f2a19305Safresh1 #define RExC_seen       (pRExC_state->seen)
207*f2a19305Safresh1 #define RExC_size       (pRExC_state->size)
208*f2a19305Safresh1 #define RExC_maxlen        (pRExC_state->maxlen)
209*f2a19305Safresh1 #define RExC_logical_npar           (pRExC_state->logical_npar)
210*f2a19305Safresh1 #define RExC_logical_total_parens   (pRExC_state->logical_total_par)
211*f2a19305Safresh1 #define RExC_logical_to_parno       (pRExC_state->logical_to_parno)
212*f2a19305Safresh1 #define RExC_parno_to_logical       (pRExC_state->parno_to_logical)
213*f2a19305Safresh1 #define RExC_parno_to_logical_next  (pRExC_state->parno_to_logical_next)
214*f2a19305Safresh1 #define RExC_npar       (pRExC_state->npar)
215*f2a19305Safresh1 #define RExC_total_parens       (pRExC_state->total_par)
216*f2a19305Safresh1 #define RExC_parens_buf_size    (pRExC_state->parens_buf_size)
217*f2a19305Safresh1 #define RExC_nestroot   (pRExC_state->nestroot)
218*f2a19305Safresh1 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
219*f2a19305Safresh1 #define RExC_utf8       (pRExC_state->utf8)
220*f2a19305Safresh1 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
221*f2a19305Safresh1 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
222*f2a19305Safresh1 #define RExC_open_parens        (pRExC_state->open_parens)
223*f2a19305Safresh1 #define RExC_close_parens       (pRExC_state->close_parens)
224*f2a19305Safresh1 #define RExC_end_op     (pRExC_state->end_op)
225*f2a19305Safresh1 #define RExC_paren_names        (pRExC_state->paren_names)
226*f2a19305Safresh1 #define RExC_recurse    (pRExC_state->recurse)
227*f2a19305Safresh1 #define RExC_recurse_count      (pRExC_state->recurse_count)
228*f2a19305Safresh1 #define RExC_sets_depth         (pRExC_state->sets_depth)
229*f2a19305Safresh1 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
230*f2a19305Safresh1 #define RExC_study_chunk_recursed_bytes  \
231*f2a19305Safresh1                                    (pRExC_state->study_chunk_recursed_bytes)
232*f2a19305Safresh1 #define RExC_in_lookaround      (pRExC_state->in_lookaround)
233*f2a19305Safresh1 #define RExC_contains_locale    (pRExC_state->contains_locale)
234*f2a19305Safresh1 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
235*f2a19305Safresh1 
/* SET_recode_x_to_native(x)
 *
 * When EBCDIC is defined, stores x into RExC_recode_x_to_native.  Otherwise
 * the macro expands to NOOP and x does not appear in the expansion, so any
 * side effects in x are not performed. */
236*f2a19305Safresh1 #ifdef EBCDIC
237*f2a19305Safresh1 #  define SET_recode_x_to_native(x)                                         \
238*f2a19305Safresh1                     STMT_START { RExC_recode_x_to_native = (x); } STMT_END
239*f2a19305Safresh1 #else
240*f2a19305Safresh1 #  define SET_recode_x_to_native(x) NOOP
241*f2a19305Safresh1 #endif
242*f2a19305Safresh1 
/* More RExC_state_t accessors; each needs a pointer named pRExC_state in
 * scope.  The two RExC_warned_WARN_EXPERIMENTAL__* accessors reach the
 * members whose names start with 'sWARN_EXPERIMENTAL__'. */
243*f2a19305Safresh1 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
244*f2a19305Safresh1 #define RExC_frame_head (pRExC_state->frame_head)
245*f2a19305Safresh1 #define RExC_frame_last (pRExC_state->frame_last)
246*f2a19305Safresh1 #define RExC_frame_count (pRExC_state->frame_count)
247*f2a19305Safresh1 #define RExC_strict (pRExC_state->strict)
248*f2a19305Safresh1 #define RExC_study_started      (pRExC_state->study_started)
249*f2a19305Safresh1 #define RExC_warn_text (pRExC_state->warn_text)
250*f2a19305Safresh1 #define RExC_in_script_run      (pRExC_state->in_script_run)
251*f2a19305Safresh1 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
252*f2a19305Safresh1 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
253*f2a19305Safresh1 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
254*f2a19305Safresh1 #define RExC_unlexed_names (pRExC_state->unlexed_names)
255*f2a19305Safresh1 
256*f2a19305Safresh1 
257*f2a19305Safresh1 /***********************************************************************/
258*f2a19305Safresh1 /* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse
259*f2a19305Safresh1  *
260*f2a19305Safresh1  * All of these macros depend on the above RExC_ accessor macros, which
261*f2a19305Safresh1  * in turn depend on a variable pRExC_state being in scope where they
262*f2a19305Safresh1  * are used. This is the standard regexp parser context variable which is
263*f2a19305Safresh1  * passed into every non-trivial parse function in this file.
264*f2a19305Safresh1  *
265*f2a19305Safresh1  * Note that the UTF macro is itself a wrapper around RExC_utf8, so all
266*f2a19305Safresh1  * of the macros which do not take an argument will operate on the
267*f2a19305Safresh1  * pRExC_state structure *only*.
268*f2a19305Safresh1  *
269*f2a19305Safresh1  * Please do NOT modify RExC_parse without using these macros. In the
270*f2a19305Safresh1  * future these macros will be extended for enhanced debugging and trace
271*f2a19305Safresh1  * output during the parse process.
272*f2a19305Safresh1  */
273*f2a19305Safresh1 
274*f2a19305Safresh1 /* RExC_parse_incf(flag)
275*f2a19305Safresh1  *
276*f2a19305Safresh1  * Increment RExC_parse to point at the next codepoint, while doing
277*f2a19305Safresh1  * the right thing depending on whether we are parsing UTF-8 strings
278*f2a19305Safresh1  * or not. The 'flag' argument determines if content is UTF-8 or not,
279*f2a19305Safresh1  * intended for cases where this is NOT governed by the UTF macro.
280*f2a19305Safresh1  *
281*f2a19305Safresh1  * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro.
282*f2a19305Safresh1  *
283*f2a19305Safresh1  * WARNING: Does NOT take into account RExC_end; it is the caller's
284*f2a19305Safresh1  * responsibility to make sure there are enough octets left in
285*f2a19305Safresh1  * RExC_parse to ensure that when processing UTF-8 we would not read
286*f2a19305Safresh1  * past the end of the string.
287*f2a19305Safresh1  */
288*f2a19305Safresh1 #define RExC_parse_incf(flag) STMT_START {              \
289*f2a19305Safresh1     RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1;    \
290*f2a19305Safresh1 } STMT_END
291*f2a19305Safresh1 
292*f2a19305Safresh1 /* RExC_parse_inc_safef(flag)
293*f2a19305Safresh1  *
294*f2a19305Safresh1  * Safely increment RExC_parse to point at the next codepoint,
295*f2a19305Safresh1  * doing the right thing depending on whether we are parsing
296*f2a19305Safresh1  * UTF-8 strings or not and NOT reading past the end of the buffer.
297*f2a19305Safresh1  * The 'flag' argument determines if content is UTF-8 or not,
298*f2a19305Safresh1  * intended for cases where this is NOT governed by the UTF macro.
299*f2a19305Safresh1  *
300*f2a19305Safresh1  * Use RExC_parse_inc_safe() if UTF-8ness is controlled by the UTF macro.
301*f2a19305Safresh1  *
302*f2a19305Safresh1  * NOTE: Will NOT read past RExC_end when content is UTF-8.
303*f2a19305Safresh1  */
304*f2a19305Safresh1 #define RExC_parse_inc_safef(flag) STMT_START {                     \
305*f2a19305Safresh1     RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \
306*f2a19305Safresh1 } STMT_END
307*f2a19305Safresh1 
308*f2a19305Safresh1 /* RExC_parse_inc()
309*f2a19305Safresh1  *
310*f2a19305Safresh1  * Increment RExC_parse to point at the next codepoint,
311*f2a19305Safresh1  * doing the right thing depending on whether we are parsing
312*f2a19305Safresh1  * UTF-8 strings or not.
313*f2a19305Safresh1  *
314*f2a19305Safresh1  * WARNING: Does NOT take into account RExC_end; it is the caller's
315*f2a19305Safresh1  * responsibility to make sure there are enough octets left in
316*f2a19305Safresh1  * RExC_parse to ensure that when processing UTF-8 we would not read
317*f2a19305Safresh1  * past the end of the string.
318*f2a19305Safresh1  *
319*f2a19305Safresh1  * NOTE: whether we are parsing UTF-8 or not is determined by the
320*f2a19305Safresh1  * UTF macro which is defined as cBOOL(RExC_utf8), thus this
321*f2a19305Safresh1  * macro operates on the pRExC_state structure only.
322*f2a19305Safresh1  */
323*f2a19305Safresh1 #define RExC_parse_inc() RExC_parse_incf(UTF)
324*f2a19305Safresh1 
325*f2a19305Safresh1 /* RExC_parse_inc_safe()
326*f2a19305Safresh1  *
327*f2a19305Safresh1  * Safely increment RExC_parse to point at the next codepoint,
328*f2a19305Safresh1  * doing the right thing depending on whether we are parsing
329*f2a19305Safresh1  * UTF-8 strings or not and NOT reading past the end of the buffer.
330*f2a19305Safresh1  *
331*f2a19305Safresh1  * NOTE: whether we are parsing UTF-8 or not is determined by the
332*f2a19305Safresh1  * UTF macro which is defined as cBOOL(RExC_utf8), thus this
333*f2a19305Safresh1  * macro operates on the pRExC_state structure only.
334*f2a19305Safresh1  */
335*f2a19305Safresh1 #define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF)
336*f2a19305Safresh1 
337*f2a19305Safresh1 /* RExC_parse_inc_utf8()
338*f2a19305Safresh1  *
339*f2a19305Safresh1  * Increment RExC_parse to point at the next utf8 codepoint,
340*f2a19305Safresh1  * assumes content is UTF-8.
341*f2a19305Safresh1  *
342*f2a19305Safresh1  * WARNING: Does NOT take into account RExC_end; it is the caller's
343*f2a19305Safresh1  * responsibility to make sure there are enough octets left in RExC_parse
344*f2a19305Safresh1  * to ensure that when processing UTF-8 we would not read past the end
345*f2a19305Safresh1  * of the string.
346*f2a19305Safresh1  */
347*f2a19305Safresh1 #define RExC_parse_inc_utf8() STMT_START {  \
348*f2a19305Safresh1     RExC_parse += UTF8SKIP(RExC_parse);     \
349*f2a19305Safresh1 } STMT_END
350*f2a19305Safresh1 
351*f2a19305Safresh1 /* RExC_parse_inc_if_char()
352*f2a19305Safresh1  *
353*f2a19305Safresh1  * Increment RExC_parse to point at the next codepoint, if and only
354*f2a19305Safresh1  * if the current parse point is NOT a NUL, while doing the right thing
355*f2a19305Safresh1  * depending on whether we are parsing UTF-8 strings or not.
356*f2a19305Safresh1  *
357*f2a19305Safresh1  * WARNING: Does NOT take into account RExC_end; it is the caller's
358*f2a19305Safresh1  * responsibility to make sure there are enough octets left in RExC_parse
359*f2a19305Safresh1  * to ensure that when processing UTF-8 we would not read past the end
360*f2a19305Safresh1  * of the string.
 *
 * NOTE(review): the expansion does pass RExC_end to SKIP_IF_CHAR(), so the
 * warning above may be stale -- confirm against the definition of
 * SKIP_IF_CHAR(), which is not in this part of the file.
361*f2a19305Safresh1  *
362*f2a19305Safresh1  * NOTE: whether we are parsing UTF-8 or not is determined by the
363*f2a19305Safresh1  * UTF macro which is defined as cBOOL(RExC_utf8), thus this
364*f2a19305Safresh1  * macro operates on the pRExC_state structure only.
365*f2a19305Safresh1  */
366*f2a19305Safresh1 #define RExC_parse_inc_if_char() STMT_START {         \
367*f2a19305Safresh1     RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end);  \
368*f2a19305Safresh1 } STMT_END
369*f2a19305Safresh1 
370*f2a19305Safresh1 /* RExC_parse_inc_by(n_octets)
371*f2a19305Safresh1  *
372*f2a19305Safresh1  * Increment the parse cursor by the number of octets specified by
373*f2a19305Safresh1  * the 'n_octets' argument.
374*f2a19305Safresh1  *
375*f2a19305Safresh1  * NOTE: Does NOT check ANY constraints. It is the caller's responsibility
376*f2a19305Safresh1  * to ensure that this will not move past the end of the string, or leave
377*f2a19305Safresh1  * the pointer in the middle of a UTF-8 sequence.
378*f2a19305Safresh1  *
379*f2a19305Safresh1  * Typically used to advance past previously analyzed content.
380*f2a19305Safresh1  */
381*f2a19305Safresh1 #define RExC_parse_inc_by(n_octets) STMT_START {  \
382*f2a19305Safresh1     RExC_parse += (n_octets);                     \
383*f2a19305Safresh1 } STMT_END
384*f2a19305Safresh1 
385*f2a19305Safresh1 /* RExC_parse_set(to_ptr)
386*f2a19305Safresh1  *
387*f2a19305Safresh1  * Sets the RExC_parse pointer to the pointer specified by the 'to_ptr'
388*f2a19305Safresh1  * argument. No validation whatsoever is performed on 'to_ptr'.
389*f2a19305Safresh1  */
390*f2a19305Safresh1 #define RExC_parse_set(to_ptr) STMT_START { \
391*f2a19305Safresh1     RExC_parse = (to_ptr);                  \
392*f2a19305Safresh1 } STMT_END
393*f2a19305Safresh1 
394*f2a19305Safresh1 /**********************************************************************/
395*f2a19305Safresh1 
/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 * a flag to disable back-off on the fixed/floating substrings - if it's
 * a high complexity pattern we assume the benefit of avoiding a full match
 * is worth the cost of checking for the substrings even if they rarely help.
 *
 * MARK_NAUGHTY(add)
 *      Adds 'add' to RExC_naughty.
 * MARK_NAUGHTY_EXP(exp, add)
 *      Adds RExC_naughty / (exp) + (add) to RExC_naughty; 'exp' must be
 *      non-zero.
 *
 * Both do nothing once RExC_naughty has reached TOO_NAUGHTY, and both need
 * a pointer named pRExC_state in scope.
 *
 * The expansions are wrapped in STMT_START/STMT_END so that each use is a
 * single statement.  With a bare 'if' in the expansion, an 'else' written
 * by the caller after the macro would silently attach to the macro's own
 * 'if' instead of the caller's.
 */
#define RExC_naughty    (pRExC_state->naughty)
#define TOO_NAUGHTY (10)
#define MARK_NAUGHTY(add) STMT_START {                  \
    if (RExC_naughty < TOO_NAUGHTY)                     \
        RExC_naughty += (add);                          \
} STMT_END
#define MARK_NAUGHTY_EXP(exp, add) STMT_START {         \
    if (RExC_naughty < TOO_NAUGHTY)                     \
        RExC_naughty += RExC_naughty / (exp) + (add);   \
} STMT_END
409*f2a19305Safresh1 
/* isNON_BRACE_QUANTIFIER(c)
 *      True when the character c is one of the single-character
 *      quantifiers '*', '+' or '?'.  c may be evaluated up to three times.
 *
 * isQUANTIFIER(s,e)
 *      True when the character at pointer s starts a quantifier: either one
 *      of the above, or a '{' for which regcurly(s, e, NULL) is true.
 *      s is evaluated more than once, so it must have no side effects.
 *
 * The arguments are parenthesized wherever they are used, so a pointer
 * expression such as 'p + 1' is dereferenced as a whole rather than being
 * read as '*p + 1'.
 */
#define isNON_BRACE_QUANTIFIER(c)   ((c) == '*' || (c) == '+' || (c) == '?')
#define isQUANTIFIER(s,e)  (   isNON_BRACE_QUANTIFIER(*(s))                    \
                            || (*(s) == '{' && regcurly((s), (e), NULL)))
413*f2a19305Safresh1 
414*f2a19305Safresh1 /*
415*f2a19305Safresh1  * Flags to be passed up.  Each value is a distinct single bit; 0x04 is
 * not assigned in this list.
416*f2a19305Safresh1  */
417*f2a19305Safresh1 #define HASWIDTH        0x01    /* Known to not match null strings, could match
418*f2a19305Safresh1                                    non-null ones. */
419*f2a19305Safresh1 #define SIMPLE          0x02    /* Exactly one character wide */
420*f2a19305Safresh1                                 /* (or LNBREAK as a special case) */
421*f2a19305Safresh1 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
422*f2a19305Safresh1 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
423*f2a19305Safresh1 #define RESTART_PARSE   0x20    /* Need to redo the parse */
424*f2a19305Safresh1 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
425*f2a19305Safresh1                                    calculate sizes as UTF-8 */
426*f2a19305Safresh1 
/* REG_NODE_NUM(x): the offset of pointer x from RExC_emit_start, cast to
 * int, or -1 when x is NULL.  x appears twice in the expansion, so it must
 * have no side effects.  Needs pRExC_state in scope. */
427*f2a19305Safresh1 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
428*f2a19305Safresh1 
429*f2a19305Safresh1 /* whether trie related optimizations are enabled: when
 * PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION evaluates as non-zero, the three
 * feature macros below are all defined together; otherwise none is. */
430*f2a19305Safresh1 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
431*f2a19305Safresh1 #define TRIE_STUDY_OPT
432*f2a19305Safresh1 #define FULL_TRIE_STUDY
433*f2a19305Safresh1 #define TRIE_STCLASS
434*f2a19305Safresh1 #endif
435*f2a19305Safresh1 
436*f2a19305Safresh1 /* About the term "restudy" and the var "restudied" and the defines
437*f2a19305Safresh1  * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to
438*f2a19305Safresh1  * doing multiple study_chunk() calls over the same set of opcodes for the
439*f2a19305Safresh1  * purpose of enhanced TRIE optimizations.
440*f2a19305Safresh1  *
441*f2a19305Safresh1  * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal
442*f2a19305Safresh1  * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled
443*f2a19305Safresh1  * which then causes the Perl_re_op_compile() to then call the optimizer
444*f2a19305Safresh1  * S_study_chunk() a second time to perform additional optimizations,
445*f2a19305Safresh1  * including the aho_corasick startclass optimization.
446*f2a19305Safresh1  * This additional pass will only happen once, which is managed by the
447*f2a19305Safresh1  * 'restudied' variable in Perl_re_op_compile().
448*f2a19305Safresh1  *
449*f2a19305Safresh1  * When this second pass is under way the flags passed into study_chunk() will
450*f2a19305Safresh1  * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down
451*f2a19305Safresh1  * to any recursive calls to S_study_chunk().
452*f2a19305Safresh1  *
453*f2a19305Safresh1  * IMPORTANT: Any logic in study_chunk() that emits warnings should check that
454*f2a19305Safresh1  * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may
455*f2a19305Safresh1  * be produced twice.
456*f2a19305Safresh1  *
457*f2a19305Safresh1  * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and
458*f2a19305Safresh1  * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details.
459*f2a19305Safresh1  */
460*f2a19305Safresh1 
461*f2a19305Safresh1 
/* Bit-per-paren helpers over the RExC_study_chunk_recursed bitmap.
 *
 * PBYTE(u8str,paren)    the byte of the U8 array u8str holding paren's bit
 *                       (index paren >> 3); usable as an lvalue
 * PBITVAL(paren)        the mask for paren's bit within that byte
 *                       (bit number paren & 7)
 * PAREN_OFFSET(depth)   start of the bitmap row for 'depth'; rows are
 *                       RExC_study_chunk_recursed_bytes bytes apart
 * PAREN_TEST/SET/UNSET  test, set or clear paren's bit in the row for
 *                       'depth'
 *
 * 'paren' is expanded more than once in the PAREN_* macros, so it must have
 * no side effects.  Everything except PBYTE and PBITVAL needs pRExC_state
 * in scope.  No bounds checking is done on 'depth' or 'paren'. */
462*f2a19305Safresh1 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
463*f2a19305Safresh1 #define PBITVAL(paren) (1 << ((paren) & 7))
464*f2a19305Safresh1 #define PAREN_OFFSET(depth) \
465*f2a19305Safresh1     (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
466*f2a19305Safresh1 #define PAREN_TEST(depth, paren) \
467*f2a19305Safresh1     (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
468*f2a19305Safresh1 #define PAREN_SET(depth, paren) \
469*f2a19305Safresh1     (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
470*f2a19305Safresh1 #define PAREN_UNSET(depth, paren) \
471*f2a19305Safresh1     (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
472*f2a19305Safresh1 
/* Demand that the pattern be handled as UTF-8.  When UTF is already true this
 * does nothing.  Otherwise it stores RESTART_PARSE|NEED_UTF8 through 'flagp'
 * and returns 0 from the *enclosing* function, so it can only be used where
 * returning 0 is valid. */
#define REQUIRE_UTF8(flagp)                                                 \
    STMT_START {                                                            \
        if (!UTF) {                                                         \
            /* The argument is parenthesized so that any pointer-valued     \
             * expression, not just a plain identifier, may be passed */    \
            *(flagp) = RESTART_PARSE|NEED_UTF8;                             \
            return 0;                                                       \
        }                                                                   \
    } STMT_END
479*f2a19305Safresh1 
/* /u is to be chosen if we are supposed to use Unicode rules, or if the
 * pattern is in UTF-8.  This latter condition is in case the outermost rules
 * are locale.  See GH #17278 */
#define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)

/* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
 * a flag that indicates we need to override /d with /u as a result of
 * something in the pattern.  It should only be used in regards to calling
 * set_regex_charset() or get_regex_charset()
 *
 * Nothing happens unless the charset is currently /d (DEPENDS_SEMANTICS).
 * When a restart is needed, RESTART_PARSE is OR-ed into '*flagp' and the
 * enclosing function returns 'restart_retval'. */
#define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
    STMT_START {                                                            \
            if (DEPENDS_SEMANTICS) {                                        \
                set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
                RExC_uni_semantics = 1;                                     \
                if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
                    /* No need to restart the parse if we haven't seen      \
                     * anything that differs between /u and /d, and no need \
                     * to restart immediately if we're going to reparse     \
                     * anyway to count parens.  'flagp' is parenthesized so \
                     * that any pointer expression may be passed */         \
                    *(flagp) |= RESTART_PARSE;                              \
                    return restart_retval;                                  \
                }                                                           \
            }                                                               \
    } STMT_END
504*f2a19305Safresh1 
/* Unconditionally record that BRANCHJ is to be used (RExC_use_BRANCHJ = 1),
 * OR RESTART_PARSE into '*flagp', and return 'restart_retval' from the
 * enclosing function. */
#define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
    STMT_START {                                                            \
                RExC_use_BRANCHJ = 1;                                       \
                /* Parenthesized so any pointer expression may be passed */ \
                *(flagp) |= RESTART_PARSE;                                  \
                return restart_retval;                                      \
    } STMT_END
511*f2a19305Safresh1 
512*f2a19305Safresh1 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
513*f2a19305Safresh1  * less.  After that, it must always be positive, because the whole re is
514*f2a19305Safresh1  * considered to be surrounded by virtual parens.  Setting it to negative
515*f2a19305Safresh1  * indicates there is some construct that needs to know the actual number of
516*f2a19305Safresh1  * parens to be properly handled.  And that means an extra pass will be
517*f2a19305Safresh1  * required after we've counted them all */
/* True once RExC_total_parens holds a positive (i.e. final) count */
#define ALL_PARENS_COUNTED      (RExC_total_parens > 0)

/* Ask for the paren-counting pass by making the count negative.  A count
 * that is already positive is left alone, so this is a no-op once a pass has
 * completed. */
#define REQUIRE_PARENS_PASS                                                 \
    STMT_START {                                                            \
        if (! ALL_PARENS_COUNTED) {                                         \
            RExC_total_parens = -1;                                         \
        }                                                                   \
    } STMT_END

/* True while RExC_total_parens is negative */
#define IN_PARENS_PASS          (RExC_total_parens < 0)
524*f2a19305Safresh1 
525*f2a19305Safresh1 
526*f2a19305Safresh1 /* This is used to return failure (zero) early from the calling function if
527*f2a19305Safresh1  * various flags in 'flags' are set.  Two flags always cause a return:
528*f2a19305Safresh1  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
529*f2a19305Safresh1  * additional flags that should cause a return; 0 if none.  If the return will
530*f2a19305Safresh1  * be done, '*flagp' is first set to be all of the flags that caused the
531*f2a19305Safresh1  * return. */
/* Note that 'flags' and 'extra' are each evaluated twice when the return is
 * taken. */
#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
    STMT_START {                                                            \
            if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
                *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
                return 0;                                                   \
            }                                                               \
    } STMT_END

/* Non-zero iff RESTART_PARSE is set in 'flags' */
#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))

/* RETURN_FAIL_ON_RESTART_OR_FLAGS() with no additional flags */
#define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
                        RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)

/* Return 0 from the enclosing function if RESTART_PARSE is set in '*flagp';
 * '*flagp' itself is left untouched.  Wrapped in STMT_START/STMT_END like its
 * siblings so that it is a single statement: a bare 'if' here would silently
 * capture an 'else' that follows the macro invocation. */
#define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
    STMT_START {                                                            \
            if (MUST_RESTART(*(flagp))) return 0;                           \
    } STMT_END
546*f2a19305Safresh1 
/* Map between the named classes defined in regcomp.h and the class numbers
 * defined in handy.h.  Going from named class to class number halves the
 * value (integer division, result cast to int), so named classes 2k and 2k+1
 * both give k; going back doubles it, giving 2k. */
#define namedclass_to_classnum(namedclass)  ((int) ((namedclass) / 2))
#define classnum_to_namedclass(num)         ((num) * 2)
551*f2a19305Safresh1 
/* Shorthands for the "maybe complement 2nd" inversion list operations that
 * always pass TRUE as the third argument. */
#define _invlist_union_complement_2nd(a, b, output)                         \
    _invlist_union_maybe_complement_2nd(a, b, TRUE, output)

#define _invlist_intersection_complement_2nd(a, b, output)                  \
    _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
556*f2a19305Safresh1 
557*f2a19305Safresh1 /* We add a marker if we are deferring expansion of a property that is both
 * 1) potentially user-defined; and
559*f2a19305Safresh1  * 2) could also be an official Unicode property.
560*f2a19305Safresh1  *
561*f2a19305Safresh1  * Without this marker, any deferred expansion can only be for a user-defined
562*f2a19305Safresh1  * one.  This marker shouldn't conflict with any that could be in a legal name,
563*f2a19305Safresh1  * and is appended to its name to indicate this.  There is a string and
564*f2a19305Safresh1  * character form */
#define DEFERRED_COULD_BE_OFFICIAL_MARKERs  "~"     /* string form */
#define DEFERRED_COULD_BE_OFFICIAL_MARKERc  '~'     /* character form */

/* The value that stands for infinity for optimization purposes */
#define OPTIMIZE_INFTY                      SSize_t_MAX
570*f2a19305Safresh1 
571*f2a19305Safresh1 /* About scan_data_t.
572*f2a19305Safresh1 
573*f2a19305Safresh1   During optimisation we recurse through the regexp program performing
574*f2a19305Safresh1   various inplace (keyhole style) optimisations. In addition study_chunk
575*f2a19305Safresh1   and scan_commit populate this data structure with information about
576*f2a19305Safresh1   what strings MUST appear in the pattern. We look for the longest
577*f2a19305Safresh1   string that must appear at a fixed location, and we look for the
578*f2a19305Safresh1   longest string that may appear at a floating location. So for instance
579*f2a19305Safresh1   in the pattern:
580*f2a19305Safresh1 
581*f2a19305Safresh1     /FOO[xX]A.*B[xX]BAR/
582*f2a19305Safresh1 
583*f2a19305Safresh1   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
584*f2a19305Safresh1   strings (because they follow a .* construct). study_chunk will identify
585*f2a19305Safresh1   both FOO and BAR as being the longest fixed and floating strings respectively.
586*f2a19305Safresh1 
587*f2a19305Safresh1   The strings can be composites, for instance
588*f2a19305Safresh1 
589*f2a19305Safresh1      /(f)(o)(o)/
590*f2a19305Safresh1 
591*f2a19305Safresh1   will result in a composite fixed substring 'foo'.
592*f2a19305Safresh1 
593*f2a19305Safresh1   For each string some basic information is maintained:
594*f2a19305Safresh1 
595*f2a19305Safresh1   - min_offset
596*f2a19305Safresh1     This is the position the string must appear at, or not before.
597*f2a19305Safresh1     It also implicitly (when combined with minlenp) tells us how many
598*f2a19305Safresh1     characters must match before the string we are searching for.
599*f2a19305Safresh1     Likewise when combined with minlenp and the length of the string it
600*f2a19305Safresh1     tells us how many characters must appear after the string we have
601*f2a19305Safresh1     found.
602*f2a19305Safresh1 
603*f2a19305Safresh1   - max_offset
604*f2a19305Safresh1     Only used for floating strings. This is the rightmost point that
605*f2a19305Safresh1     the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
606*f2a19305Safresh1     string can occur infinitely far to the right.
607*f2a19305Safresh1     For fixed strings, it is equal to min_offset.
608*f2a19305Safresh1 
609*f2a19305Safresh1   - minlenp
610*f2a19305Safresh1     A pointer to the minimum number of characters of the pattern that the
611*f2a19305Safresh1     string was found inside. This is important as in the case of positive
612*f2a19305Safresh1     lookahead or positive lookbehind we can have multiple patterns
613*f2a19305Safresh1     involved. Consider
614*f2a19305Safresh1 
615*f2a19305Safresh1     /(?=FOO).*F/
616*f2a19305Safresh1 
617*f2a19305Safresh1     The minimum length of the pattern overall is 3, the minimum length
618*f2a19305Safresh1     of the lookahead part is 3, but the minimum length of the part that
619*f2a19305Safresh1     will actually match is 1. So 'FOO's minimum length is 3, but the
620*f2a19305Safresh1     minimum length for the F is 1. This is important as the minimum length
621*f2a19305Safresh1     is used to determine offsets in front of and behind the string being
622*f2a19305Safresh1     looked for.  Since strings can be composites this is the length of the
623*f2a19305Safresh1     pattern at the time it was committed with a scan_commit. Note that
624*f2a19305Safresh1     the length is calculated by study_chunk, so that the minimum lengths
625*f2a19305Safresh1     are not known until the full pattern has been compiled, thus the
626*f2a19305Safresh1     pointer to the value.
627*f2a19305Safresh1 
628*f2a19305Safresh1   - lookbehind
629*f2a19305Safresh1 
630*f2a19305Safresh1     In the case of lookbehind the string being searched for can be
631*f2a19305Safresh1     offset past the start point of the final matching string.
632*f2a19305Safresh1     If this value was just blithely removed from the min_offset it would
633*f2a19305Safresh1     invalidate some of the calculations for how many chars must match
634*f2a19305Safresh1     before or after (as they are derived from min_offset and minlen and
635*f2a19305Safresh1     the length of the string being searched for).
636*f2a19305Safresh1     When the final pattern is compiled and the data is moved from the
637*f2a19305Safresh1     scan_data_t structure into the regexp structure the information
638*f2a19305Safresh1     about lookbehind is factored in, with the information that would
639*f2a19305Safresh1     have been lost precalculated in the end_shift field for the
640*f2a19305Safresh1     associated string.
641*f2a19305Safresh1 
642*f2a19305Safresh1   The fields pos_min and pos_delta are used to store the minimum offset
643*f2a19305Safresh1   and the delta to the maximum offset at the current point in the pattern.
644*f2a19305Safresh1 
645*f2a19305Safresh1 */
646*f2a19305Safresh1 
647*f2a19305Safresh1 struct scan_data_substrs {
648*f2a19305Safresh1     SV      *str;       /* longest substring found in pattern */
649*f2a19305Safresh1     SSize_t min_offset; /* earliest point in string it can appear */
650*f2a19305Safresh1     SSize_t max_offset; /* latest point in string it can appear */
651*f2a19305Safresh1     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
652*f2a19305Safresh1     SSize_t lookbehind; /* is the pos of the string modified by LB */
653*f2a19305Safresh1     I32 flags;          /* per substring SF_* and SCF_* flags */
654*f2a19305Safresh1 };
655*f2a19305Safresh1 
656*f2a19305Safresh1 /* this is typedef'ed in perl.h */
657*f2a19305Safresh1 struct scan_data_t {
658*f2a19305Safresh1     /*I32 len_min;      unused */
659*f2a19305Safresh1     /*I32 len_delta;    unused */
660*f2a19305Safresh1     SSize_t pos_min;
661*f2a19305Safresh1     SSize_t pos_delta;
662*f2a19305Safresh1     SV *last_found;
663*f2a19305Safresh1     SSize_t last_end;       /* min value, <0 unless valid. */
664*f2a19305Safresh1     SSize_t last_start_min;
665*f2a19305Safresh1     SSize_t last_start_max;
666*f2a19305Safresh1     U8      cur_is_floating; /* whether the last_* values should be set as
667*f2a19305Safresh1                               * the next fixed (0) or floating (1)
668*f2a19305Safresh1                               * substring */
669*f2a19305Safresh1 
670*f2a19305Safresh1     /* [0] is longest fixed substring so far, [1] is longest float so far */
671*f2a19305Safresh1     struct scan_data_substrs  substrs[2];
672*f2a19305Safresh1 
673*f2a19305Safresh1     I32 flags;             /* common SF_* and SCF_* flags */
674*f2a19305Safresh1     I32 whilem_c;
675*f2a19305Safresh1     SSize_t *last_closep;
676*f2a19305Safresh1     regnode **last_close_opp; /* pointer to pointer to last CLOSE regop
677*f2a19305Safresh1                                  seen. DO NOT DEREFERENCE the regnode
678*f2a19305Safresh1                                  pointer - the op may have been optimized
679*f2a19305Safresh1                                  away */
680*f2a19305Safresh1     regnode_ssc *start_class;
681*f2a19305Safresh1 };
682*f2a19305Safresh1 
683*f2a19305Safresh1 /*
684*f2a19305Safresh1  * Forward declarations for pregcomp()'s friends.
685*f2a19305Safresh1  */
686*f2a19305Safresh1 
687*f2a19305Safresh1 static const scan_data_t zero_scan_data = {
688*f2a19305Safresh1     0, 0, NULL, 0, 0, 0, 0,
689*f2a19305Safresh1     {
690*f2a19305Safresh1         { NULL, 0, 0, 0, 0, 0 },
691*f2a19305Safresh1         { NULL, 0, 0, 0, 0, 0 },
692*f2a19305Safresh1     },
693*f2a19305Safresh1     0, 0, NULL, NULL, NULL
694*f2a19305Safresh1 };
695*f2a19305Safresh1 
696*f2a19305Safresh1 /* study flags */
697*f2a19305Safresh1 
/* The SF_* bits occupy 0x0001 through 0x0200 (0x0004 through 0x0020 are not
 * assigned here). */
#define SF_BEFORE_SEOL      0x0001
#define SF_BEFORE_MEOL      0x0002
#define SF_BEFORE_EOL       (SF_BEFORE_SEOL|SF_BEFORE_MEOL) /* either one */

#define SF_IS_INF           0x0040
#define SF_HAS_PAR          0x0080
#define SF_IN_PAR           0x0100
#define SF_HAS_EVAL         0x0200
706*f2a19305Safresh1 
707*f2a19305Safresh1 
708*f2a19305Safresh1 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
709*f2a19305Safresh1  * longest substring in the pattern. When it is not set the optimiser keeps
710*f2a19305Safresh1  * track of position, but does not keep track of the actual strings seen,
711*f2a19305Safresh1  *
712*f2a19305Safresh1  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
713*f2a19305Safresh1  * /foo/i will not.
714*f2a19305Safresh1  *
715*f2a19305Safresh1  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
716*f2a19305Safresh1  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
717*f2a19305Safresh1  * turned off because of the alternation (BRANCH). */
#define SCF_DO_SUBSTR               0x0400

#define SCF_DO_STCLASS_AND          0x0800
#define SCF_DO_STCLASS_OR           0x1000
/* Either of the two STCLASS modes */
#define SCF_DO_STCLASS              (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
#define SCF_WHILEM_VISITED_POS      0x2000

/* Need to do restudy in study_chunk()?  Search for "restudy" in this file to
 * find a detailed explanation. */
#define SCF_TRIE_RESTUDY            0x4000
#define SCF_SEEN_ACCEPT             0x8000
/* Are we in restudy right now?  Search for "restudy" in this file to find a
 * detailed explanation. */
#define SCF_TRIE_DOING_RESTUDY      0x10000
#define SCF_IN_DEFINE               0x20000
733*f2a19305Safresh1 
734*f2a19305Safresh1 
735*f2a19305Safresh1 
/* Boolean form of RExC_utf8 */
#define UTF cBOOL(RExC_utf8)

/* Predicates on the character set currently recorded in RExC_flags.  The
 * enums for all these are ordered so things work out correctly (the
 * AT_LEAST_* forms compare with >=). */
#define LOC                                                                 \
    (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
#define DEPENDS_SEMANTICS                                                   \
    (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
#define UNI_SEMANTICS                                                       \
    (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
#define AT_LEAST_UNI_SEMANTICS                                              \
    (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
#define ASCII_RESTRICTED                                                    \
    (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
#define AT_LEAST_ASCII_RESTRICTED                                           \
    (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
#define ASCII_FOLD_RESTRICTED                                               \
    (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)

/* Is the RXf_PMf_FOLD bit set in RExC_flags? */
#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
753*f2a19305Safresh1 
/* For programs that want to be strictly Unicode compatible by dying if any
 * attempt is made to match a non-Unicode code point against a Unicode
 * property.  */
#define ALWAYS_WARN_SUPER                   ckDEAD(packWARN(WARN_NON_UNICODE))

/* Out-of-bounds value for a named class */
#define OOB_NAMEDCLASS                      -1

/* There is no code point that is out-of-bounds, so this is problematic.  But
 * its only current use is to initialize a variable that is always set before
 * looked at. */
#define OOB_UNICODE                         0xDEADBEEF

/* Length of 'sv': sv_len_utf8() when UTF is true, otherwise SvCUR() */
#define CHR_SVLEN(sv)                       (UTF ? sv_len_utf8(sv) : SvCUR(sv))


/* length of regex to show in messages that don't mark a position within */
#define RegexLengthToShowInErrorMessages    127
771*f2a19305Safresh1 
772*f2a19305Safresh1 /*
773*f2a19305Safresh1  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
774*f2a19305Safresh1  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
775*f2a19305Safresh1  * op/pragma/warn/regcomp.
776*f2a19305Safresh1  */
/* The marker as it appears in the description */
#define MARKER1     "<-- HERE"
/* The marker as it appears within the regex */
#define MARKER2     " <-- HERE "

/* Format string tail that names MARKER1 and then shows the pattern as two
 * UTF8f pieces with MARKER2 between them. */
#define REPORT_LOCATION                                                     \
    " in regex; marked by " MARKER1 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
782*f2a19305Safresh1 
783*f2a19305Safresh1 /* The code in this file in places uses one level of recursion with parsing
784*f2a19305Safresh1  * rebased to an alternate string constructed by us in memory.  This can take
785*f2a19305Safresh1  * the form of something that is completely different from the input, or
786*f2a19305Safresh1  * something that uses the input as part of the alternate.  In the first case,
787*f2a19305Safresh1  * there should be no possibility of an error, as we are in complete control of
788*f2a19305Safresh1  * the alternate string.  But in the second case we don't completely control
789*f2a19305Safresh1  * the input portion, so there may be errors in that.  Here's an example:
790*f2a19305Safresh1  *      /[abc\x{DF}def]/ui
791*f2a19305Safresh1  * is handled specially because \x{df} folds to a sequence of more than one
792*f2a19305Safresh1  * character: 'ss'.  What is done is to create and parse an alternate string,
793*f2a19305Safresh1  * which looks like this:
794*f2a19305Safresh1  *      /(?:\x{DF}|[abc\x{DF}def])/ui
795*f2a19305Safresh1  * where it uses the input unchanged in the middle of something it constructs,
796*f2a19305Safresh1  * which is a branch for the DF outside the character class, and clustering
797*f2a19305Safresh1  * parens around the whole thing. (It knows enough to skip the DF inside the
798*f2a19305Safresh1  * class while in this substitute parse.) 'abc' and 'def' may have errors that
799*f2a19305Safresh1  * need to be reported.  The general situation looks like this:
800*f2a19305Safresh1  *
801*f2a19305Safresh1  *                                       |<------- identical ------>|
802*f2a19305Safresh1  *              sI                       tI               xI       eI
803*f2a19305Safresh1  * Input:       ---------------------------------------------------------------
804*f2a19305Safresh1  * Constructed:         ---------------------------------------------------
805*f2a19305Safresh1  *                      sC               tC               xC       eC     EC
806*f2a19305Safresh1  *                                       |<------- identical ------>|
807*f2a19305Safresh1  *
808*f2a19305Safresh1  * sI..eI   is the portion of the input pattern we are concerned with here.
809*f2a19305Safresh1  * sC..EC   is the constructed substitute parse string.
810*f2a19305Safresh1  *  sC..tC  is constructed by us
811*f2a19305Safresh1  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
812*f2a19305Safresh1  *          In the diagram, these are vertically aligned.
813*f2a19305Safresh1  *  eC..EC  is also constructed by us.
814*f2a19305Safresh1  * xC       is the position in the substitute parse string where we found a
815*f2a19305Safresh1  *          problem.
816*f2a19305Safresh1  * xI       is the position in the original pattern corresponding to xC.
817*f2a19305Safresh1  *
818*f2a19305Safresh1  * We want to display a message showing the real input string.  Thus we need to
819*f2a19305Safresh1  * translate from xC to xI.  We know that xC >= tC, since the portion of the
820*f2a19305Safresh1  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
821*f2a19305Safresh1  * get:
822*f2a19305Safresh1  *      xI = tI + (xC - tC)
823*f2a19305Safresh1  *
824*f2a19305Safresh1  * When the substitute parse is constructed, the code needs to set:
825*f2a19305Safresh1  *      RExC_start (sC)
826*f2a19305Safresh1  *      RExC_end (eC)
827*f2a19305Safresh1  *      RExC_copy_start_in_input  (tI)
828*f2a19305Safresh1  *      RExC_copy_start_in_constructed (tC)
829*f2a19305Safresh1  * and restore them when done.
830*f2a19305Safresh1  *
831*f2a19305Safresh1  * During normal processing of the input pattern, both
832*f2a19305Safresh1  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
833*f2a19305Safresh1  * sI, so that xC equals xI.
834*f2a19305Safresh1  */
835*f2a19305Safresh1 
/* Short names for the positions used when translating a location in the
 * constructed parse string back to the input pattern */
#define sI              RExC_precomp
#define eI              RExC_precomp_end
#define sC              RExC_start
#define eC              RExC_end
#define tI              RExC_copy_start_in_input
#define tC              RExC_copy_start_in_constructed

/* Translate position 'xC' in the constructed string into the corresponding
 * position in the input: tI + (xC - tC).  The argument is parenthesized so
 * that an arbitrary pointer expression can be passed. */
#define xI(xC)          (tI + ((xC) - tC))

/* The same position expressed as an offset from the start of the input */
#define xI_offset(xC)   (xI(xC) - sI)
844*f2a19305Safresh1 
/* Expands to the two UTF8fARG() argument groups, separated by a comma, that
 * supply the two UTF8f conversions in REPORT_LOCATION, for a problem found
 * at position 'xC' of the string being parsed.
 *
 * First group (pattern text before the marker): starts at sI; its length is
 * eI - sI when the translated position xI(xC) lies beyond eI, otherwise
 * xI_offset(xC).  A negative xI_offset(xC) croaks with a "panic" message.
 *
 * Second group (pattern text after the marker): starts at xI(xC) with length
 * eI - xI(xC), or starts at eI with length 0 when xI(xC) lies beyond eI.
 *
 * 'xC' is evaluated several times, so it must not have side effects. */
#define REPORT_LOCATION_ARGS(xC)                                            \
    UTF8fARG(UTF,                                                           \
             (xI(xC) > eI) /* Don't run off end */                          \
              ? eI - sI   /* Length before the <--HERE */                   \
              : ((xI_offset(xC) >= 0)                                       \
                 ? xI_offset(xC)                                            \
                 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
                                    IVdf " trying to output message for "   \
                                    " pattern %.*s",                        \
                                    __FILE__, __LINE__, (IV) xI_offset(xC), \
                                    ((int) (eC - sC)), sC), 0)),            \
             sI),         /* The input pattern printed up to the <--HERE */ \
    UTF8fARG(UTF,                                                           \
             (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
             (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
860*f2a19305Safresh1 
/* How far to step over the character at 's' in order to point after bad
 * bytes in an error message: 0 when 's' is at a nul byte (so that it is never
 * skipped past), otherwise UTF8_SAFE_SKIP(s, e) under UTF and 1 if not. */
#define SKIP_IF_CHAR(s, e)                                                  \
    (*(s) ? (UTF ? UTF8_SAFE_SKIP(s, e) : 1) : 0)
864*f2a19305Safresh1 
/* Set up to clean up after our imminent demise: whichever of the members
 * below are non-NULL get handed to SAVEFREESV() (RExC_rx_sv) or SAVEFREEPV()
 * (the rest). */
#define PREPARE_TO_DIE                                                      \
    STMT_START {                                                            \
        if (RExC_rx_sv) {                                                   \
            SAVEFREESV(RExC_rx_sv);                                         \
        }                                                                   \
        if (RExC_open_parens) {                                             \
            SAVEFREEPV(RExC_open_parens);                                   \
        }                                                                   \
        if (RExC_close_parens) {                                            \
            SAVEFREEPV(RExC_close_parens);                                  \
        }                                                                   \
        if (RExC_logical_to_parno) {                                        \
            SAVEFREEPV(RExC_logical_to_parno);                              \
        }                                                                   \
        if (RExC_parno_to_logical) {                                        \
            SAVEFREEPV(RExC_parno_to_logical);                              \
        }                                                                   \
    } STMT_END
879*f2a19305Safresh1 
/*
 * Runs PREPARE_TO_DIE, then executes 'code', which is expected to be a
 * croak call.  Two locals are in scope for 'code' so that it can show the
 * regex up to a maximum length:
 *   len       the length of the pattern to show; if the pattern is longer
 *             than RegexLengthToShowInErrorMessages this is chopped to 10
 *             less than that maximum
 *   ellipses  "..." if the pattern was chopped, otherwise ""
 */
#define _FAIL(code) STMT_START {                                        \
    const char *ellipses = "";                                          \
    IV len = RExC_precomp_end - RExC_precomp;                           \
                                                                        \
    PREPARE_TO_DIE;                                                     \
    if (len > RegexLengthToShowInErrorMessages) {                       \
        /* chop 10 shorter than the max, to ensure meaning of "..." */  \
        len = RegexLengthToShowInErrorMessages - 10;                    \
        ellipses = "...";                                               \
    }                                                                   \
    code;                                                               \
} STMT_END

/* Croak with 'msg' followed by the (possibly chopped) regex.  'msg' is
 * passed as a "%s" argument, so it need not be a string literal. */
#define FAIL(msg) _FAIL(                            \
    Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
            msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))

/* Like FAIL(), but 'msg' is pasted into the format string, so it must be a
 * string literal, and it is expected to consume 'arg'. */
#define FAIL2(msg,arg) _FAIL(                       \
    Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
            arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))

/* Like FAIL2(), but 'msg' is expected to consume 'arg1' and 'arg2'. */
#define FAIL3(msg,arg1,arg2) _FAIL(                         \
    Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
     arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
909*f2a19305Safresh1 
/*
 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 * (RExC_parse) instead of showing a chopped regex.  'm' is passed as a "%s"
 * argument to Perl_croak, so it need not be a string literal.
 */
#define Simple_vFAIL(m) STMT_START {                                    \
    Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
            m, REPORT_LOCATION_ARGS(RExC_parse));                       \
} STMT_END

/*
 * Runs PREPARE_TO_DIE, then Simple_vFAIL()
 */
#define vFAIL(m) STMT_START {                           \
    PREPARE_TO_DIE;                                     \
    Simple_vFAIL(m);                                    \
} STMT_END
925*f2a19305Safresh1 
/*
 * Like Simple_vFAIL(), but accepts two arguments.  Here 'm' is pasted into
 * the format string given to S_re_croak(), so it must be a string literal,
 * and it is expected to consume 'a1'.
 */
#define Simple_vFAIL2(m,a1) STMT_START {                        \
    S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,                \
                      REPORT_LOCATION_ARGS(RExC_parse));        \
} STMT_END

/*
 * Runs PREPARE_TO_DIE, then Simple_vFAIL2().
 */
#define vFAIL2(m,a1) STMT_START {                       \
    PREPARE_TO_DIE;                                     \
    Simple_vFAIL2(m, a1);                               \
} STMT_END
941*f2a19305Safresh1 
942*f2a19305Safresh1 
/*
 * Simple_vFAIL3 -- the two-value counterpart of Simple_vFAIL().  'fmt' must be
 * a string literal (REPORT_LOCATION is pasted onto it); 'val1' and 'val2' feed
 * its conversions, in that order.
 */
#define Simple_vFAIL3(fmt, val1, val2)                                  \
    STMT_START {                                                        \
        S_re_croak(aTHX_ UTF,                                           \
                   fmt REPORT_LOCATION,                                 \
                   val1, val2,                                          \
                   REPORT_LOCATION_ARGS(RExC_parse));                   \
    } STMT_END

/*
 * vFAIL3 -- runs PREPARE_TO_DIE (defined elsewhere) and then Simple_vFAIL3().
 */
#define vFAIL3(fmt, val1, val2)                                         \
    STMT_START {                                                        \
        PREPARE_TO_DIE;                                                 \
        Simple_vFAIL3(fmt, val1, val2);                                 \
    } STMT_END
958*f2a19305Safresh1 
/*
 * Simple_vFAIL4 -- the three-value counterpart of Simple_vFAIL().  'fmt' must
 * be a string literal (REPORT_LOCATION is pasted onto it); 'val1'..'val3' feed
 * its conversions, in that order.
 */
#define Simple_vFAIL4(fmt, val1, val2, val3)                            \
    STMT_START {                                                        \
        S_re_croak(aTHX_ UTF,                                           \
                   fmt REPORT_LOCATION,                                 \
                   val1, val2, val3,                                    \
                   REPORT_LOCATION_ARGS(RExC_parse));                   \
    } STMT_END

/*
 * vFAIL4 -- runs PREPARE_TO_DIE (defined elsewhere) and then Simple_vFAIL4().
 */
#define vFAIL4(fmt, val1, val2, val3)                                   \
    STMT_START {                                                        \
        PREPARE_TO_DIE;                                                 \
        Simple_vFAIL4(fmt, val1, val2, val3);                           \
    } STMT_END
971*f2a19305Safresh1 
/* Specialized spellings of vFAIL2 / vFAIL3 for use with UTF8f arguments.
 * Both run PREPARE_TO_DIE and then S_re_croak() with the literal 'm' joined to
 * REPORT_LOCATION, which is exactly what vFAIL2 / vFAIL3 expand to, so they
 * simply forward. */
#define vFAIL2utf8f(m, a1)      vFAIL2(m, a1)

#define vFAIL3utf8f(m, a1, a2)  vFAIL3(m, a1, a2)
984*f2a19305Safresh1 
/* A NULL RExC_copy_start_in_constructed is the signal to not output warnings.
 * This stashes the current value in RExC_save_copy_start_in_constructed and
 * then sets the live one to NULL. */
#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
    STMT_START {                                                            \
        RExC_save_copy_start_in_constructed                                 \
                                = RExC_copy_start_in_constructed;           \
        RExC_copy_start_in_constructed = NULL;                              \
    } STMT_END

/* Puts the stashed value back.  This is a plain assignment expression, not a
 * STMT_START block, so the caller supplies the terminating ';'. */
#define RESTORE_WARNINGS                                                    \
    (RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed)
993*f2a19305Safresh1 
/* The input can be reparsed, so the same warning could be raised more than
 * once.  It is output the first time the parse reaches a given point and
 * suppressed afterwards: this is true only when warnings are enabled
 * (RExC_copy_start_in_constructed is non-NULL) and the offset of xI(pos) from
 * RExC_precomp lies beyond RExC_latest_warn_offset. */
#define TO_OUTPUT_WARNINGS(pos)                                         \
    (   RExC_copy_start_in_constructed                                  \
     && (Ptrdiff_t) RExC_latest_warn_offset                             \
                                    < ((xI(pos)) - RExC_precomp))
1001*f2a19305Safresh1 
/* Once a warning has been emitted, remember how far into the input it was so
 * that it is not output again.  The stored offset is taken from xI(pos) after
 * limiting it to at most eI and then to at least sI, and is recorded only
 * when TO_OUTPUT_WARNINGS(pos) holds. */
#define UPDATE_WARNINGS_LOC(pos)                                        \
    STMT_START {                                                        \
        if (TO_OUTPUT_WARNINGS(pos)) {                                  \
            RExC_latest_warn_offset =                                   \
                        MAX(sI, MIN(eI, xI(pos))) - RExC_precomp;       \
        }                                                               \
    } STMT_END
1011*f2a19305Safresh1 
/* Common wrapper for the warning macros.  'warns' is the output of the
 * packWARNx macro used in 'action'; 'action' is the statement that actually
 * raises the warning.
 *
 *  - If warnings are currently switched off (RExC_copy_start_in_constructed
 *    is NULL) this croaks with a panic, naming the invoking file and line.
 *  - Otherwise, when TO_OUTPUT_WARNINGS(pos) is true, it runs PREPARE_TO_DIE
 *    if ckDEAD(warns), executes 'action', and records the position with
 *    UPDATE_WARNINGS_LOC(pos). */
#define _WARN_HELPER(pos, warns, action)                                \
    STMT_START {                                                        \
        if (RExC_copy_start_in_constructed == NULL) {                   \
            Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
                              " expected at '%s'",                      \
                              __FILE__, __LINE__, pos);                 \
        }                                                               \
        if (TO_OUTPUT_WARNINGS(pos)) {                                  \
            if (ckDEAD(warns)) {                                        \
                PREPARE_TO_DIE;                                         \
            }                                                           \
            action;                                                     \
            UPDATE_WARNINGS_LOC(pos);                                   \
        }                                                               \
    } STMT_END
1027*f2a19305Safresh1 
/* Warn in category 'packed' with message 'text' plus the REPORT_LOCATION
 * suffix for 'pos'.  'text' is passed as the argument of "%s", so in this
 * macro it is not necessarily a "literal string". */
#define warn_non_literal_string(pos, packed, text)                      \
    _WARN_HELPER(pos, packed,                                           \
                 Perl_warner(aTHX_ packed,                              \
                             "%s" REPORT_LOCATION,                      \
                             text, REPORT_LOCATION_ARGS(pos)))

/* The same, fixed to the WARN_REGEXP category. */
#define reg_warn_non_literal_string(pos, text)                          \
    warn_non_literal_string(pos, packWARN(WARN_REGEXP), text)
1036*f2a19305Safresh1 
/* One-argument warning whose format 'fmt' is a run-time string rather than a
 * literal, so REPORT_LOCATION cannot be pasted on by the compiler.  A buffer
 * big enough for 'fmt' followed by REPORT_LOCATION is allocated, filled, and
 * registered with SAVEFREEPV(); the combined string is then used as the format
 * for Perl_ck_warner().  Note that 'fmt' is evaluated more than once. */
#define ckWARN2_non_literal_string(pos, packed, fmt, val)                   \
    STMT_START {                                                            \
        char * format;                                                      \
        /* sizeof a string literal already counts its trailing NUL */       \
        Size_t format_size = strlen(fmt) + sizeof(REPORT_LOCATION);         \
                                                                            \
        Newx(format, format_size, char);                                    \
        my_strlcpy(format, fmt, format_size);                               \
        my_strlcat(format, REPORT_LOCATION, format_size);                   \
        SAVEFREEPV(format);                                                 \
                                                                            \
        _WARN_HELPER(pos, packed,                                           \
                     Perl_ck_warner(aTHX_ packed,                           \
                                    format,                                 \
                                    val, REPORT_LOCATION_ARGS(pos)));       \
    } STMT_END
1050*f2a19305Safresh1 
/* WARN_REGEXP warning through Perl_ck_warner(); 'msg' must be a string literal
 * that takes no arguments (REPORT_LOCATION is pasted onto it). */
#define ckWARNreg(pos, msg)                                             \
    _WARN_HELPER(pos, packWARN(WARN_REGEXP),                            \
                 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),            \
                                msg REPORT_LOCATION,                    \
                                REPORT_LOCATION_ARGS(pos)))
1056*f2a19305Safresh1 
/* WARN_REGEXP warning through Perl_warner(); 'm' must be a string literal that
 * takes no arguments (REPORT_LOCATION is pasted onto it).
 *
 * The definition ends on the REPORT_LOCATION_ARGS line: there is deliberately
 * no line continuation after it, so whatever follows the macro in the file can
 * never be swallowed into its body. */
#define vWARN(loc, m)                                                   \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
                      Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
                                       m REPORT_LOCATION,               \
                                       REPORT_LOCATION_ARGS(loc)))
1062*f2a19305Safresh1 
/* Warnings in a caller-chosen category 'cat'.  In all three, 'msg' must be a
 * string literal taking no arguments (REPORT_LOCATION is pasted onto it). */

/* Category 'cat', raised with Perl_warner(). */
#define vWARN_dep(pos, cat, msg)                                            \
    _WARN_HELPER(pos, packWARN(cat),                                        \
                 Perl_warner(aTHX_ packWARN(cat),                           \
                             msg REPORT_LOCATION,                           \
                             REPORT_LOCATION_ARGS(pos)))

/* Category 'cat', raised with Perl_ck_warner_d(). */
#define ckWARNdep(pos, cat, msg)                                            \
    _WARN_HELPER(pos, packWARN(cat),                                        \
                 Perl_ck_warner_d(aTHX_ packWARN(cat),                      \
                                  msg REPORT_LOCATION,                      \
                                  REPORT_LOCATION_ARGS(pos)))

/* Categories 'cat' and WARN_REGEXP together, raised with Perl_ck_warner_d(). */
#define ckWARNregdep(pos, cat, msg)                                         \
    _WARN_HELPER(pos, packWARN2(cat, WARN_REGEXP),                          \
                 Perl_ck_warner_d(aTHX_ packWARN2(cat, WARN_REGEXP),        \
                                  msg REPORT_LOCATION,                      \
                                  REPORT_LOCATION_ARGS(pos)))
1081*f2a19305Safresh1 
/* WARN_REGEXP warnings whose literal format 'fmt' takes one value, 'val'. */

/* Raised with Perl_ck_warner_d(). */
#define ckWARN2reg_d(pos, fmt, val)                                         \
    _WARN_HELPER(pos, packWARN(WARN_REGEXP),                                \
                 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),              \
                                  fmt REPORT_LOCATION,                      \
                                  val, REPORT_LOCATION_ARGS(pos)))

/* Raised with Perl_ck_warner(). */
#define ckWARN2reg(pos, fmt, val)                                           \
    _WARN_HELPER(pos, packWARN(WARN_REGEXP),                                \
                 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),                \
                                fmt REPORT_LOCATION,                        \
                                val, REPORT_LOCATION_ARGS(pos)))
1093*f2a19305Safresh1 
/* WARN_REGEXP warnings whose literal format 'fmt' takes two values. */

/* Raised with Perl_warner(). */
#define vWARN3(pos, fmt, val1, val2)                                        \
    _WARN_HELPER(pos, packWARN(WARN_REGEXP),                                \
                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),                   \
                             fmt REPORT_LOCATION,                           \
                             val1, val2, REPORT_LOCATION_ARGS(pos)))

/* Raised with Perl_ck_warner(). */
#define ckWARN3reg(pos, fmt, val1, val2)                                    \
    _WARN_HELPER(pos, packWARN(WARN_REGEXP),                                \
                 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),                \
                                fmt REPORT_LOCATION,                        \
                                val1, val2, REPORT_LOCATION_ARGS(pos)))
1106*f2a19305Safresh1 
/* WARN_REGEXP warnings whose literal format 'fmt' takes three values (vWARN4,
 * ckWARN4reg) or four values (vWARN5). */

/* Three values, raised with Perl_warner(). */
#define vWARN4(pos, fmt, val1, val2, val3)                              \
    _WARN_HELPER(pos, packWARN(WARN_REGEXP),                            \
                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),               \
                             fmt REPORT_LOCATION,                       \
                             val1, val2, val3,                          \
                             REPORT_LOCATION_ARGS(pos)))

/* Three values, raised with Perl_ck_warner(). */
#define ckWARN4reg(pos, fmt, val1, val2, val3)                          \
    _WARN_HELPER(pos, packWARN(WARN_REGEXP),                            \
                 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),            \
                                fmt REPORT_LOCATION,                    \
                                val1, val2, val3,                       \
                                REPORT_LOCATION_ARGS(pos)))

/* Four values, raised with Perl_warner(). */
#define vWARN5(pos, fmt, val1, val2, val3, val4)                        \
    _WARN_HELPER(pos, packWARN(WARN_REGEXP),                            \
                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),               \
                             fmt REPORT_LOCATION,                       \
                             val1, val2, val3, val4,                    \
                             REPORT_LOCATION_ARGS(pos)))
1127*f2a19305Safresh1 
/* Warn about an experimental feature at most once per compilation.  The
 * per-category flag RExC_warned_<warn_class> is tested and then set before
 * the warning is attempted, so later invocations for the same category do
 * nothing.  'msg' must be a string literal taking no arguments. */
#define ckWARNexperimental(pos, warn_class, msg)                        \
    STMT_START {                                                        \
        if (! RExC_warned_ ## warn_class) {                             \
            RExC_warned_ ## warn_class = 1;                             \
            _WARN_HELPER(pos, packWARN(warn_class),                     \
                         Perl_ck_warner_d(aTHX_ packWARN(warn_class),   \
                                          msg REPORT_LOCATION,          \
                                          REPORT_LOCATION_ARGS(pos)));  \
        }                                                               \
    } STMT_END

/* As ckWARNexperimental(), for a literal format 'fmt' that takes one value,
 * 'val'. */
#define ckWARNexperimental_with_arg(pos, warn_class, fmt, val)          \
    STMT_START {                                                        \
        if (! RExC_warned_ ## warn_class) {                             \
            RExC_warned_ ## warn_class = 1;                             \
            _WARN_HELPER(pos, packWARN(warn_class),                     \
                         Perl_ck_warner_d(aTHX_ packWARN(warn_class),   \
                                          fmt REPORT_LOCATION,          \
                                          val,                          \
                                          REPORT_LOCATION_ARGS(pos)));  \
        }                                                               \
    } STMT_END
1149*f2a19305Safresh1 
/* Translate between a node pointer and the node's offset from the start of
 * the program (RExC_emit_start). */

/* Offset -> pointer. */
#define REGNODE_p(off)      (RExC_emit_start + (off))

/* Pointer -> offset, as an SSize_t.  __ASSERT_() checks that the pointer does
 * not lie before RExC_emit_start. */
#define REGNODE_OFFSET(rn)                                              \
    (__ASSERT_((rn) >= RExC_emit_start)                                 \
     (SSize_t) ((rn) - RExC_emit_start))
1155*f2a19305Safresh1 
/* Accessors for the 'proglen' member of the structure 'ri' points to.
 *
 * Both the parameters and the whole expansion are parenthesized so that an
 * argument such as 'p + 1' or 'cond ? a : b' binds the way the caller wrote
 * it, and so that the result can be embedded in a larger expression.
 * ProgLen() still yields an lvalue; SetProgLen() yields the assigned value. */
#define ProgLen(ri)      ((ri)->proglen)
#define SetProgLen(ri,x) ((ri)->proglen = (x))
1158*f2a19305Safresh1 
/* EXPERIMENTAL_INPLACESCAN is defined exactly when the build has
 * PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS set to a non-zero value. */
#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
#  define EXPERIMENTAL_INPLACESCAN
#endif /* PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS */
1162*f2a19305Safresh1 
/* Debugging aid.  Inside DEBUG_OPTIMISE_MORE_r() it prints "RExC_seen: ",
 * then, for every REG_*_SEEN bit listed below that is set in RExC_seen, the
 * bit's name followed by a space, and finally a newline.  Each bit is handled
 * by DEBUG_SHOW_STUDY_FLAG(), which prints the stringized flag name.
 *
 * Note that the expansion already ends in ';'. */
#define DEBUG_RExC_seen()                                                   \
        DEBUG_OPTIMISE_MORE_r({                                             \
            Perl_re_printf( aTHX_ "RExC_seen: ");                           \
                                                                            \
            DEBUG_SHOW_STUDY_FLAG(RExC_seen, REG_ZERO_LEN_SEEN);            \
            DEBUG_SHOW_STUDY_FLAG(RExC_seen, REG_LOOKBEHIND_SEEN);          \
            DEBUG_SHOW_STUDY_FLAG(RExC_seen, REG_GPOS_SEEN);                \
            DEBUG_SHOW_STUDY_FLAG(RExC_seen, REG_RECURSE_SEEN);             \
            DEBUG_SHOW_STUDY_FLAG(RExC_seen, REG_TOP_LEVEL_BRANCHES_SEEN);  \
            DEBUG_SHOW_STUDY_FLAG(RExC_seen, REG_VERBARG_SEEN);             \
            DEBUG_SHOW_STUDY_FLAG(RExC_seen, REG_CUTGROUP_SEEN);            \
            DEBUG_SHOW_STUDY_FLAG(RExC_seen, REG_RUN_ON_COMMENT_SEEN);      \
            DEBUG_SHOW_STUDY_FLAG(RExC_seen, REG_UNFOLDED_MULTI_SEEN);      \
            DEBUG_SHOW_STUDY_FLAG(RExC_seen, REG_UNBOUNDED_QUANTIFIER_SEEN);\
            DEBUG_SHOW_STUDY_FLAG(RExC_seen, REG_PESSIMIZE_SEEN);           \
                                                                            \
            Perl_re_printf( aTHX_ "\n");                                    \
        });
1202*f2a19305Safresh1 
/* Print the name of 'flag' (its spelling at the call site) followed by a
 * space when that bit is set in 'flags'.
 *
 * The test lives inside STMT_START/STMT_END so the macro is a single
 * statement: used as the body of an unbraced 'if', a following 'else' pairs
 * with the caller's 'if' rather than with the one in here.  'flag' is
 * parenthesized in the test so a compound mask such as A|B is applied as a
 * whole. */
#define DEBUG_SHOW_STUDY_FLAG(flags,flag)                       \
    STMT_START {                                                \
        if ((flags) & (flag))                                   \
            Perl_re_printf( aTHX_  "%s ", #flag);               \
    } STMT_END
1205*f2a19305Safresh1 
1206*f2a19305Safresh1 
/* Debug hooks: without DEBUGGING both are NOOP; with it they forward to
 * debug_studydata() and debug_peep().  DEBUG_PEEP additionally passes the
 * caller's 'pRExC_state', which must therefore be in scope where it is
 * used. */
#ifndef DEBUGGING
#  define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP
#  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
#else
#  define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \
                    debug_studydata(where, data, depth, is_inf, min, stopmin, delta)

#  define DEBUG_PEEP(str, scan, depth, flags)   \
                    debug_peep(str, pRExC_state, scan, depth, flags)
#endif
1217*f2a19305Safresh1 
/* Call regtail() one level deeper than the caller's 'depth' variable, which
 * must be in scope at the point of use. */
#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)

/* With DEBUGGING this calls regtail_study(); otherwise it is the same call as
 * REGTAIL(). */
#ifndef DEBUGGING
#  define REGTAIL_STUDY(x,y,z) REGTAIL(x,y,z)
#else
#  define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
#endif
1224*f2a19305Safresh1 
/* Three distinct single-bit values.  Presumably result flags reported by the
 * trie-building code -- the users are not in this header; confirm there. */
#define MADE_TRIE       0x1
#define MADE_JUMP_TRIE  0x2
#define MADE_EXACT_TRIE 0x4

/* Consecutive slot numbers starting at 0.  Presumably positions within an
 * array that holds an inversion list and its companion data -- confirm
 * against the code that indexes with them. */
#define INVLIST_INDEX                   0
#define ONLY_LOCALE_MATCHES_INDEX       1
#define DEFERRED_USER_DEFINED_INDEX     2
1232*f2a19305Safresh1 
/* ssc_init_zero() and ssc_init() currently do the exact same thing, so the
 * former is just another name for the latter. */
#define ssc_init_zero               ssc_init

/* Add the single code point 'cp', expressed as the range cp..cp.  'cp' is
 * evaluated twice. */
#define ssc_add_cp(node, cp)        ssc_add_range((node), (cp), (cp))

/* Add the whole range 0..UV_MAX. */
#define ssc_match_all_cp(node)      ssc_add_range((node), 0, UV_MAX)
1238*f2a19305Safresh1 
/* Without DEBUGGING this calls regnode_guts(), which is not given 'op' (the
 * argument is dropped, hence never evaluated).  With DEBUGGING it calls
 * regnode_guts_debug(), which receives 'op' as well. */
#ifndef DEBUGGING
#  define REGNODE_GUTS(state,op,extra_size) \
    regnode_guts(state,extra_size)
#else
#  define REGNODE_GUTS(state,op,extra_size) \
    regnode_guts_debug(state,op,extra_size)
#endif
1246*f2a19305Safresh1 
/* If the caller's 'optstart' is set: under DEBUG_OPTIMISE_r print how many
 * nodes lie between it and the caller's 'node', then reset 'optstart' to
 * NULL.  Both variables must be in scope where the macro is used.
 *
 * The 'if' is kept inside STMT_START/STMT_END so the macro is one complete
 * statement: used as the body of an unbraced 'if', a following 'else' pairs
 * with the caller's 'if' rather than with the test on 'optstart'. */
#define CLEAR_OPTSTART                                                      \
    STMT_START {                                                            \
        if (optstart) {                                                     \
            DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                          \
                          " (%" IVdf " nodes)\n", (IV)(node - optstart)));  \
            optstart=NULL;                                                  \
        }                                                                   \
    } STMT_END
1253*f2a19305Safresh1 
/* Run CLEAR_OPTSTART, then call dumpuntil() on the range (b)..(e) with the
 * indent and depth each increased by one, storing the result in the caller's
 * 'node'.  The caller must have r, start, last, sv, indent, depth and node in
 * scope.
 *
 * Both steps are wrapped in STMT_START/STMT_END, and the expansion no longer
 * carries its own ';', so 'DUMPUNTIL(b, e);' is exactly one statement.  Under
 * an unbraced 'if' the dumpuntil() call is therefore guarded too, and an
 * 'else' may follow. */
#define DUMPUNTIL(b,e)                                                  \
    STMT_START {                                                        \
        CLEAR_OPTSTART;                                                 \
        node = dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);     \
    } STMT_END
1257*f2a19305Safresh1 
/* Set the NEXT_OFF of the node at offset 'off' to the combined size of the two
 * types 'first_t' and 'second_t', measured in units of sizeof(regnode)
 * (integer division).  This is an assignment expression; the caller supplies
 * the terminating ';'. */
#define REGNODE_STEP_OVER(off, first_t, second_t)                       \
    NEXT_OFF(REGNODE_p(off)) =                                          \
        ((sizeof(first_t)+sizeof(second_t))/sizeof(regnode))
1260*f2a19305Safresh1 
1261*f2a19305Safresh1 #endif /* REGCOMP_INTERNAL_H */
1262