1 /*    regcomp.c
2  */
3 
4 /*
5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
6  *
7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8  */
9 
10 /* This file contains functions for compiling a regular expression.  See
11  * also regexec.c which funnily enough, contains functions for executing
12  * a regular expression.
13  *
14  * This file is also copied at build time to ext/re/re_comp.c, where
15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16  * This causes the main functions to be compiled under new names and with
17  * debugging support added, which makes "use re 'debug'" work.
18  */
19 
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21  * confused with the original package (see point 3 below).  Thanks, Henry!
22  */
23 
24 /* Additional note: this code is very heavily munged from Henry's version
25  * in places.  In some spots I've traded clarity for efficiency, so don't
26  * blame Henry for some of the lack of readability.
27  */
28 
29 /* The names of the functions have been changed from regcomp and
30  * regexec to pregcomp and pregexec in order to avoid conflicts
31  * with the POSIX routines of the same names.
32 */
33 
34 #ifdef PERL_EXT_RE_BUILD
35 #include "re_top.h"
36 #endif
37 
38 /*
39  * pregcomp and pregexec -- regsub and regerror are not used in perl
40  *
41  *	Copyright (c) 1986 by University of Toronto.
42  *	Written by Henry Spencer.  Not derived from licensed software.
43  *
44  *	Permission is granted to anyone to use this software for any
45  *	purpose on any computer system, and to redistribute it freely,
46  *	subject to the following restrictions:
47  *
48  *	1. The author is not responsible for the consequences of use of
49  *		this software, no matter how awful, even if they arise
50  *		from defects in it.
51  *
52  *	2. The origin of this software must not be misrepresented, either
53  *		by explicit claim or by omission.
54  *
55  *	3. Altered versions must be plainly marked as such, and must not
56  *		be misrepresented as being the original software.
57  *
58  *
59  ****    Alterations to Henry's code are...
60  ****
61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63  ****    by Larry Wall and others
64  ****
65  ****    You may distribute under the terms of either the GNU General Public
66  ****    License or the Artistic License, as specified in the README file.
67 
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 #include "EXTERN.h"
74 #define PERL_IN_REGCOMP_C
75 #include "perl.h"
76 
77 #define REG_COMP_C
78 #ifdef PERL_IN_XSUB_RE
79 #  include "re_comp.h"
80 EXTERN_C const struct regexp_engine my_reg_engine;
81 #else
82 #  include "regcomp.h"
83 #endif
84 
85 #include "dquote_inline.h"
86 #include "invlist_inline.h"
87 #include "unicode_constants.h"
88 
89 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
90  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
91 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
92  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
93 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
95 
96 #ifndef STATIC
97 #define	STATIC	static
98 #endif
99 
100 /* this is a chain of data about sub patterns we are processing that
101    need to be handled separately/specially in study_chunk. Its so
102    we can simulate recursion without losing state.  */
103 struct scan_frame;
104 typedef struct scan_frame {
105     regnode *last_regnode;      /* last node to process in this frame */
106     regnode *next_regnode;      /* next node to process when last is reached */
107     U32 prev_recursed_depth;
108     I32 stopparen;              /* what stopparen do we use */
109     bool in_gosub;              /* this or an outer frame is for GOSUB */
110 
111     struct scan_frame *this_prev_frame; /* this previous frame */
112     struct scan_frame *prev_frame;      /* previous frame */
113     struct scan_frame *next_frame;      /* next frame */
114 } scan_frame;
115 
116 /* Certain characters are output as a sequence with the first being a
117  * backslash. */
118 #define isBACKSLASHED_PUNCT(c)  strchr("-[]\\^", c)
119 
120 
121 struct RExC_state_t {
122     U32		flags;			/* RXf_* are we folding, multilining? */
123     U32		pm_flags;		/* PMf_* stuff from the calling PMOP */
124     char	*precomp;		/* uncompiled string. */
125     char	*precomp_end;		/* pointer to end of uncompiled string. */
126     REGEXP	*rx_sv;			/* The SV that is the regexp. */
127     regexp	*rx;                    /* perl core regexp structure */
128     regexp_internal	*rxi;           /* internal data for regexp object
129                                            pprivate field */
130     char	*start;			/* Start of input for compile */
131     char	*end;			/* End of input for compile */
132     char	*parse;			/* Input-scan pointer. */
133     char        *copy_start;            /* start of copy of input within
134                                            constructed parse string */
135     char        *save_copy_start;       /* Provides one level of saving
136                                            and restoring 'copy_start' */
137     char        *copy_start_in_input;   /* Position in input string
138                                            corresponding to copy_start */
139     SSize_t	whilem_seen;		/* number of WHILEM in this expr */
140     regnode	*emit_start;		/* Start of emitted-code area */
141     regnode_offset emit;		/* Code-emit pointer */
142     I32		naughty;		/* How bad is this pattern? */
143     I32		sawback;		/* Did we see \1, ...? */
144     U32		seen;
145     SSize_t	size;			/* Number of regnode equivalents in
146                                            pattern */
147 
148     /* position beyond 'precomp' of the warning message furthest away from
149      * 'precomp'.  During the parse, no warnings are raised for any problems
150      * earlier in the parse than this position.  This works if warnings are
151      * raised the first time a given spot is parsed, and if only one
152      * independent warning is raised for any given spot */
153     Size_t	latest_warn_offset;
154 
155     I32         npar;                   /* Capture buffer count so far in the
156                                            parse, (OPEN) plus one. ("par" 0 is
157                                            the whole pattern)*/
158     I32         total_par;              /* During initial parse, is either 0,
159                                            or -1; the latter indicating a
160                                            reparse is needed.  After that pass,
161                                            it is what 'npar' became after the
162                                            pass.  Hence, it being > 0 indicates
163                                            we are in a reparse situation */
164     I32		nestroot;		/* root parens we are in - used by
165                                            accept */
166     I32		seen_zerolen;
167     regnode_offset *open_parens;	/* offsets to open parens */
168     regnode_offset *close_parens;	/* offsets to close parens */
169     I32      parens_buf_size;           /* #slots malloced open/close_parens */
170     regnode     *end_op;                /* END node in program */
171     I32		utf8;		/* whether the pattern is utf8 or not */
172     I32		orig_utf8;	/* whether the pattern was originally in utf8 */
173 				/* XXX use this for future optimisation of case
174 				 * where pattern must be upgraded to utf8. */
175     I32		uni_semantics;	/* If a d charset modifier should use unicode
176 				   rules, even if the pattern is not in
177 				   utf8 */
178     HV		*paren_names;		/* Paren names */
179 
180     regnode	**recurse;		/* Recurse regops */
181     I32         recurse_count;          /* Number of recurse regops we have generated */
182     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
183                                            through */
184     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
185     I32		in_lookbehind;
186     I32		contains_locale;
187     I32		override_recoding;
188 #ifdef EBCDIC
189     I32		recode_x_to_native;
190 #endif
191     I32		in_multi_char_class;
192     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
193 					    within pattern */
194     int		code_index;		/* next code_blocks[] slot */
195     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
196     scan_frame *frame_head;
197     scan_frame *frame_last;
198     U32         frame_count;
199     AV         *warn_text;
200     HV         *unlexed_names;
201 #ifdef ADD_TO_REGEXEC
202     char 	*starttry;		/* -Dr: where regtry was called. */
203 #define RExC_starttry	(pRExC_state->starttry)
204 #endif
205     SV		*runtime_code_qr;	/* qr with the runtime code blocks */
206 #ifdef DEBUGGING
207     const char  *lastparse;
208     I32         lastnum;
209     AV          *paren_name_list;       /* idx -> name */
210     U32         study_chunk_recursed_count;
211     SV          *mysv1;
212     SV          *mysv2;
213 
214 #define RExC_lastparse	(pRExC_state->lastparse)
215 #define RExC_lastnum	(pRExC_state->lastnum)
216 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
217 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
218 #define RExC_mysv	(pRExC_state->mysv1)
219 #define RExC_mysv1	(pRExC_state->mysv1)
220 #define RExC_mysv2	(pRExC_state->mysv2)
221 
222 #endif
223     bool        seen_d_op;
224     bool        strict;
225     bool        study_started;
226     bool        in_script_run;
227     bool        use_BRANCHJ;
228 };
229 
230 #define RExC_flags	(pRExC_state->flags)
231 #define RExC_pm_flags	(pRExC_state->pm_flags)
232 #define RExC_precomp	(pRExC_state->precomp)
233 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
234 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
235 #define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
236 #define RExC_precomp_end (pRExC_state->precomp_end)
237 #define RExC_rx_sv	(pRExC_state->rx_sv)
238 #define RExC_rx		(pRExC_state->rx)
239 #define RExC_rxi	(pRExC_state->rxi)
240 #define RExC_start	(pRExC_state->start)
241 #define RExC_end	(pRExC_state->end)
242 #define RExC_parse	(pRExC_state->parse)
243 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
244 #define RExC_whilem_seen	(pRExC_state->whilem_seen)
245 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
246                                                    under /d from /u ? */
247 
248 
249 #ifdef RE_TRACK_PATTERN_OFFSETS
250 #  define RExC_offsets	(RExC_rxi->u.offsets) /* I am not like the
251                                                          others */
252 #endif
253 #define RExC_emit	(pRExC_state->emit)
254 #define RExC_emit_start	(pRExC_state->emit_start)
255 #define RExC_sawback	(pRExC_state->sawback)
256 #define RExC_seen	(pRExC_state->seen)
257 #define RExC_size	(pRExC_state->size)
258 #define RExC_maxlen        (pRExC_state->maxlen)
259 #define RExC_npar	(pRExC_state->npar)
260 #define RExC_total_parens	(pRExC_state->total_par)
261 #define RExC_parens_buf_size	(pRExC_state->parens_buf_size)
262 #define RExC_nestroot   (pRExC_state->nestroot)
263 #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
264 #define RExC_utf8	(pRExC_state->utf8)
265 #define RExC_uni_semantics	(pRExC_state->uni_semantics)
266 #define RExC_orig_utf8	(pRExC_state->orig_utf8)
267 #define RExC_open_parens	(pRExC_state->open_parens)
268 #define RExC_close_parens	(pRExC_state->close_parens)
269 #define RExC_end_op	(pRExC_state->end_op)
270 #define RExC_paren_names	(pRExC_state->paren_names)
271 #define RExC_recurse	(pRExC_state->recurse)
272 #define RExC_recurse_count	(pRExC_state->recurse_count)
273 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
274 #define RExC_study_chunk_recursed_bytes  \
275                                    (pRExC_state->study_chunk_recursed_bytes)
276 #define RExC_in_lookbehind	(pRExC_state->in_lookbehind)
277 #define RExC_contains_locale	(pRExC_state->contains_locale)
278 #ifdef EBCDIC
279 #   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
280 #endif
281 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
282 #define RExC_frame_head (pRExC_state->frame_head)
283 #define RExC_frame_last (pRExC_state->frame_last)
284 #define RExC_frame_count (pRExC_state->frame_count)
285 #define RExC_strict (pRExC_state->strict)
286 #define RExC_study_started      (pRExC_state->study_started)
287 #define RExC_warn_text (pRExC_state->warn_text)
288 #define RExC_in_script_run      (pRExC_state->in_script_run)
289 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
290 #define RExC_unlexed_names (pRExC_state->unlexed_names)
291 
292 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
293  * a flag to disable back-off on the fixed/floating substrings - if it's
294  * a high complexity pattern we assume the benefit of avoiding a full match
295  * is worth the cost of checking for the substrings even if they rarely help.
296  */
297 #define RExC_naughty	(pRExC_state->naughty)
298 #define TOO_NAUGHTY (10)
299 #define MARK_NAUGHTY(add) \
300     if (RExC_naughty < TOO_NAUGHTY) \
301         RExC_naughty += (add)
302 #define MARK_NAUGHTY_EXP(exp, add) \
303     if (RExC_naughty < TOO_NAUGHTY) \
304         RExC_naughty += RExC_naughty / (exp) + (add)
305 
306 #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
307 #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
308 	((*s) == '{' && regcurly(s)))
309 
310 /*
311  * Flags to be passed up and down.
312  */
313 #define	WORST		0	/* Worst case. */
314 #define	HASWIDTH	0x01	/* Known to not match null strings, could match
315                                    non-null ones. */
316 
317 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
318  * character.  (There needs to be a case: in the switch statement in regexec.c
319  * for any node marked SIMPLE.)  Note that this is not the same thing as
320  * REGNODE_SIMPLE */
321 #define	SIMPLE		0x02
322 #define	SPSTART		0x04	/* Starts with * or + */
323 #define POSTPONED	0x08    /* (?1),(?&name), (??{...}) or similar */
324 #define TRYAGAIN	0x10	/* Weeded out a declaration. */
325 #define RESTART_PARSE   0x20    /* Need to redo the parse */
326 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
327                                    calcuate sizes as UTF-8 */
328 
329 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
330 
331 /* whether trie related optimizations are enabled */
332 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
333 #define TRIE_STUDY_OPT
334 #define FULL_TRIE_STUDY
335 #define TRIE_STCLASS
336 #endif
337 
338 
339 
340 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
341 #define PBITVAL(paren) (1 << ((paren) & 7))
342 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
343 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
344 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
345 
346 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
347                                      if (!UTF) {                           \
348                                          *flagp = RESTART_PARSE|NEED_UTF8; \
349                                          return 0;                         \
350                                      }                                     \
351                              } STMT_END
352 
353 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
354  * a flag that indicates we need to override /d with /u as a result of
355  * something in the pattern.  It should only be used in regards to calling
356  * set_regex_charset() or get_regex_charse() */
357 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
358     STMT_START {                                                            \
359             if (DEPENDS_SEMANTICS) {                                        \
360                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
361                 RExC_uni_semantics = 1;                                     \
362                 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
363                     /* No need to restart the parse if we haven't seen      \
364                      * anything that differs between /u and /d, and no need \
365                      * to restart immediately if we're going to reparse     \
366                      * anyway to count parens */                            \
367                     *flagp |= RESTART_PARSE;                                \
368                     return restart_retval;                                  \
369                 }                                                           \
370             }                                                               \
371     } STMT_END
372 
373 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
374     STMT_START {                                                            \
375                 RExC_use_BRANCHJ = 1;                                       \
376                 *flagp |= RESTART_PARSE;                                    \
377                 return restart_retval;                                      \
378     } STMT_END
379 
380 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
381  * less.  After that, it must always be positive, because the whole re is
382  * considered to be surrounded by virtual parens.  Setting it to negative
383  * indicates there is some construct that needs to know the actual number of
384  * parens to be properly handled.  And that means an extra pass will be
385  * required after we've counted them all */
386 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
387 #define REQUIRE_PARENS_PASS                                                 \
388     STMT_START {  /* No-op if have completed a pass */                      \
389                     if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
390     } STMT_END
391 #define IN_PARENS_PASS (RExC_total_parens < 0)
392 
393 
394 /* This is used to return failure (zero) early from the calling function if
395  * various flags in 'flags' are set.  Two flags always cause a return:
396  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
397  * additional flags that should cause a return; 0 if none.  If the return will
398  * be done, '*flagp' is first set to be all of the flags that caused the
399  * return. */
400 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
401     STMT_START {                                                            \
402             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
403                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
404                 return 0;                                                   \
405             }                                                               \
406     } STMT_END
407 
408 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
409 
410 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
411                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
412 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
413                                     if (MUST_RESTART(*(flagp))) return 0
414 
415 /* This converts the named class defined in regcomp.h to its equivalent class
416  * number defined in handy.h. */
417 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
418 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
419 
420 #define _invlist_union_complement_2nd(a, b, output) \
421                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
422 #define _invlist_intersection_complement_2nd(a, b, output) \
423                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
424 
425 /* About scan_data_t.
426 
427   During optimisation we recurse through the regexp program performing
428   various inplace (keyhole style) optimisations. In addition study_chunk
429   and scan_commit populate this data structure with information about
430   what strings MUST appear in the pattern. We look for the longest
431   string that must appear at a fixed location, and we look for the
432   longest string that may appear at a floating location. So for instance
433   in the pattern:
434 
435     /FOO[xX]A.*B[xX]BAR/
436 
437   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
438   strings (because they follow a .* construct). study_chunk will identify
439   both FOO and BAR as being the longest fixed and floating strings respectively.
440 
441   The strings can be composites, for instance
442 
443      /(f)(o)(o)/
444 
445   will result in a composite fixed substring 'foo'.
446 
447   For each string some basic information is maintained:
448 
449   - min_offset
450     This is the position the string must appear at, or not before.
451     It also implicitly (when combined with minlenp) tells us how many
452     characters must match before the string we are searching for.
453     Likewise when combined with minlenp and the length of the string it
454     tells us how many characters must appear after the string we have
455     found.
456 
457   - max_offset
458     Only used for floating strings. This is the rightmost point that
459     the string can appear at. If set to SSize_t_MAX it indicates that the
460     string can occur infinitely far to the right.
461     For fixed strings, it is equal to min_offset.
462 
463   - minlenp
464     A pointer to the minimum number of characters of the pattern that the
465     string was found inside. This is important as in the case of positive
466     lookahead or positive lookbehind we can have multiple patterns
467     involved. Consider
468 
469     /(?=FOO).*F/
470 
471     The minimum length of the pattern overall is 3, the minimum length
472     of the lookahead part is 3, but the minimum length of the part that
473     will actually match is 1. So 'FOO's minimum length is 3, but the
474     minimum length for the F is 1. This is important as the minimum length
475     is used to determine offsets in front of and behind the string being
476     looked for.  Since strings can be composites this is the length of the
477     pattern at the time it was committed with a scan_commit. Note that
478     the length is calculated by study_chunk, so that the minimum lengths
479     are not known until the full pattern has been compiled, thus the
480     pointer to the value.
481 
482   - lookbehind
483 
484     In the case of lookbehind the string being searched for can be
485     offset past the start point of the final matching string.
486     If this value was just blithely removed from the min_offset it would
487     invalidate some of the calculations for how many chars must match
488     before or after (as they are derived from min_offset and minlen and
489     the length of the string being searched for).
490     When the final pattern is compiled and the data is moved from the
491     scan_data_t structure into the regexp structure the information
492     about lookbehind is factored in, with the information that would
493     have been lost precalculated in the end_shift field for the
494     associated string.
495 
496   The fields pos_min and pos_delta are used to store the minimum offset
497   and the delta to the maximum offset at the current point in the pattern.
498 
499 */
500 
501 struct scan_data_substrs {
502     SV      *str;       /* longest substring found in pattern */
503     SSize_t min_offset; /* earliest point in string it can appear */
504     SSize_t max_offset; /* latest point in string it can appear */
505     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
506     SSize_t lookbehind; /* is the pos of the string modified by LB */
507     I32 flags;          /* per substring SF_* and SCF_* flags */
508 };
509 
510 typedef struct scan_data_t {
511     /*I32 len_min;      unused */
512     /*I32 len_delta;    unused */
513     SSize_t pos_min;
514     SSize_t pos_delta;
515     SV *last_found;
516     SSize_t last_end;	    /* min value, <0 unless valid. */
517     SSize_t last_start_min;
518     SSize_t last_start_max;
519     U8      cur_is_floating; /* whether the last_* values should be set as
520                               * the next fixed (0) or floating (1)
521                               * substring */
522 
523     /* [0] is longest fixed substring so far, [1] is longest float so far */
524     struct scan_data_substrs  substrs[2];
525 
526     I32 flags;             /* common SF_* and SCF_* flags */
527     I32 whilem_c;
528     SSize_t *last_closep;
529     regnode_ssc *start_class;
530 } scan_data_t;
531 
532 /*
533  * Forward declarations for pregcomp()'s friends.
534  */
535 
536 static const scan_data_t zero_scan_data = {
537     0, 0, NULL, 0, 0, 0, 0,
538     {
539         { NULL, 0, 0, 0, 0, 0 },
540         { NULL, 0, 0, 0, 0, 0 },
541     },
542     0, 0, NULL, NULL
543 };
544 
545 /* study flags */
546 
547 #define SF_BEFORE_SEOL		0x0001
548 #define SF_BEFORE_MEOL		0x0002
549 #define SF_BEFORE_EOL		(SF_BEFORE_SEOL|SF_BEFORE_MEOL)
550 
551 #define SF_IS_INF		0x0040
552 #define SF_HAS_PAR		0x0080
553 #define SF_IN_PAR		0x0100
554 #define SF_HAS_EVAL		0x0200
555 
556 
557 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
558  * longest substring in the pattern. When it is not set the optimiser keeps
559  * track of position, but does not keep track of the actual strings seen,
560  *
561  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
562  * /foo/i will not.
563  *
564  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
565  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
566  * turned off because of the alternation (BRANCH). */
567 #define SCF_DO_SUBSTR		0x0400
568 
569 #define SCF_DO_STCLASS_AND	0x0800
570 #define SCF_DO_STCLASS_OR	0x1000
571 #define SCF_DO_STCLASS		(SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
572 #define SCF_WHILEM_VISITED_POS	0x2000
573 
574 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
575 #define SCF_SEEN_ACCEPT         0x8000
576 #define SCF_TRIE_DOING_RESTUDY 0x10000
577 #define SCF_IN_DEFINE          0x20000
578 
579 
580 
581 
582 #define UTF cBOOL(RExC_utf8)
583 
584 /* The enums for all these are ordered so things work out correctly */
585 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
586 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
587                                                      == REGEX_DEPENDS_CHARSET)
588 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
589 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
590                                                      >= REGEX_UNICODE_CHARSET)
591 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
592                                             == REGEX_ASCII_RESTRICTED_CHARSET)
593 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
594                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
595 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
596                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
597 
598 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
599 
600 /* For programs that want to be strictly Unicode compatible by dying if any
601  * attempt is made to match a non-Unicode code point against a Unicode
602  * property.  */
603 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
604 
605 #define OOB_NAMEDCLASS		-1
606 
607 /* There is no code point that is out-of-bounds, so this is problematic.  But
608  * its only current use is to initialize a variable that is always set before
609  * looked at. */
610 #define OOB_UNICODE		0xDEADBEEF
611 
612 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
613 
614 
615 /* length of regex to show in messages that don't mark a position within */
616 #define RegexLengthToShowInErrorMessages 127
617 
618 /*
619  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
620  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
621  * op/pragma/warn/regcomp.
622  */
623 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
624 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
625 
626 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
627                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
628 
629 /* The code in this file in places uses one level of recursion with parsing
630  * rebased to an alternate string constructed by us in memory.  This can take
631  * the form of something that is completely different from the input, or
632  * something that uses the input as part of the alternate.  In the first case,
633  * there should be no possibility of an error, as we are in complete control of
634  * the alternate string.  But in the second case we don't completely control
635  * the input portion, so there may be errors in that.  Here's an example:
636  *      /[abc\x{DF}def]/ui
637  * is handled specially because \x{df} folds to a sequence of more than one
638  * character: 'ss'.  What is done is to create and parse an alternate string,
639  * which looks like this:
640  *      /(?:\x{DF}|[abc\x{DF}def])/ui
641  * where it uses the input unchanged in the middle of something it constructs,
642  * which is a branch for the DF outside the character class, and clustering
643  * parens around the whole thing. (It knows enough to skip the DF inside the
644  * class while in this substitute parse.) 'abc' and 'def' may have errors that
645  * need to be reported.  The general situation looks like this:
646  *
647  *                                       |<------- identical ------>|
648  *              sI                       tI               xI       eI
649  * Input:       ---------------------------------------------------------------
650  * Constructed:         ---------------------------------------------------
651  *                      sC               tC               xC       eC     EC
652  *                                       |<------- identical ------>|
653  *
654  * sI..eI   is the portion of the input pattern we are concerned with here.
655  * sC..EC   is the constructed substitute parse string.
656  *  sC..tC  is constructed by us
657  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
658  *          In the diagram, these are vertically aligned.
659  *  eC..EC  is also constructed by us.
660  * xC       is the position in the substitute parse string where we found a
661  *          problem.
662  * xI       is the position in the original pattern corresponding to xC.
663  *
664  * We want to display a message showing the real input string.  Thus we need to
665  * translate from xC to xI.  We know that xC >= tC, since the portion of the
666  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
667  * get:
668  *      xI = tI + (xC - tC)
669  *
670  * When the substitute parse is constructed, the code needs to set:
671  *      RExC_start (sC)
672  *      RExC_end (eC)
673  *      RExC_copy_start_in_input  (tI)
674  *      RExC_copy_start_in_constructed (tC)
675  * and restore them when done.
676  *
677  * During normal processing of the input pattern, both
678  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
679  * sI, so that xC equals xI.
680  */
681 
682 #define sI              RExC_precomp
683 #define eI              RExC_precomp_end
684 #define sC              RExC_start
685 #define eC              RExC_end
686 #define tI              RExC_copy_start_in_input
687 #define tC              RExC_copy_start_in_constructed
688 #define xI(xC)          (tI + (xC - tC))
689 #define xI_offset(xC)   (xI(xC) - sI)
690 
691 #define REPORT_LOCATION_ARGS(xC)                                            \
692     UTF8fARG(UTF,                                                           \
693              (xI(xC) > eI) /* Don't run off end */                          \
694               ? eI - sI   /* Length before the <--HERE */                   \
695               : ((xI_offset(xC) >= 0)                                       \
696                  ? xI_offset(xC)                                            \
697                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
698                                     IVdf " trying to output message for "   \
699                                     " pattern %.*s",                        \
700                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
701                                     ((int) (eC - sC)), sC), 0)),            \
702              sI),         /* The input pattern printed up to the <--HERE */ \
703     UTF8fARG(UTF,                                                           \
704              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
705              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
706 
707 /* Used to point after bad bytes for an error message, but avoid skipping
708  * past a nul byte. */
709 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
710 
711 /* Set up to clean up after our imminent demise */
712 #define PREPARE_TO_DIE                                                      \
713     STMT_START {					                    \
714         if (RExC_rx_sv)                                                     \
715             SAVEFREESV(RExC_rx_sv);                                         \
716         if (RExC_open_parens)                                               \
717             SAVEFREEPV(RExC_open_parens);                                   \
718         if (RExC_close_parens)                                              \
719             SAVEFREEPV(RExC_close_parens);                                  \
720     } STMT_END
721 
722 /*
723  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
724  * arg. Show regex, up to a maximum length. If it's too long, chop and add
725  * "...".
726  */
727 #define _FAIL(code) STMT_START {					\
728     const char *ellipses = "";						\
729     IV len = RExC_precomp_end - RExC_precomp;				\
730 									\
731     PREPARE_TO_DIE;						        \
732     if (len > RegexLengthToShowInErrorMessages) {			\
733 	/* chop 10 shorter than the max, to ensure meaning of "..." */	\
734 	len = RegexLengthToShowInErrorMessages - 10;			\
735 	ellipses = "...";						\
736     }									\
737     code;                                                               \
738 } STMT_END
739 
740 #define	FAIL(msg) _FAIL(			    \
741     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",	    \
742 	    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
743 
744 #define	FAIL2(msg,arg) _FAIL(			    \
745     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",	    \
746 	    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
747 
748 /*
749  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
750  */
751 #define	Simple_vFAIL(m) STMT_START {					\
752     Perl_croak(aTHX_ "%s" REPORT_LOCATION,				\
753 	    m, REPORT_LOCATION_ARGS(RExC_parse));	                \
754 } STMT_END
755 
756 /*
757  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
758  */
759 #define	vFAIL(m) STMT_START {				\
760     PREPARE_TO_DIE;                                     \
761     Simple_vFAIL(m);					\
762 } STMT_END
763 
764 /*
765  * Like Simple_vFAIL(), but accepts two arguments.
766  */
767 #define	Simple_vFAIL2(m,a1) STMT_START {			\
768     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,		\
769                       REPORT_LOCATION_ARGS(RExC_parse));	\
770 } STMT_END
771 
772 /*
773  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
774  */
775 #define	vFAIL2(m,a1) STMT_START {			\
776     PREPARE_TO_DIE;                                     \
777     Simple_vFAIL2(m, a1);				\
778 } STMT_END
779 
780 
781 /*
782  * Like Simple_vFAIL(), but accepts three arguments.
783  */
784 #define	Simple_vFAIL3(m, a1, a2) STMT_START {			\
785     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,		\
786 	    REPORT_LOCATION_ARGS(RExC_parse));	                \
787 } STMT_END
788 
789 /*
790  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
791  */
792 #define	vFAIL3(m,a1,a2) STMT_START {			\
793     PREPARE_TO_DIE;                                     \
794     Simple_vFAIL3(m, a1, a2);				\
795 } STMT_END
796 
797 /*
798  * Like Simple_vFAIL(), but accepts four arguments.
799  */
800 #define	Simple_vFAIL4(m, a1, a2, a3) STMT_START {		\
801     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,	\
802 	    REPORT_LOCATION_ARGS(RExC_parse));	                \
803 } STMT_END
804 
805 #define	vFAIL4(m,a1,a2,a3) STMT_START {			\
806     PREPARE_TO_DIE;                                     \
807     Simple_vFAIL4(m, a1, a2, a3);			\
808 } STMT_END
809 
810 /* A specialized version of vFAIL2 that works with UTF8f */
811 #define vFAIL2utf8f(m, a1) STMT_START {             \
812     PREPARE_TO_DIE;                                 \
813     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,  \
814             REPORT_LOCATION_ARGS(RExC_parse));      \
815 } STMT_END
816 
817 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
818     PREPARE_TO_DIE;                                     \
819     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
820             REPORT_LOCATION_ARGS(RExC_parse));          \
821 } STMT_END
822 
823 /* Setting this to NULL is a signal to not output warnings */
824 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
825     STMT_START {                                                            \
826       RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
827       RExC_copy_start_in_constructed = NULL;                                \
828     } STMT_END
829 #define RESTORE_WARNINGS                                                    \
830     RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
831 
832 /* Since a warning can be generated multiple times as the input is reparsed, we
833  * output it the first time we come to that point in the parse, but suppress it
834  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
835  * generate any warnings */
836 #define TO_OUTPUT_WARNINGS(loc)                                         \
837   (   RExC_copy_start_in_constructed                                    \
838    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
839 
840 /* After we've emitted a warning, we save the position in the input so we don't
841  * output it again */
842 #define UPDATE_WARNINGS_LOC(loc)                                        \
843     STMT_START {                                                        \
844         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
845             RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
846                                                        - RExC_precomp;  \
847         }                                                               \
848     } STMT_END
849 
850 /* 'warns' is the output of the packWARNx macro used in 'code' */
851 #define _WARN_HELPER(loc, warns, code)                                  \
852     STMT_START {                                                        \
853         if (! RExC_copy_start_in_constructed) {                         \
854             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
855                               " expected at '%s'",                      \
856                               __FILE__, __LINE__, loc);                 \
857         }                                                               \
858         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
859             if (ckDEAD(warns))                                          \
860                 PREPARE_TO_DIE;                                         \
861             code;                                                       \
862             UPDATE_WARNINGS_LOC(loc);                                   \
863         }                                                               \
864     } STMT_END
865 
866 /* m is not necessarily a "literal string", in this macro */
867 #define reg_warn_non_literal_string(loc, m)                             \
868     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
869                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
870                                        "%s" REPORT_LOCATION,            \
871                                   m, REPORT_LOCATION_ARGS(loc)))
872 
873 #define	ckWARNreg(loc,m) 					        \
874     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
875                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
876                                           m REPORT_LOCATION,	        \
877 	                                  REPORT_LOCATION_ARGS(loc)))
878 
879 #define	vWARN(loc, m)           				        \
880     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
881                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
882                                        m REPORT_LOCATION,               \
883                                        REPORT_LOCATION_ARGS(loc)))      \
884 
885 #define	vWARN_dep(loc, m)           				        \
886     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
887                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
888                                        m REPORT_LOCATION,               \
889 	                               REPORT_LOCATION_ARGS(loc)))
890 
891 #define	ckWARNdep(loc,m)            				        \
892     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
893                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
894 	                                    m REPORT_LOCATION,          \
895 	                                    REPORT_LOCATION_ARGS(loc)))
896 
897 #define	ckWARNregdep(loc,m)             				    \
898     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
899                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
900                                                       WARN_REGEXP),         \
901 	                                     m REPORT_LOCATION,             \
902 	                                     REPORT_LOCATION_ARGS(loc)))
903 
904 #define	ckWARN2reg_d(loc,m, a1)             				    \
905     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
906                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
907 	                                    m REPORT_LOCATION,              \
908 	                                    a1, REPORT_LOCATION_ARGS(loc)))
909 
910 #define	ckWARN2reg(loc, m, a1)                                              \
911     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
912                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
913                                           m REPORT_LOCATION,	            \
914                                           a1, REPORT_LOCATION_ARGS(loc)))
915 
916 #define	vWARN3(loc, m, a1, a2)          				    \
917     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
918                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
919                                        m REPORT_LOCATION,                   \
920 	                               a1, a2, REPORT_LOCATION_ARGS(loc)))
921 
922 #define	ckWARN3reg(loc, m, a1, a2)          				    \
923     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
924                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
925                                           m REPORT_LOCATION,                \
926 	                                  a1, a2,                           \
927                                           REPORT_LOCATION_ARGS(loc)))
928 
929 #define	vWARN4(loc, m, a1, a2, a3)          				\
930     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
931                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
932                                        m REPORT_LOCATION,               \
933 	                               a1, a2, a3,                      \
934                                        REPORT_LOCATION_ARGS(loc)))
935 
936 #define	ckWARN4reg(loc, m, a1, a2, a3)          			\
937     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
938                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
939                                           m REPORT_LOCATION,            \
940 	                                  a1, a2, a3,                   \
941                                           REPORT_LOCATION_ARGS(loc)))
942 
943 #define	vWARN5(loc, m, a1, a2, a3, a4)          			\
944     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
945                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
946                                        m REPORT_LOCATION,		\
947 	                               a1, a2, a3, a4,                  \
948                                        REPORT_LOCATION_ARGS(loc)))
949 
950 #define	ckWARNexperimental(loc, class, m)                               \
951     _WARN_HELPER(loc, packWARN(class),                                  \
952                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
953                                             m REPORT_LOCATION,          \
954                                             REPORT_LOCATION_ARGS(loc)))
955 
956 /* Convert between a pointer to a node and its offset from the beginning of the
957  * program */
958 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
959 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
960 
961 /* Macros for recording node offsets.   20001227 mjd@plover.com
962  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
963  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
964  * Element 0 holds the number n.
965  * Position is 1 indexed.
966  */
967 #ifndef RE_TRACK_PATTERN_OFFSETS
968 #define Set_Node_Offset_To_R(offset,byte)
969 #define Set_Node_Offset(node,byte)
970 #define Set_Cur_Node_Offset
971 #define Set_Node_Length_To_R(node,len)
972 #define Set_Node_Length(node,len)
973 #define Set_Node_Cur_Length(node,start)
974 #define Node_Offset(n)
975 #define Node_Length(n)
976 #define Set_Node_Offset_Length(node,offset,len)
977 #define ProgLen(ri) ri->u.proglen
978 #define SetProgLen(ri,x) ri->u.proglen = x
979 #define Track_Code(code)
980 #else
981 #define ProgLen(ri) ri->u.offsets[0]
982 #define SetProgLen(ri,x) ri->u.offsets[0] = x
983 #define Set_Node_Offset_To_R(offset,byte) STMT_START {			\
984 	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",		\
985 		    __LINE__, (int)(offset), (int)(byte)));		\
986 	if((offset) < 0) {						\
987 	    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
988                                          (int)(offset));                \
989 	} else {							\
990             RExC_offsets[2*(offset)-1] = (byte);	                \
991 	}								\
992 } STMT_END
993 
994 #define Set_Node_Offset(node,byte)                                      \
995     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
996 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
997 
998 #define Set_Node_Length_To_R(node,len) STMT_START {			\
999 	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",		\
1000 		__LINE__, (int)(node), (int)(len)));			\
1001 	if((node) < 0) {						\
1002 	    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
1003                                          (int)(node));                  \
1004 	} else {							\
1005 	    RExC_offsets[2*(node)] = (len);				\
1006 	}								\
1007 } STMT_END
1008 
1009 #define Set_Node_Length(node,len) \
1010     Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1011 #define Set_Node_Cur_Length(node, start)                \
1012     Set_Node_Length(node, RExC_parse - start)
1013 
1014 /* Get offsets and lengths */
1015 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1016 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1017 
1018 #define Set_Node_Offset_Length(node,offset,len) STMT_START {	\
1019     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset));	\
1020     Set_Node_Length_To_R(REGNODE_OFFSET(node), (len));	\
1021 } STMT_END
1022 
1023 #define Track_Code(code) STMT_START { code } STMT_END
1024 #endif
1025 
1026 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1027 #define EXPERIMENTAL_INPLACESCAN
1028 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1029 
1030 #ifdef DEBUGGING
1031 int
Perl_re_printf(pTHX_ const char * fmt,...)1032 Perl_re_printf(pTHX_ const char *fmt, ...)
1033 {
1034     va_list ap;
1035     int result;
1036     PerlIO *f= Perl_debug_log;
1037     PERL_ARGS_ASSERT_RE_PRINTF;
1038     va_start(ap, fmt);
1039     result = PerlIO_vprintf(f, fmt, ap);
1040     va_end(ap);
1041     return result;
1042 }
1043 
1044 int
Perl_re_indentf(pTHX_ const char * fmt,U32 depth,...)1045 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1046 {
1047     va_list ap;
1048     int result;
1049     PerlIO *f= Perl_debug_log;
1050     PERL_ARGS_ASSERT_RE_INDENTF;
1051     va_start(ap, depth);
1052     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1053     result = PerlIO_vprintf(f, fmt, ap);
1054     va_end(ap);
1055     return result;
1056 }
1057 #endif /* DEBUGGING */
1058 
1059 #define DEBUG_RExC_seen()                                                   \
1060         DEBUG_OPTIMISE_MORE_r({                                             \
1061             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1062                                                                             \
1063             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1064                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1065                                                                             \
1066             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1067                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1068                                                                             \
1069             if (RExC_seen & REG_GPOS_SEEN)                                  \
1070                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1071                                                                             \
1072             if (RExC_seen & REG_RECURSE_SEEN)                               \
1073                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1074                                                                             \
1075             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1076                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1077                                                                             \
1078             if (RExC_seen & REG_VERBARG_SEEN)                               \
1079                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1080                                                                             \
1081             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1082                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1083                                                                             \
1084             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1085                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1086                                                                             \
1087             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1088                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1089                                                                             \
1090             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1091                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1092                                                                             \
1093             Perl_re_printf( aTHX_ "\n");                                    \
1094         });
1095 
1096 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1097   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
1098 
1099 
1100 #ifdef DEBUGGING
1101 static void
S_debug_show_study_flags(pTHX_ U32 flags,const char * open_str,const char * close_str)1102 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1103                                     const char *close_str)
1104 {
1105     if (!flags)
1106         return;
1107 
1108     Perl_re_printf( aTHX_  "%s", open_str);
1109     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1110     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1111     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1112     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1113     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1114     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1115     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1116     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1117     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1118     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1119     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1120     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1121     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1122     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1123     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1124     Perl_re_printf( aTHX_  "%s", close_str);
1125 }
1126 
1127 
1128 static void
S_debug_studydata(pTHX_ const char * where,scan_data_t * data,U32 depth,int is_inf)1129 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1130                     U32 depth, int is_inf)
1131 {
1132     GET_RE_DEBUG_FLAGS_DECL;
1133 
1134     DEBUG_OPTIMISE_MORE_r({
1135         if (!data)
1136             return;
1137         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1138             depth,
1139             where,
1140             (IV)data->pos_min,
1141             (IV)data->pos_delta,
1142             (UV)data->flags
1143         );
1144 
1145         S_debug_show_study_flags(aTHX_ data->flags," [","]");
1146 
1147         Perl_re_printf( aTHX_
1148             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1149             (IV)data->whilem_c,
1150             (IV)(data->last_closep ? *((data)->last_closep) : -1),
1151             is_inf ? "INF " : ""
1152         );
1153 
1154         if (data->last_found) {
1155             int i;
1156             Perl_re_printf(aTHX_
1157                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1158                     SvPVX_const(data->last_found),
1159                     (IV)data->last_end,
1160                     (IV)data->last_start_min,
1161                     (IV)data->last_start_max
1162             );
1163 
1164             for (i = 0; i < 2; i++) {
1165                 Perl_re_printf(aTHX_
1166                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
1167                     data->cur_is_floating == i ? "*" : "",
1168                     i ? "Float" : "Fixed",
1169                     SvPVX_const(data->substrs[i].str),
1170                     (IV)data->substrs[i].min_offset,
1171                     (IV)data->substrs[i].max_offset
1172                 );
1173                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1174             }
1175         }
1176 
1177         Perl_re_printf( aTHX_ "\n");
1178     });
1179 }
1180 
1181 
1182 static void
S_debug_peep(pTHX_ const char * str,const RExC_state_t * pRExC_state,regnode * scan,U32 depth,U32 flags)1183 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1184                 regnode *scan, U32 depth, U32 flags)
1185 {
1186     GET_RE_DEBUG_FLAGS_DECL;
1187 
1188     DEBUG_OPTIMISE_r({
1189         regnode *Next;
1190 
1191         if (!scan)
1192             return;
1193         Next = regnext(scan);
1194         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1195         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
1196             depth,
1197             str,
1198             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1199             Next ? (REG_NODE_NUM(Next)) : 0 );
1200         S_debug_show_study_flags(aTHX_ flags," [ ","]");
1201         Perl_re_printf( aTHX_  "\n");
1202    });
1203 }
1204 
1205 
1206 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1207                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
1208 
1209 #  define DEBUG_PEEP(str, scan, depth, flags)   \
1210                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1211 
1212 #else
1213 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1214 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
1215 #endif
1216 
1217 
1218 /* =========================================================
1219  * BEGIN edit_distance stuff.
1220  *
1221  * This calculates how many single character changes of any type are needed to
1222  * transform a string into another one.  It is taken from version 3.1 of
1223  *
1224  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1225  */
1226 
1227 /* Our unsorted dictionary linked list.   */
1228 /* Note we use UVs, not chars. */
1229 
1230 struct dictionary{
1231   UV key;
1232   UV value;
1233   struct dictionary* next;
1234 };
1235 typedef struct dictionary item;
1236 
1237 
1238 PERL_STATIC_INLINE item*
push(UV key,item * curr)1239 push(UV key, item* curr)
1240 {
1241     item* head;
1242     Newx(head, 1, item);
1243     head->key = key;
1244     head->value = 0;
1245     head->next = curr;
1246     return head;
1247 }
1248 
1249 
1250 PERL_STATIC_INLINE item*
find(item * head,UV key)1251 find(item* head, UV key)
1252 {
1253     item* iterator = head;
1254     while (iterator){
1255         if (iterator->key == key){
1256             return iterator;
1257         }
1258         iterator = iterator->next;
1259     }
1260 
1261     return NULL;
1262 }
1263 
1264 PERL_STATIC_INLINE item*
uniquePush(item * head,UV key)1265 uniquePush(item* head, UV key)
1266 {
1267     item* iterator = head;
1268 
1269     while (iterator){
1270         if (iterator->key == key) {
1271             return head;
1272         }
1273         iterator = iterator->next;
1274     }
1275 
1276     return push(key, head);
1277 }
1278 
1279 PERL_STATIC_INLINE void
dict_free(item * head)1280 dict_free(item* head)
1281 {
1282     item* iterator = head;
1283 
1284     while (iterator) {
1285         item* temp = iterator;
1286         iterator = iterator->next;
1287         Safefree(temp);
1288     }
1289 
1290     head = NULL;
1291 }
1292 
1293 /* End of Dictionary Stuff */
1294 
1295 /* All calculations/work are done here */
1296 STATIC int
S_edit_distance(const UV * src,const UV * tgt,const STRLEN x,const STRLEN y,const SSize_t maxDistance)1297 S_edit_distance(const UV* src,
1298                 const UV* tgt,
1299                 const STRLEN x,             /* length of src[] */
1300                 const STRLEN y,             /* length of tgt[] */
1301                 const SSize_t maxDistance
1302 )
1303 {
1304     item *head = NULL;
1305     UV swapCount, swapScore, targetCharCount, i, j;
1306     UV *scores;
1307     UV score_ceil = x + y;
1308 
1309     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1310 
1311     /* intialize matrix start values */
1312     Newx(scores, ( (x + 2) * (y + 2)), UV);
1313     scores[0] = score_ceil;
1314     scores[1 * (y + 2) + 0] = score_ceil;
1315     scores[0 * (y + 2) + 1] = score_ceil;
1316     scores[1 * (y + 2) + 1] = 0;
1317     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1318 
1319     /* work loops    */
1320     /* i = src index */
1321     /* j = tgt index */
1322     for (i=1;i<=x;i++) {
1323         if (i < x)
1324             head = uniquePush(head, src[i]);
1325         scores[(i+1) * (y + 2) + 1] = i;
1326         scores[(i+1) * (y + 2) + 0] = score_ceil;
1327         swapCount = 0;
1328 
1329         for (j=1;j<=y;j++) {
1330             if (i == 1) {
1331                 if(j < y)
1332                 head = uniquePush(head, tgt[j]);
1333                 scores[1 * (y + 2) + (j + 1)] = j;
1334                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1335             }
1336 
1337             targetCharCount = find(head, tgt[j-1])->value;
1338             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1339 
1340             if (src[i-1] != tgt[j-1]){
1341                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1342             }
1343             else {
1344                 swapCount = j;
1345                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1346             }
1347         }
1348 
1349         find(head, src[i-1])->value = i;
1350     }
1351 
1352     {
1353         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1354         dict_free(head);
1355         Safefree(scores);
1356         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1357     }
1358 }
1359 
1360 /* END of edit_distance() stuff
1361  * ========================================================= */
1362 
1363 /* is c a control character for which we have a mnemonic? */
1364 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1365 
1366 STATIC const char *
S_cntrl_to_mnemonic(const U8 c)1367 S_cntrl_to_mnemonic(const U8 c)
1368 {
1369     /* Returns the mnemonic string that represents character 'c', if one
1370      * exists; NULL otherwise.  The only ones that exist for the purposes of
1371      * this routine are a few control characters */
1372 
1373     switch (c) {
1374         case '\a':       return "\\a";
1375         case '\b':       return "\\b";
1376         case ESC_NATIVE: return "\\e";
1377         case '\f':       return "\\f";
1378         case '\n':       return "\\n";
1379         case '\r':       return "\\r";
1380         case '\t':       return "\\t";
1381     }
1382 
1383     return NULL;
1384 }
1385 
1386 /* Mark that we cannot extend a found fixed substring at this point.
1387    Update the longest found anchored substring or the longest found
1388    floating substrings if needed. */
1389 
1390 STATIC void
S_scan_commit(pTHX_ const RExC_state_t * pRExC_state,scan_data_t * data,SSize_t * minlenp,int is_inf)1391 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1392                     SSize_t *minlenp, int is_inf)
1393 {
1394     const STRLEN l = CHR_SVLEN(data->last_found);
1395     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1396     const STRLEN old_l = CHR_SVLEN(longest_sv);
1397     GET_RE_DEBUG_FLAGS_DECL;
1398 
1399     PERL_ARGS_ASSERT_SCAN_COMMIT;
1400 
1401     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1402         const U8 i = data->cur_is_floating;
1403 	SvSetMagicSV(longest_sv, data->last_found);
1404         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1405 
1406 	if (!i) /* fixed */
1407 	    data->substrs[0].max_offset = data->substrs[0].min_offset;
1408 	else { /* float */
1409 	    data->substrs[1].max_offset = (l
1410                           ? data->last_start_max
1411                           : (data->pos_delta > SSize_t_MAX - data->pos_min
1412 					 ? SSize_t_MAX
1413 					 : data->pos_min + data->pos_delta));
1414 	    if (is_inf
1415 		 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1416 		data->substrs[1].max_offset = SSize_t_MAX;
1417         }
1418 
1419         if (data->flags & SF_BEFORE_EOL)
1420             data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1421         else
1422             data->substrs[i].flags &= ~SF_BEFORE_EOL;
1423         data->substrs[i].minlenp = minlenp;
1424         data->substrs[i].lookbehind = 0;
1425     }
1426 
1427     SvCUR_set(data->last_found, 0);
1428     {
1429 	SV * const sv = data->last_found;
1430 	if (SvUTF8(sv) && SvMAGICAL(sv)) {
1431 	    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1432 	    if (mg)
1433 		mg->mg_len = 0;
1434 	}
1435     }
1436     data->last_end = -1;
1437     data->flags &= ~SF_BEFORE_EOL;
1438     DEBUG_STUDYDATA("commit", data, 0, is_inf);
1439 }
1440 
1441 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1442  * list that describes which code points it matches */
1443 
1444 STATIC void
S_ssc_anything(pTHX_ regnode_ssc * ssc)1445 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1446 {
1447     /* Set the SSC 'ssc' to match an empty string or any code point */
1448 
1449     PERL_ARGS_ASSERT_SSC_ANYTHING;
1450 
1451     assert(is_ANYOF_SYNTHETIC(ssc));
1452 
1453     /* mortalize so won't leak */
1454     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1455     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1456 }
1457 
1458 STATIC int
S_ssc_is_anything(const regnode_ssc * ssc)1459 S_ssc_is_anything(const regnode_ssc *ssc)
1460 {
1461     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1462      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1463      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1464      * in any way, so there's no point in using it */
1465 
1466     UV start, end;
1467     bool ret;
1468 
1469     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1470 
1471     assert(is_ANYOF_SYNTHETIC(ssc));
1472 
1473     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1474         return FALSE;
1475     }
1476 
1477     /* See if the list consists solely of the range 0 - Infinity */
1478     invlist_iterinit(ssc->invlist);
1479     ret = invlist_iternext(ssc->invlist, &start, &end)
1480           && start == 0
1481           && end == UV_MAX;
1482 
1483     invlist_iterfinish(ssc->invlist);
1484 
1485     if (ret) {
1486         return TRUE;
1487     }
1488 
1489     /* If e.g., both \w and \W are set, matches everything */
1490     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1491         int i;
1492         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1493             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1494                 return TRUE;
1495             }
1496         }
1497     }
1498 
1499     return FALSE;
1500 }
1501 
1502 STATIC void
S_ssc_init(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc)1503 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1504 {
1505     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1506      * string, any code point, or any posix class under locale */
1507 
1508     PERL_ARGS_ASSERT_SSC_INIT;
1509 
1510     Zero(ssc, 1, regnode_ssc);
1511     set_ANYOF_SYNTHETIC(ssc);
1512     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1513     ssc_anything(ssc);
1514 
1515     /* If any portion of the regex is to operate under locale rules that aren't
1516      * fully known at compile time, initialization includes it.  The reason
1517      * this isn't done for all regexes is that the optimizer was written under
1518      * the assumption that locale was all-or-nothing.  Given the complexity and
1519      * lack of documentation in the optimizer, and that there are inadequate
1520      * test cases for locale, many parts of it may not work properly, it is
1521      * safest to avoid locale unless necessary. */
1522     if (RExC_contains_locale) {
1523 	ANYOF_POSIXL_SETALL(ssc);
1524     }
1525     else {
1526 	ANYOF_POSIXL_ZERO(ssc);
1527     }
1528 }
1529 
1530 STATIC int
S_ssc_is_cp_posixl_init(const RExC_state_t * pRExC_state,const regnode_ssc * ssc)1531 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1532                         const regnode_ssc *ssc)
1533 {
1534     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1535      * to the list of code points matched, and locale posix classes; hence does
1536      * not check its flags) */
1537 
1538     UV start, end;
1539     bool ret;
1540 
1541     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1542 
1543     assert(is_ANYOF_SYNTHETIC(ssc));
1544 
1545     invlist_iterinit(ssc->invlist);
1546     ret = invlist_iternext(ssc->invlist, &start, &end)
1547           && start == 0
1548           && end == UV_MAX;
1549 
1550     invlist_iterfinish(ssc->invlist);
1551 
1552     if (! ret) {
1553         return FALSE;
1554     }
1555 
1556     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1557         return FALSE;
1558     }
1559 
1560     return TRUE;
1561 }
1562 
1563 #define INVLIST_INDEX 0
1564 #define ONLY_LOCALE_MATCHES_INDEX 1
1565 #define DEFERRED_USER_DEFINED_INDEX 2
1566 
1567 STATIC SV*
S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t * pRExC_state,const regnode_charclass * const node)1568 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1569                                const regnode_charclass* const node)
1570 {
1571     /* Returns a mortal inversion list defining which code points are matched
1572      * by 'node', which is of type ANYOF.  Handles complementing the result if
1573      * appropriate.  If some code points aren't knowable at this time, the
1574      * returned list must, and will, contain every code point that is a
1575      * possibility. */
1576 
1577     dVAR;
1578     SV* invlist = NULL;
1579     SV* only_utf8_locale_invlist = NULL;
1580     unsigned int i;
1581     const U32 n = ARG(node);
1582     bool new_node_has_latin1 = FALSE;
1583     const U8 flags = OP(node) == ANYOFH ? 0 : ANYOF_FLAGS(node);
1584 
1585     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1586 
1587     /* Look at the data structure created by S_set_ANYOF_arg() */
1588     if (n != ANYOF_ONLY_HAS_BITMAP) {
1589         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1590         AV * const av = MUTABLE_AV(SvRV(rv));
1591         SV **const ary = AvARRAY(av);
1592         assert(RExC_rxi->data->what[n] == 's');
1593 
1594         if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1595 
1596             /* Here there are things that won't be known until runtime -- we
1597              * have to assume it could be anything */
1598             invlist = sv_2mortal(_new_invlist(1));
1599             return _add_range_to_invlist(invlist, 0, UV_MAX);
1600         }
1601         else if (ary[INVLIST_INDEX]) {
1602 
1603             /* Use the node's inversion list */
1604             invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1605         }
1606 
1607         /* Get the code points valid only under UTF-8 locales */
1608         if (   (flags & ANYOFL_FOLD)
1609             &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1610         {
1611             only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1612         }
1613     }
1614 
1615     if (! invlist) {
1616         invlist = sv_2mortal(_new_invlist(0));
1617     }
1618 
1619     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1620      * code points, and an inversion list for the others, but if there are code
1621      * points that should match only conditionally on the target string being
1622      * UTF-8, those are placed in the inversion list, and not the bitmap.
1623      * Since there are circumstances under which they could match, they are
1624      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1625      * to exclude them here, so that when we invert below, the end result
1626      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1627      * have to do this here before we add the unconditionally matched code
1628      * points */
1629     if (flags & ANYOF_INVERT) {
1630         _invlist_intersection_complement_2nd(invlist,
1631                                              PL_UpperLatin1,
1632                                              &invlist);
1633     }
1634 
1635     /* Add in the points from the bit map */
1636     if (OP(node) != ANYOFH) {
1637         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1638             if (ANYOF_BITMAP_TEST(node, i)) {
1639                 unsigned int start = i++;
1640 
1641                 for (;    i < NUM_ANYOF_CODE_POINTS
1642                        && ANYOF_BITMAP_TEST(node, i); ++i)
1643                 {
1644                     /* empty */
1645                 }
1646                 invlist = _add_range_to_invlist(invlist, start, i-1);
1647                 new_node_has_latin1 = TRUE;
1648             }
1649         }
1650     }
1651 
1652     /* If this can match all upper Latin1 code points, have to add them
1653      * as well.  But don't add them if inverting, as when that gets done below,
1654      * it would exclude all these characters, including the ones it shouldn't
1655      * that were added just above */
1656     if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1657         && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1658     {
1659         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1660     }
1661 
1662     /* Similarly for these */
1663     if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1664         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1665     }
1666 
1667     if (flags & ANYOF_INVERT) {
1668         _invlist_invert(invlist);
1669     }
1670     else if (flags & ANYOFL_FOLD) {
1671         if (new_node_has_latin1) {
1672 
1673             /* Under /li, any 0-255 could fold to any other 0-255, depending on
1674              * the locale.  We can skip this if there are no 0-255 at all. */
1675             _invlist_union(invlist, PL_Latin1, &invlist);
1676 
1677             invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1678             invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1679         }
1680         else {
1681             if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1682                 invlist = add_cp_to_invlist(invlist, 'I');
1683             }
1684             if (_invlist_contains_cp(invlist,
1685                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1686             {
1687                 invlist = add_cp_to_invlist(invlist, 'i');
1688             }
1689         }
1690     }
1691 
1692     /* Similarly add the UTF-8 locale possible matches.  These have to be
1693      * deferred until after the non-UTF-8 locale ones are taken care of just
1694      * above, or it leads to wrong results under ANYOF_INVERT */
1695     if (only_utf8_locale_invlist) {
1696         _invlist_union_maybe_complement_2nd(invlist,
1697                                             only_utf8_locale_invlist,
1698                                             flags & ANYOF_INVERT,
1699                                             &invlist);
1700     }
1701 
1702     return invlist;
1703 }
1704 
1705 /* These two functions currently do the exact same thing */
1706 #define ssc_init_zero		ssc_init
1707 
1708 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1709 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1710 
1711 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1712  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1713  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1714 
1715 STATIC void
S_ssc_and(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc,const regnode_charclass * and_with)1716 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1717                 const regnode_charclass *and_with)
1718 {
1719     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1720      * another SSC or a regular ANYOF class.  Can create false positives. */
1721 
1722     SV* anded_cp_list;
1723     U8  and_with_flags = (OP(and_with) == ANYOFH) ? 0 : ANYOF_FLAGS(and_with);
1724     U8  anded_flags;
1725 
1726     PERL_ARGS_ASSERT_SSC_AND;
1727 
1728     assert(is_ANYOF_SYNTHETIC(ssc));
1729 
1730     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1731      * the code point inversion list and just the relevant flags */
1732     if (is_ANYOF_SYNTHETIC(and_with)) {
1733         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1734         anded_flags = and_with_flags;
1735 
1736         /* XXX This is a kludge around what appears to be deficiencies in the
1737          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1738          * there are paths through the optimizer where it doesn't get weeded
1739          * out when it should.  And if we don't make some extra provision for
1740          * it like the code just below, it doesn't get added when it should.
1741          * This solution is to add it only when AND'ing, which is here, and
1742          * only when what is being AND'ed is the pristine, original node
1743          * matching anything.  Thus it is like adding it to ssc_anything() but
1744          * only when the result is to be AND'ed.  Probably the same solution
1745          * could be adopted for the same problem we have with /l matching,
1746          * which is solved differently in S_ssc_init(), and that would lead to
1747          * fewer false positives than that solution has.  But if this solution
1748          * creates bugs, the consequences are only that a warning isn't raised
1749          * that should be; while the consequences for having /l bugs is
1750          * incorrect matches */
1751         if (ssc_is_anything((regnode_ssc *)and_with)) {
1752             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1753         }
1754     }
1755     else {
1756         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1757         if (OP(and_with) == ANYOFD) {
1758             anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1759         }
1760         else {
1761             anded_flags = and_with_flags
1762             &( ANYOF_COMMON_FLAGS
1763               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1764               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1765             if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1766                 anded_flags &=
1767                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1768             }
1769         }
1770     }
1771 
1772     ANYOF_FLAGS(ssc) &= anded_flags;
1773 
1774     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1775      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1776      * 'and_with' may be inverted.  When not inverted, we have the situation of
1777      * computing:
1778      *  (C1 | P1) & (C2 | P2)
1779      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1780      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1781      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1782      *                    <=  ((C1 & C2) | P1 | P2)
1783      * Alternatively, the last few steps could be:
1784      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1785      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1786      *                    <=  (C1 | C2 | (P1 & P2))
1787      * We favor the second approach if either P1 or P2 is non-empty.  This is
1788      * because these components are a barrier to doing optimizations, as what
1789      * they match cannot be known until the moment of matching as they are
1790      * dependent on the current locale, 'AND"ing them likely will reduce or
1791      * eliminate them.
1792      * But we can do better if we know that C1,P1 are in their initial state (a
1793      * frequent occurrence), each matching everything:
1794      *  (<everything>) & (C2 | P2) =  C2 | P2
1795      * Similarly, if C2,P2 are in their initial state (again a frequent
1796      * occurrence), the result is a no-op
1797      *  (C1 | P1) & (<everything>) =  C1 | P1
1798      *
1799      * Inverted, we have
1800      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1801      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1802      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1803      * */
1804 
1805     if ((and_with_flags & ANYOF_INVERT)
1806         && ! is_ANYOF_SYNTHETIC(and_with))
1807     {
1808         unsigned int i;
1809 
1810         ssc_intersection(ssc,
1811                          anded_cp_list,
1812                          FALSE /* Has already been inverted */
1813                          );
1814 
1815         /* If either P1 or P2 is empty, the intersection will be also; can skip
1816          * the loop */
1817         if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1818             ANYOF_POSIXL_ZERO(ssc);
1819         }
1820         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1821 
1822             /* Note that the Posix class component P from 'and_with' actually
1823              * looks like:
1824              *      P = Pa | Pb | ... | Pn
1825              * where each component is one posix class, such as in [\w\s].
1826              * Thus
1827              *      ~P = ~(Pa | Pb | ... | Pn)
1828              *         = ~Pa & ~Pb & ... & ~Pn
1829              *        <= ~Pa | ~Pb | ... | ~Pn
1830              * The last is something we can easily calculate, but unfortunately
1831              * is likely to have many false positives.  We could do better
1832              * in some (but certainly not all) instances if two classes in
1833              * P have known relationships.  For example
1834              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1835              * So
1836              *      :lower: & :print: = :lower:
1837              * And similarly for classes that must be disjoint.  For example,
1838              * since \s and \w can have no elements in common based on rules in
1839              * the POSIX standard,
1840              *      \w & ^\S = nothing
1841              * Unfortunately, some vendor locales do not meet the Posix
1842              * standard, in particular almost everything by Microsoft.
1843              * The loop below just changes e.g., \w into \W and vice versa */
1844 
1845             regnode_charclass_posixl temp;
1846             int add = 1;    /* To calculate the index of the complement */
1847 
1848             Zero(&temp, 1, regnode_charclass_posixl);
1849             ANYOF_POSIXL_ZERO(&temp);
1850             for (i = 0; i < ANYOF_MAX; i++) {
1851                 assert(i % 2 != 0
1852                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1853                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1854 
1855                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1856                     ANYOF_POSIXL_SET(&temp, i + add);
1857                 }
1858                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1859             }
1860             ANYOF_POSIXL_AND(&temp, ssc);
1861 
1862         } /* else ssc already has no posixes */
1863     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1864          in its initial state */
1865     else if (! is_ANYOF_SYNTHETIC(and_with)
1866              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1867     {
1868         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1869          * copy it over 'ssc' */
1870         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1871             if (is_ANYOF_SYNTHETIC(and_with)) {
1872                 StructCopy(and_with, ssc, regnode_ssc);
1873             }
1874             else {
1875                 ssc->invlist = anded_cp_list;
1876                 ANYOF_POSIXL_ZERO(ssc);
1877                 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1878                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1879                 }
1880             }
1881         }
1882         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1883                  || (and_with_flags & ANYOF_MATCHES_POSIXL))
1884         {
1885             /* One or the other of P1, P2 is non-empty. */
1886             if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1887                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1888             }
1889             ssc_union(ssc, anded_cp_list, FALSE);
1890         }
1891         else { /* P1 = P2 = empty */
1892             ssc_intersection(ssc, anded_cp_list, FALSE);
1893         }
1894     }
1895 }
1896 
1897 STATIC void
S_ssc_or(pTHX_ const RExC_state_t * pRExC_state,regnode_ssc * ssc,const regnode_charclass * or_with)1898 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1899                const regnode_charclass *or_with)
1900 {
1901     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1902      * another SSC or a regular ANYOF class.  Can create false positives if
1903      * 'or_with' is to be inverted. */
1904 
1905     SV* ored_cp_list;
1906     U8 ored_flags;
1907     U8  or_with_flags = (OP(or_with) == ANYOFH) ? 0 : ANYOF_FLAGS(or_with);
1908 
1909     PERL_ARGS_ASSERT_SSC_OR;
1910 
1911     assert(is_ANYOF_SYNTHETIC(ssc));
1912 
1913     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1914      * the code point inversion list and just the relevant flags */
1915     if (is_ANYOF_SYNTHETIC(or_with)) {
1916         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1917         ored_flags = or_with_flags;
1918     }
1919     else {
1920         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1921         ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
1922         if (OP(or_with) != ANYOFD) {
1923             ored_flags
1924             |= or_with_flags
1925              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1926                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1927             if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
1928                 ored_flags |=
1929                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1930             }
1931         }
1932     }
1933 
1934     ANYOF_FLAGS(ssc) |= ored_flags;
1935 
1936     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1937      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1938      * 'or_with' may be inverted.  When not inverted, we have the simple
1939      * situation of computing:
1940      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1941      * If P1|P2 yields a situation with both a class and its complement are
1942      * set, like having both \w and \W, this matches all code points, and we
1943      * can delete these from the P component of the ssc going forward.  XXX We
1944      * might be able to delete all the P components, but I (khw) am not certain
1945      * about this, and it is better to be safe.
1946      *
1947      * Inverted, we have
1948      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1949      *                         <=  (C1 | P1) | ~C2
1950      *                         <=  (C1 | ~C2) | P1
1951      * (which results in actually simpler code than the non-inverted case)
1952      * */
1953 
1954     if ((or_with_flags & ANYOF_INVERT)
1955         && ! is_ANYOF_SYNTHETIC(or_with))
1956     {
1957         /* We ignore P2, leaving P1 going forward */
1958     }   /* else  Not inverted */
1959     else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
1960         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1961         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1962             unsigned int i;
1963             for (i = 0; i < ANYOF_MAX; i += 2) {
1964                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1965                 {
1966                     ssc_match_all_cp(ssc);
1967                     ANYOF_POSIXL_CLEAR(ssc, i);
1968                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1969                 }
1970             }
1971         }
1972     }
1973 
1974     ssc_union(ssc,
1975               ored_cp_list,
1976               FALSE /* Already has been inverted */
1977               );
1978 }
1979 
1980 PERL_STATIC_INLINE void
S_ssc_union(pTHX_ regnode_ssc * ssc,SV * const invlist,const bool invert2nd)1981 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1982 {
1983     PERL_ARGS_ASSERT_SSC_UNION;
1984 
1985     assert(is_ANYOF_SYNTHETIC(ssc));
1986 
1987     _invlist_union_maybe_complement_2nd(ssc->invlist,
1988                                         invlist,
1989                                         invert2nd,
1990                                         &ssc->invlist);
1991 }
1992 
1993 PERL_STATIC_INLINE void
S_ssc_intersection(pTHX_ regnode_ssc * ssc,SV * const invlist,const bool invert2nd)1994 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1995                          SV* const invlist,
1996                          const bool invert2nd)
1997 {
1998     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1999 
2000     assert(is_ANYOF_SYNTHETIC(ssc));
2001 
2002     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2003                                                invlist,
2004                                                invert2nd,
2005                                                &ssc->invlist);
2006 }
2007 
2008 PERL_STATIC_INLINE void
S_ssc_add_range(pTHX_ regnode_ssc * ssc,const UV start,const UV end)2009 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2010 {
2011     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2012 
2013     assert(is_ANYOF_SYNTHETIC(ssc));
2014 
2015     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2016 }
2017 
2018 PERL_STATIC_INLINE void
S_ssc_cp_and(pTHX_ regnode_ssc * ssc,const UV cp)2019 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2020 {
2021     /* AND just the single code point 'cp' into the SSC 'ssc' */
2022 
2023     SV* cp_list = _new_invlist(2);
2024 
2025     PERL_ARGS_ASSERT_SSC_CP_AND;
2026 
2027     assert(is_ANYOF_SYNTHETIC(ssc));
2028 
2029     cp_list = add_cp_to_invlist(cp_list, cp);
2030     ssc_intersection(ssc, cp_list,
2031                      FALSE /* Not inverted */
2032                      );
2033     SvREFCNT_dec_NN(cp_list);
2034 }
2035 
2036 PERL_STATIC_INLINE void
S_ssc_clear_locale(regnode_ssc * ssc)2037 S_ssc_clear_locale(regnode_ssc *ssc)
2038 {
2039     /* Set the SSC 'ssc' to not match any locale things */
2040     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2041 
2042     assert(is_ANYOF_SYNTHETIC(ssc));
2043 
2044     ANYOF_POSIXL_ZERO(ssc);
2045     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2046 }
2047 
2048 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2049 
2050 STATIC bool
S_is_ssc_worth_it(const RExC_state_t * pRExC_state,const regnode_ssc * ssc)2051 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2052 {
2053     /* The synthetic start class is used to hopefully quickly winnow down
2054      * places where a pattern could start a match in the target string.  If it
2055      * doesn't really narrow things down that much, there isn't much point to
2056      * having the overhead of using it.  This function uses some very crude
2057      * heuristics to decide if to use the ssc or not.
2058      *
2059      * It returns TRUE if 'ssc' rules out more than half what it considers to
2060      * be the "likely" possible matches, but of course it doesn't know what the
2061      * actual things being matched are going to be; these are only guesses
2062      *
2063      * For /l matches, it assumes that the only likely matches are going to be
2064      *      in the 0-255 range, uniformly distributed, so half of that is 127
2065      * For /a and /d matches, it assumes that the likely matches will be just
2066      *      the ASCII range, so half of that is 63
2067      * For /u and there isn't anything matching above the Latin1 range, it
2068      *      assumes that that is the only range likely to be matched, and uses
2069      *      half that as the cut-off: 127.  If anything matches above Latin1,
2070      *      it assumes that all of Unicode could match (uniformly), except for
2071      *      non-Unicode code points and things in the General Category "Other"
2072      *      (unassigned, private use, surrogates, controls and formats).  This
2073      *      is a much large number. */
2074 
2075     U32 count = 0;      /* Running total of number of code points matched by
2076                            'ssc' */
2077     UV start, end;      /* Start and end points of current range in inversion
2078                            XXX outdated.  UTF-8 locales are common, what about invert? list */
2079     const U32 max_code_points = (LOC)
2080                                 ?  256
2081                                 : ((  ! UNI_SEMANTICS
2082                                     ||  invlist_highest(ssc->invlist) < 256)
2083                                   ? 128
2084                                   : NON_OTHER_COUNT);
2085     const U32 max_match = max_code_points / 2;
2086 
2087     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2088 
2089     invlist_iterinit(ssc->invlist);
2090     while (invlist_iternext(ssc->invlist, &start, &end)) {
2091         if (start >= max_code_points) {
2092             break;
2093         }
2094         end = MIN(end, max_code_points - 1);
2095         count += end - start + 1;
2096         if (count >= max_match) {
2097             invlist_iterfinish(ssc->invlist);
2098             return FALSE;
2099         }
2100     }
2101 
2102     return TRUE;
2103 }
2104 
2105 
2106 STATIC void
S_ssc_finalize(pTHX_ RExC_state_t * pRExC_state,regnode_ssc * ssc)2107 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2108 {
2109     /* The inversion list in the SSC is marked mortal; now we need a more
2110      * permanent copy, which is stored the same way that is done in a regular
2111      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2112      * map */
2113 
2114     SV* invlist = invlist_clone(ssc->invlist, NULL);
2115 
2116     PERL_ARGS_ASSERT_SSC_FINALIZE;
2117 
2118     assert(is_ANYOF_SYNTHETIC(ssc));
2119 
2120     /* The code in this file assumes that all but these flags aren't relevant
2121      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2122      * by the time we reach here */
2123     assert(! (ANYOF_FLAGS(ssc)
2124         & ~( ANYOF_COMMON_FLAGS
2125             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2126             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2127 
2128     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2129 
2130     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2131 
2132     /* Make sure is clone-safe */
2133     ssc->invlist = NULL;
2134 
2135     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2136         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2137         OP(ssc) = ANYOFPOSIXL;
2138     }
2139     else if (RExC_contains_locale) {
2140         OP(ssc) = ANYOFL;
2141     }
2142 
2143     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2144 }
2145 
2146 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2147 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
2148 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2149 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
2150                                ? (TRIE_LIST_CUR( idx ) - 1)           \
2151                                : 0 )
2152 
2153 
2154 #ifdef DEBUGGING
2155 /*
2156    dump_trie(trie,widecharmap,revcharmap)
2157    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2158    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2159 
2160    These routines dump out a trie in a somewhat readable format.
2161    The _interim_ variants are used for debugging the interim
2162    tables that are used to generate the final compressed
2163    representation which is what dump_trie expects.
2164 
2165    Part of the reason for their existence is to provide a form
2166    of documentation as to how the different representations function.
2167 
2168 */
2169 
2170 /*
2171   Dumps the final compressed table form of the trie to Perl_debug_log.
2172   Used for debugging make_trie().
2173 */
2174 
2175 STATIC void
S_dump_trie(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 depth)2176 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2177 	    AV *revcharmap, U32 depth)
2178 {
2179     U32 state;
2180     SV *sv=sv_newmortal();
2181     int colwidth= widecharmap ? 6 : 4;
2182     U16 word;
2183     GET_RE_DEBUG_FLAGS_DECL;
2184 
2185     PERL_ARGS_ASSERT_DUMP_TRIE;
2186 
2187     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
2188         depth+1, "Match","Base","Ofs" );
2189 
2190     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2191 	SV ** const tmp = av_fetch( revcharmap, state, 0);
2192         if ( tmp ) {
2193             Perl_re_printf( aTHX_  "%*s",
2194                 colwidth,
2195                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2196 	                    PL_colors[0], PL_colors[1],
2197 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2198 	                    PERL_PV_ESCAPE_FIRSTCHAR
2199                 )
2200             );
2201         }
2202     }
2203     Perl_re_printf( aTHX_  "\n");
2204     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2205 
2206     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2207         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2208     Perl_re_printf( aTHX_  "\n");
2209 
2210     for( state = 1 ; state < trie->statecount ; state++ ) {
2211 	const U32 base = trie->states[ state ].trans.base;
2212 
2213         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2214 
2215         if ( trie->states[ state ].wordnum ) {
2216             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2217         } else {
2218             Perl_re_printf( aTHX_  "%6s", "" );
2219         }
2220 
2221         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2222 
2223         if ( base ) {
2224             U32 ofs = 0;
2225 
2226             while( ( base + ofs  < trie->uniquecharcount ) ||
2227                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2228                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2229                                                                     != state))
2230                     ofs++;
2231 
2232             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2233 
2234             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2235                 if ( ( base + ofs >= trie->uniquecharcount )
2236                         && ( base + ofs - trie->uniquecharcount
2237                                                         < trie->lasttrans )
2238                         && trie->trans[ base + ofs
2239                                     - trie->uniquecharcount ].check == state )
2240                 {
2241                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2242                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2243                    );
2244                 } else {
2245                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
2246                 }
2247             }
2248 
2249             Perl_re_printf( aTHX_  "]");
2250 
2251         }
2252         Perl_re_printf( aTHX_  "\n" );
2253     }
2254     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2255                                 depth);
2256     for (word=1; word <= trie->wordcount; word++) {
2257         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2258 	    (int)word, (int)(trie->wordinfo[word].prev),
2259 	    (int)(trie->wordinfo[word].len));
2260     }
2261     Perl_re_printf( aTHX_  "\n" );
2262 }
2263 /*
2264   Dumps a fully constructed but uncompressed trie in list form.
2265   List tries normally only are used for construction when the number of
2266   possible chars (trie->uniquecharcount) is very high.
2267   Used for debugging make_trie().
2268 */
2269 STATIC void
S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 next_alloc,U32 depth)2270 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2271 			 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2272 			 U32 depth)
2273 {
2274     U32 state;
2275     SV *sv=sv_newmortal();
2276     int colwidth= widecharmap ? 6 : 4;
2277     GET_RE_DEBUG_FLAGS_DECL;
2278 
2279     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2280 
2281     /* print out the table precompression.  */
2282     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2283             depth+1 );
2284     Perl_re_indentf( aTHX_  "%s",
2285             depth+1, "------:-----+-----------------\n" );
2286 
2287     for( state=1 ; state < next_alloc ; state ++ ) {
2288         U16 charid;
2289 
2290         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2291             depth+1, (UV)state  );
2292         if ( ! trie->states[ state ].wordnum ) {
2293             Perl_re_printf( aTHX_  "%5s| ","");
2294         } else {
2295             Perl_re_printf( aTHX_  "W%4x| ",
2296                 trie->states[ state ].wordnum
2297             );
2298         }
2299         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2300 	    SV ** const tmp = av_fetch( revcharmap,
2301                                         TRIE_LIST_ITEM(state, charid).forid, 0);
2302 	    if ( tmp ) {
2303                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2304                     colwidth,
2305                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2306                               colwidth,
2307                               PL_colors[0], PL_colors[1],
2308                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2309                               | PERL_PV_ESCAPE_FIRSTCHAR
2310                     ) ,
2311                     TRIE_LIST_ITEM(state, charid).forid,
2312                     (UV)TRIE_LIST_ITEM(state, charid).newstate
2313                 );
2314                 if (!(charid % 10))
2315                     Perl_re_printf( aTHX_  "\n%*s| ",
2316                         (int)((depth * 2) + 14), "");
2317             }
2318         }
2319         Perl_re_printf( aTHX_  "\n");
2320     }
2321 }
2322 
2323 /*
2324   Dumps a fully constructed but uncompressed trie in table form.
2325   This is the normal DFA style state transition table, with a few
2326   twists to facilitate compression later.
2327   Used for debugging make_trie().
2328 */
2329 STATIC void
S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 next_alloc,U32 depth)2330 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2331 			  HV *widecharmap, AV *revcharmap, U32 next_alloc,
2332 			  U32 depth)
2333 {
2334     U32 state;
2335     U16 charid;
2336     SV *sv=sv_newmortal();
2337     int colwidth= widecharmap ? 6 : 4;
2338     GET_RE_DEBUG_FLAGS_DECL;
2339 
2340     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2341 
2342     /*
2343        print out the table precompression so that we can do a visual check
2344        that they are identical.
2345      */
2346 
2347     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2348 
2349     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2350 	SV ** const tmp = av_fetch( revcharmap, charid, 0);
2351         if ( tmp ) {
2352             Perl_re_printf( aTHX_  "%*s",
2353                 colwidth,
2354                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2355 	                    PL_colors[0], PL_colors[1],
2356 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2357 	                    PERL_PV_ESCAPE_FIRSTCHAR
2358                 )
2359             );
2360         }
2361     }
2362 
2363     Perl_re_printf( aTHX_ "\n");
2364     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2365 
2366     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2367         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2368     }
2369 
2370     Perl_re_printf( aTHX_  "\n" );
2371 
2372     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2373 
2374         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2375             depth+1,
2376             (UV)TRIE_NODENUM( state ) );
2377 
2378         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2379             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2380             if (v)
2381                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2382             else
2383                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2384         }
2385         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2386             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2387                                             (UV)trie->trans[ state ].check );
2388         } else {
2389             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2390                                             (UV)trie->trans[ state ].check,
2391             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2392         }
2393     }
2394 }
2395 
2396 #endif
2397 
2398 
2399 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2400   startbranch: the first branch in the whole branch sequence
2401   first      : start branch of sequence of branch-exact nodes.
2402 	       May be the same as startbranch
2403   last       : Thing following the last branch.
2404 	       May be the same as tail.
2405   tail       : item following the branch sequence
2406   count      : words in the sequence
2407   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2408   depth      : indent depth
2409 
2410 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2411 
2412 A trie is an N'ary tree where the branches are determined by digital
2413 decomposition of the key. IE, at the root node you look up the 1st character and
2414 follow that branch repeat until you find the end of the branches. Nodes can be
2415 marked as "accepting" meaning they represent a complete word. Eg:
2416 
2417   /he|she|his|hers/
2418 
2419 would convert into the following structure. Numbers represent states, letters
2420 following numbers represent valid transitions on the letter from that state, if
2421 the number is in square brackets it represents an accepting state, otherwise it
2422 will be in parenthesis.
2423 
2424       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2425       |    |
2426       |   (2)
2427       |    |
2428      (1)   +-i->(6)-+-s->[7]
2429       |
2430       +-s->(3)-+-h->(4)-+-e->[5]
2431 
2432       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2433 
2434 This shows that when matching against the string 'hers' we will begin at state 1
2435 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2436 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2437 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2438 single traverse. We store a mapping from accepting to state to which word was
2439 matched, and then when we have multiple possibilities we try to complete the
2440 rest of the regex in the order in which they occurred in the alternation.
2441 
2442 The only prior NFA like behaviour that would be changed by the TRIE support is
2443 the silent ignoring of duplicate alternations which are of the form:
2444 
2445  / (DUPE|DUPE) X? (?{ ... }) Y /x
2446 
2447 Thus EVAL blocks following a trie may be called a different number of times with
2448 and without the optimisation. With the optimisations dupes will be silently
2449 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2450 the following demonstrates:
2451 
2452  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2453 
2454 which prints out 'word' three times, but
2455 
2456  'words'=~/(word|word|word)(?{ print $1 })S/
2457 
2458 which doesnt print it out at all. This is due to other optimisations kicking in.
2459 
2460 Example of what happens on a structural level:
2461 
2462 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2463 
2464    1: CURLYM[1] {1,32767}(18)
2465    5:   BRANCH(8)
2466    6:     EXACT <ac>(16)
2467    8:   BRANCH(11)
2468    9:     EXACT <ad>(16)
2469   11:   BRANCH(14)
2470   12:     EXACT <ab>(16)
2471   16:   SUCCEED(0)
2472   17:   NOTHING(18)
2473   18: END(0)
2474 
2475 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2476 and should turn into:
2477 
2478    1: CURLYM[1] {1,32767}(18)
2479    5:   TRIE(16)
2480 	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2481 	  <ac>
2482 	  <ad>
2483 	  <ab>
2484   16:   SUCCEED(0)
2485   17:   NOTHING(18)
2486   18: END(0)
2487 
2488 Cases where tail != last would be like /(?foo|bar)baz/:
2489 
2490    1: BRANCH(4)
2491    2:   EXACT <foo>(8)
2492    4: BRANCH(7)
2493    5:   EXACT <bar>(8)
2494    7: TAIL(8)
2495    8: EXACT <baz>(10)
2496   10: END(0)
2497 
2498 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2499 and would end up looking like:
2500 
2501     1: TRIE(8)
2502       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2503 	<foo>
2504 	<bar>
2505    7: TAIL(8)
2506    8: EXACT <baz>(10)
2507   10: END(0)
2508 
2509     d = uvchr_to_utf8_flags(d, uv, 0);
2510 
2511 is the recommended Unicode-aware way of saying
2512 
2513     *(d++) = uv;
2514 */
2515 
2516 #define TRIE_STORE_REVCHAR(val)                                            \
2517     STMT_START {                                                           \
2518 	if (UTF) {							   \
2519             SV *zlopp = newSV(UTF8_MAXBYTES);				   \
2520 	    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);	   \
2521             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2522 	    SvCUR_set(zlopp, kapow - flrbbbbb);				   \
2523 	    SvPOK_on(zlopp);						   \
2524 	    SvUTF8_on(zlopp);						   \
2525 	    av_push(revcharmap, zlopp);					   \
2526 	} else {							   \
2527             char ooooff = (char)val;                                           \
2528 	    av_push(revcharmap, newSVpvn(&ooooff, 1));			   \
2529 	}								   \
2530         } STMT_END
2531 
2532 /* This gets the next character from the input, folding it if not already
2533  * folded. */
2534 #define TRIE_READ_CHAR STMT_START {                                           \
2535     wordlen++;                                                                \
2536     if ( UTF ) {                                                              \
2537         /* if it is UTF then it is either already folded, or does not need    \
2538          * folding */                                                         \
2539         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2540     }                                                                         \
2541     else if (folder == PL_fold_latin1) {                                      \
2542         /* This folder implies Unicode rules, which in the range expressible  \
2543          *  by not UTF is the lower case, with the two exceptions, one of     \
2544          *  which should have been taken care of before calling this */       \
2545         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2546         uvc = toLOWER_L1(*uc);                                                \
2547         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2548         len = 1;                                                              \
2549     } else {                                                                  \
2550         /* raw data, will be folded later if needed */                        \
2551         uvc = (U32)*uc;                                                       \
2552         len = 1;                                                              \
2553     }                                                                         \
2554 } STMT_END
2555 
2556 
2557 
2558 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2559     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2560 	U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
2561 	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2562         TRIE_LIST_LEN( state ) = ging;                          \
2563     }                                                           \
2564     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2565     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2566     TRIE_LIST_CUR( state )++;                                   \
2567 } STMT_END
2568 
2569 #define TRIE_LIST_NEW(state) STMT_START {                       \
2570     Newx( trie->states[ state ].trans.list,                     \
2571 	4, reg_trie_trans_le );                                 \
2572      TRIE_LIST_CUR( state ) = 1;                                \
2573      TRIE_LIST_LEN( state ) = 4;                                \
2574 } STMT_END
2575 
2576 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2577     U16 dupe= trie->states[ state ].wordnum;                    \
2578     regnode * const noper_next = regnext( noper );              \
2579                                                                 \
2580     DEBUG_r({                                                   \
2581         /* store the word for dumping */                        \
2582         SV* tmp;                                                \
2583         if (OP(noper) != NOTHING)                               \
2584             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);	\
2585         else                                                    \
2586             tmp = newSVpvn_utf8( "", 0, UTF );			\
2587         av_push( trie_words, tmp );                             \
2588     });                                                         \
2589                                                                 \
2590     curword++;                                                  \
2591     trie->wordinfo[curword].prev   = 0;                         \
2592     trie->wordinfo[curword].len    = wordlen;                   \
2593     trie->wordinfo[curword].accept = state;                     \
2594                                                                 \
2595     if ( noper_next < tail ) {                                  \
2596         if (!trie->jump)                                        \
2597             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2598                                                  sizeof(U16) ); \
2599         trie->jump[curword] = (U16)(noper_next - convert);      \
2600         if (!jumper)                                            \
2601             jumper = noper_next;                                \
2602         if (!nextbranch)                                        \
2603             nextbranch= regnext(cur);                           \
2604     }                                                           \
2605                                                                 \
2606     if ( dupe ) {                                               \
2607         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2608         /* chain, so that when the bits of chain are later    */\
2609         /* linked together, the dups appear in the chain      */\
2610 	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2611 	trie->wordinfo[dupe].prev = curword;                    \
2612     } else {                                                    \
2613         /* we haven't inserted this word yet.                */ \
2614         trie->states[ state ].wordnum = curword;                \
2615     }                                                           \
2616 } STMT_END
2617 
2618 
2619 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)		\
2620      ( ( base + charid >=  ucharcount					\
2621          && base + charid < ubound					\
2622          && state == trie->trans[ base - ucharcount + charid ].check	\
2623          && trie->trans[ base - ucharcount + charid ].next )		\
2624            ? trie->trans[ base - ucharcount + charid ].next		\
2625            : ( state==1 ? special : 0 )					\
2626       )
2627 
2628 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2629 STMT_START {                                                \
2630     TRIE_BITMAP_SET(trie, uvc);                             \
2631     /* store the folded codepoint */                        \
2632     if ( folder )                                           \
2633         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2634                                                             \
2635     if ( !UTF ) {                                           \
2636         /* store first byte of utf8 representation of */    \
2637         /* variant codepoints */                            \
2638         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2639             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2640         }                                                   \
2641     }                                                       \
2642 } STMT_END
2643 #define MADE_TRIE       1
2644 #define MADE_JUMP_TRIE  2
2645 #define MADE_EXACT_TRIE 4
2646 
2647 STATIC I32
S_make_trie(pTHX_ RExC_state_t * pRExC_state,regnode * startbranch,regnode * first,regnode * last,regnode * tail,U32 word_count,U32 flags,U32 depth)2648 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2649                   regnode *first, regnode *last, regnode *tail,
2650                   U32 word_count, U32 flags, U32 depth)
2651 {
2652     /* first pass, loop through and scan words */
2653     reg_trie_data *trie;
2654     HV *widecharmap = NULL;
2655     AV *revcharmap = newAV();
2656     regnode *cur;
2657     STRLEN len = 0;
2658     UV uvc = 0;
2659     U16 curword = 0;
2660     U32 next_alloc = 0;
2661     regnode *jumper = NULL;
2662     regnode *nextbranch = NULL;
2663     regnode *convert = NULL;
2664     U32 *prev_states; /* temp array mapping each state to previous one */
2665     /* we just use folder as a flag in utf8 */
2666     const U8 * folder = NULL;
2667 
2668     /* in the below add_data call we are storing either 'tu' or 'tuaa'
2669      * which stands for one trie structure, one hash, optionally followed
2670      * by two arrays */
2671 #ifdef DEBUGGING
2672     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2673     AV *trie_words = NULL;
2674     /* along with revcharmap, this only used during construction but both are
2675      * useful during debugging so we store them in the struct when debugging.
2676      */
2677 #else
2678     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2679     STRLEN trie_charcount=0;
2680 #endif
2681     SV *re_trie_maxbuff;
2682     GET_RE_DEBUG_FLAGS_DECL;
2683 
2684     PERL_ARGS_ASSERT_MAKE_TRIE;
2685 #ifndef DEBUGGING
2686     PERL_UNUSED_ARG(depth);
2687 #endif
2688 
2689     switch (flags) {
2690         case EXACT: case EXACT_ONLY8: case EXACTL: break;
2691 	case EXACTFAA:
2692         case EXACTFUP:
2693 	case EXACTFU:
2694 	case EXACTFLU8: folder = PL_fold_latin1; break;
2695 	case EXACTF:  folder = PL_fold; break;
2696         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2697     }
2698 
2699     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2700     trie->refcount = 1;
2701     trie->startstate = 1;
2702     trie->wordcount = word_count;
2703     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2704     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2705     if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
2706 	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2707     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2708                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2709 
2710     DEBUG_r({
2711         trie_words = newAV();
2712     });
2713 
2714     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2715     assert(re_trie_maxbuff);
2716     if (!SvIOK(re_trie_maxbuff)) {
2717         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2718     }
2719     DEBUG_TRIE_COMPILE_r({
2720         Perl_re_indentf( aTHX_
2721           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2722           depth+1,
2723           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2724           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2725     });
2726 
2727    /* Find the node we are going to overwrite */
2728     if ( first == startbranch && OP( last ) != BRANCH ) {
2729         /* whole branch chain */
2730         convert = first;
2731     } else {
2732         /* branch sub-chain */
2733         convert = NEXTOPER( first );
2734     }
2735 
2736     /*  -- First loop and Setup --
2737 
2738        We first traverse the branches and scan each word to determine if it
2739        contains widechars, and how many unique chars there are, this is
2740        important as we have to build a table with at least as many columns as we
2741        have unique chars.
2742 
2743        We use an array of integers to represent the character codes 0..255
2744        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2745        the native representation of the character value as the key and IV's for
2746        the coded index.
2747 
2748        *TODO* If we keep track of how many times each character is used we can
2749        remap the columns so that the table compression later on is more
2750        efficient in terms of memory by ensuring the most common value is in the
2751        middle and the least common are on the outside.  IMO this would be better
2752        than a most to least common mapping as theres a decent chance the most
2753        common letter will share a node with the least common, meaning the node
2754        will not be compressible. With a middle is most common approach the worst
2755        case is when we have the least common nodes twice.
2756 
2757      */
2758 
2759     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2760         regnode *noper = NEXTOPER( cur );
2761         const U8 *uc;
2762         const U8 *e;
2763         int foldlen = 0;
2764         U32 wordlen      = 0;         /* required init */
2765         STRLEN minchars = 0;
2766         STRLEN maxchars = 0;
2767         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2768                                                bitmap?*/
2769 
2770         if (OP(noper) == NOTHING) {
2771             /* skip past a NOTHING at the start of an alternation
2772              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2773              *
2774              * If the next node is not something we are supposed to process
2775              * we will just ignore it due to the condition guarding the
2776              * next block.
2777              */
2778 
2779             regnode *noper_next= regnext(noper);
2780             if (noper_next < tail)
2781                 noper= noper_next;
2782         }
2783 
2784         if (    noper < tail
2785             && (    OP(noper) == flags
2786                 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
2787                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
2788                                          || OP(noper) == EXACTFUP))))
2789         {
2790             uc= (U8*)STRING(noper);
2791             e= uc + STR_LEN(noper);
2792         } else {
2793             trie->minlen= 0;
2794             continue;
2795         }
2796 
2797 
2798         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2799             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2800                                           regardless of encoding */
2801             if (OP( noper ) == EXACTFUP) {
2802                 /* false positives are ok, so just set this */
2803                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2804             }
2805         }
2806 
2807         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2808                                            branch */
2809             TRIE_CHARCOUNT(trie)++;
2810             TRIE_READ_CHAR;
2811 
2812             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2813              * is in effect.  Under /i, this character can match itself, or
2814              * anything that folds to it.  If not under /i, it can match just
2815              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2816              * all fold to k, and all are single characters.   But some folds
2817              * expand to more than one character, so for example LATIN SMALL
2818              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2819              * the string beginning at 'uc' is 'ffi', it could be matched by
2820              * three characters, or just by the one ligature character. (It
2821              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2822              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2823              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2824              * match.)  The trie needs to know the minimum and maximum number
2825              * of characters that could match so that it can use size alone to
2826              * quickly reject many match attempts.  The max is simple: it is
2827              * the number of folded characters in this branch (since a fold is
2828              * never shorter than what folds to it. */
2829 
2830             maxchars++;
2831 
2832             /* And the min is equal to the max if not under /i (indicated by
2833              * 'folder' being NULL), or there are no multi-character folds.  If
2834              * there is a multi-character fold, the min is incremented just
2835              * once, for the character that folds to the sequence.  Each
2836              * character in the sequence needs to be added to the list below of
2837              * characters in the trie, but we count only the first towards the
2838              * min number of characters needed.  This is done through the
2839              * variable 'foldlen', which is returned by the macros that look
2840              * for these sequences as the number of bytes the sequence
2841              * occupies.  Each time through the loop, we decrement 'foldlen' by
2842              * how many bytes the current char occupies.  Only when it reaches
2843              * 0 do we increment 'minchars' or look for another multi-character
2844              * sequence. */
2845             if (folder == NULL) {
2846                 minchars++;
2847             }
2848             else if (foldlen > 0) {
2849                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2850             }
2851             else {
2852                 minchars++;
2853 
2854                 /* See if *uc is the beginning of a multi-character fold.  If
2855                  * so, we decrement the length remaining to look at, to account
2856                  * for the current character this iteration.  (We can use 'uc'
2857                  * instead of the fold returned by TRIE_READ_CHAR because for
2858                  * non-UTF, the latin1_safe macro is smart enough to account
2859                  * for all the unfolded characters, and because for UTF, the
2860                  * string will already have been folded earlier in the
2861                  * compilation process */
2862                 if (UTF) {
2863                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2864                         foldlen -= UTF8SKIP(uc);
2865                     }
2866                 }
2867                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2868                     foldlen--;
2869                 }
2870             }
2871 
2872             /* The current character (and any potential folds) should be added
2873              * to the possible matching characters for this position in this
2874              * branch */
2875             if ( uvc < 256 ) {
2876                 if ( folder ) {
2877                     U8 folded= folder[ (U8) uvc ];
2878                     if ( !trie->charmap[ folded ] ) {
2879                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2880                         TRIE_STORE_REVCHAR( folded );
2881                     }
2882                 }
2883                 if ( !trie->charmap[ uvc ] ) {
2884                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2885                     TRIE_STORE_REVCHAR( uvc );
2886                 }
2887                 if ( set_bit ) {
2888 		    /* store the codepoint in the bitmap, and its folded
2889 		     * equivalent. */
2890                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2891                     set_bit = 0; /* We've done our bit :-) */
2892                 }
2893             } else {
2894 
2895                 /* XXX We could come up with the list of code points that fold
2896                  * to this using PL_utf8_foldclosures, except not for
2897                  * multi-char folds, as there may be multiple combinations
2898                  * there that could work, which needs to wait until runtime to
2899                  * resolve (The comment about LIGATURE FFI above is such an
2900                  * example */
2901 
2902                 SV** svpp;
2903                 if ( !widecharmap )
2904                     widecharmap = newHV();
2905 
2906                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2907 
2908                 if ( !svpp )
2909                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2910 
2911                 if ( !SvTRUE( *svpp ) ) {
2912                     sv_setiv( *svpp, ++trie->uniquecharcount );
2913                     TRIE_STORE_REVCHAR(uvc);
2914                 }
2915             }
2916         } /* end loop through characters in this branch of the trie */
2917 
2918         /* We take the min and max for this branch and combine to find the min
2919          * and max for all branches processed so far */
2920         if( cur == first ) {
2921             trie->minlen = minchars;
2922             trie->maxlen = maxchars;
2923         } else if (minchars < trie->minlen) {
2924             trie->minlen = minchars;
2925         } else if (maxchars > trie->maxlen) {
2926             trie->maxlen = maxchars;
2927         }
2928     } /* end first pass */
2929     DEBUG_TRIE_COMPILE_r(
2930         Perl_re_indentf( aTHX_
2931                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2932                 depth+1,
2933                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2934 		(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2935 		(int)trie->minlen, (int)trie->maxlen )
2936     );
2937 
2938     /*
2939         We now know what we are dealing with in terms of unique chars and
2940         string sizes so we can calculate how much memory a naive
2941         representation using a flat table  will take. If it's over a reasonable
2942         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2943         conservative but potentially much slower representation using an array
2944         of lists.
2945 
2946         At the end we convert both representations into the same compressed
2947         form that will be used in regexec.c for matching with. The latter
2948         is a form that cannot be used to construct with but has memory
2949         properties similar to the list form and access properties similar
2950         to the table form making it both suitable for fast searches and
2951         small enough that its feasable to store for the duration of a program.
2952 
2953         See the comment in the code where the compressed table is produced
2954         inplace from the flat tabe representation for an explanation of how
2955         the compression works.
2956 
2957     */
2958 
2959 
2960     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2961     prev_states[1] = 0;
2962 
2963     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2964                                                     > SvIV(re_trie_maxbuff) )
2965     {
2966         /*
2967             Second Pass -- Array Of Lists Representation
2968 
2969             Each state will be represented by a list of charid:state records
2970             (reg_trie_trans_le) the first such element holds the CUR and LEN
2971             points of the allocated array. (See defines above).
2972 
2973             We build the initial structure using the lists, and then convert
2974             it into the compressed table form which allows faster lookups
2975             (but cant be modified once converted).
2976         */
2977 
2978         STRLEN transcount = 1;
2979 
2980         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
2981             depth+1));
2982 
2983 	trie->states = (reg_trie_state *)
2984 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2985 				  sizeof(reg_trie_state) );
2986         TRIE_LIST_NEW(1);
2987         next_alloc = 2;
2988 
2989         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2990 
2991             regnode *noper   = NEXTOPER( cur );
2992 	    U32 state        = 1;         /* required init */
2993 	    U16 charid       = 0;         /* sanity init */
2994             U32 wordlen      = 0;         /* required init */
2995 
2996             if (OP(noper) == NOTHING) {
2997                 regnode *noper_next= regnext(noper);
2998                 if (noper_next < tail)
2999                     noper= noper_next;
3000                 /* we will undo this assignment if noper does not
3001                  * point at a trieable type in the else clause of
3002                  * the following statement. */
3003             }
3004 
3005             if (    noper < tail
3006                 && (    OP(noper) == flags
3007                     || (flags == EXACT && OP(noper) == EXACT_ONLY8)
3008                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
3009                                              || OP(noper) == EXACTFUP))))
3010             {
3011                 const U8 *uc= (U8*)STRING(noper);
3012                 const U8 *e= uc + STR_LEN(noper);
3013 
3014                 for ( ; uc < e ; uc += len ) {
3015 
3016                     TRIE_READ_CHAR;
3017 
3018                     if ( uvc < 256 ) {
3019                         charid = trie->charmap[ uvc ];
3020 		    } else {
3021                         SV** const svpp = hv_fetch( widecharmap,
3022                                                     (char*)&uvc,
3023                                                     sizeof( UV ),
3024                                                     0);
3025                         if ( !svpp ) {
3026                             charid = 0;
3027                         } else {
3028                             charid=(U16)SvIV( *svpp );
3029                         }
3030 		    }
3031                     /* charid is now 0 if we dont know the char read, or
3032                      * nonzero if we do */
3033                     if ( charid ) {
3034 
3035                         U16 check;
3036                         U32 newstate = 0;
3037 
3038                         charid--;
3039                         if ( !trie->states[ state ].trans.list ) {
3040                             TRIE_LIST_NEW( state );
3041 			}
3042                         for ( check = 1;
3043                               check <= TRIE_LIST_USED( state );
3044                               check++ )
3045                         {
3046                             if ( TRIE_LIST_ITEM( state, check ).forid
3047                                                                     == charid )
3048                             {
3049                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3050                                 break;
3051                             }
3052                         }
3053                         if ( ! newstate ) {
3054                             newstate = next_alloc++;
3055 			    prev_states[newstate] = state;
3056                             TRIE_LIST_PUSH( state, charid, newstate );
3057                             transcount++;
3058                         }
3059                         state = newstate;
3060                     } else {
3061                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3062 		    }
3063 		}
3064             } else {
3065                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3066                  * on a trieable type. So we need to reset noper back to point at the first regop
3067                  * in the branch before we call TRIE_HANDLE_WORD()
3068                 */
3069                 noper= NEXTOPER(cur);
3070             }
3071             TRIE_HANDLE_WORD(state);
3072 
3073         } /* end second pass */
3074 
3075         /* next alloc is the NEXT state to be allocated */
3076         trie->statecount = next_alloc;
3077         trie->states = (reg_trie_state *)
3078 	    PerlMemShared_realloc( trie->states,
3079 				   next_alloc
3080 				   * sizeof(reg_trie_state) );
3081 
3082         /* and now dump it out before we compress it */
3083         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3084 							 revcharmap, next_alloc,
3085 							 depth+1)
3086         );
3087 
3088         trie->trans = (reg_trie_trans *)
3089 	    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3090         {
3091             U32 state;
3092             U32 tp = 0;
3093             U32 zp = 0;
3094 
3095 
3096             for( state=1 ; state < next_alloc ; state ++ ) {
3097                 U32 base=0;
3098 
3099                 /*
3100                 DEBUG_TRIE_COMPILE_MORE_r(
3101                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
3102                 );
3103                 */
3104 
3105                 if (trie->states[state].trans.list) {
3106                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3107                     U16 maxid=minid;
3108 		    U16 idx;
3109 
3110                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3111 			const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3112 			if ( forid < minid ) {
3113 			    minid=forid;
3114 			} else if ( forid > maxid ) {
3115 			    maxid=forid;
3116 			}
3117                     }
3118                     if ( transcount < tp + maxid - minid + 1) {
3119                         transcount *= 2;
3120 			trie->trans = (reg_trie_trans *)
3121 			    PerlMemShared_realloc( trie->trans,
3122 						     transcount
3123 						     * sizeof(reg_trie_trans) );
3124                         Zero( trie->trans + (transcount / 2),
3125                               transcount / 2,
3126                               reg_trie_trans );
3127                     }
3128                     base = trie->uniquecharcount + tp - minid;
3129                     if ( maxid == minid ) {
3130                         U32 set = 0;
3131                         for ( ; zp < tp ; zp++ ) {
3132                             if ( ! trie->trans[ zp ].next ) {
3133                                 base = trie->uniquecharcount + zp - minid;
3134                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3135                                                                    1).newstate;
3136                                 trie->trans[ zp ].check = state;
3137                                 set = 1;
3138                                 break;
3139                             }
3140                         }
3141                         if ( !set ) {
3142                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3143                                                                    1).newstate;
3144                             trie->trans[ tp ].check = state;
3145                             tp++;
3146                             zp = tp;
3147                         }
3148                     } else {
3149                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3150                             const U32 tid = base
3151                                            - trie->uniquecharcount
3152                                            + TRIE_LIST_ITEM( state, idx ).forid;
3153                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3154                                                                 idx ).newstate;
3155                             trie->trans[ tid ].check = state;
3156                         }
3157                         tp += ( maxid - minid + 1 );
3158                     }
3159                     Safefree(trie->states[ state ].trans.list);
3160                 }
3161                 /*
3162                 DEBUG_TRIE_COMPILE_MORE_r(
3163                     Perl_re_printf( aTHX_  " base: %d\n",base);
3164                 );
3165                 */
3166                 trie->states[ state ].trans.base=base;
3167             }
3168             trie->lasttrans = tp + 1;
3169         }
3170     } else {
3171         /*
3172            Second Pass -- Flat Table Representation.
3173 
3174            we dont use the 0 slot of either trans[] or states[] so we add 1 to
3175            each.  We know that we will need Charcount+1 trans at most to store
3176            the data (one row per char at worst case) So we preallocate both
3177            structures assuming worst case.
3178 
3179            We then construct the trie using only the .next slots of the entry
3180            structs.
3181 
3182            We use the .check field of the first entry of the node temporarily
3183            to make compression both faster and easier by keeping track of how
3184            many non zero fields are in the node.
3185 
3186            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3187            transition.
3188 
3189            There are two terms at use here: state as a TRIE_NODEIDX() which is
3190            a number representing the first entry of the node, and state as a
3191            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3192            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3193            if there are 2 entrys per node. eg:
3194 
3195              A B       A B
3196           1. 2 4    1. 3 7
3197           2. 0 3    3. 0 5
3198           3. 0 0    5. 0 0
3199           4. 0 0    7. 0 0
3200 
3201            The table is internally in the right hand, idx form. However as we
3202            also have to deal with the states array which is indexed by nodenum
3203            we have to use TRIE_NODENUM() to convert.
3204 
3205         */
3206         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
3207             depth+1));
3208 
3209 	trie->trans = (reg_trie_trans *)
3210 	    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3211 				  * trie->uniquecharcount + 1,
3212 				  sizeof(reg_trie_trans) );
3213         trie->states = (reg_trie_state *)
3214 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3215 				  sizeof(reg_trie_state) );
3216         next_alloc = trie->uniquecharcount + 1;
3217 
3218 
3219         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3220 
3221             regnode *noper   = NEXTOPER( cur );
3222 
3223             U32 state        = 1;         /* required init */
3224 
3225             U16 charid       = 0;         /* sanity init */
3226             U32 accept_state = 0;         /* sanity init */
3227 
3228             U32 wordlen      = 0;         /* required init */
3229 
3230             if (OP(noper) == NOTHING) {
3231                 regnode *noper_next= regnext(noper);
3232                 if (noper_next < tail)
3233                     noper= noper_next;
3234                 /* we will undo this assignment if noper does not
3235                  * point at a trieable type in the else clause of
3236                  * the following statement. */
3237             }
3238 
3239             if (    noper < tail
3240                 && (    OP(noper) == flags
3241                     || (flags == EXACT && OP(noper) == EXACT_ONLY8)
3242                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
3243                                              || OP(noper) == EXACTFUP))))
3244             {
3245                 const U8 *uc= (U8*)STRING(noper);
3246                 const U8 *e= uc + STR_LEN(noper);
3247 
3248                 for ( ; uc < e ; uc += len ) {
3249 
3250                     TRIE_READ_CHAR;
3251 
3252                     if ( uvc < 256 ) {
3253                         charid = trie->charmap[ uvc ];
3254                     } else {
3255                         SV* const * const svpp = hv_fetch( widecharmap,
3256                                                            (char*)&uvc,
3257                                                            sizeof( UV ),
3258                                                            0);
3259                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3260                     }
3261                     if ( charid ) {
3262                         charid--;
3263                         if ( !trie->trans[ state + charid ].next ) {
3264                             trie->trans[ state + charid ].next = next_alloc;
3265                             trie->trans[ state ].check++;
3266 			    prev_states[TRIE_NODENUM(next_alloc)]
3267 				    = TRIE_NODENUM(state);
3268                             next_alloc += trie->uniquecharcount;
3269                         }
3270                         state = trie->trans[ state + charid ].next;
3271                     } else {
3272                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3273                     }
3274                     /* charid is now 0 if we dont know the char read, or
3275                      * nonzero if we do */
3276                 }
3277             } else {
3278                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3279                  * on a trieable type. So we need to reset noper back to point at the first regop
3280                  * in the branch before we call TRIE_HANDLE_WORD().
3281                 */
3282                 noper= NEXTOPER(cur);
3283             }
3284             accept_state = TRIE_NODENUM( state );
3285             TRIE_HANDLE_WORD(accept_state);
3286 
3287         } /* end second pass */
3288 
3289         /* and now dump it out before we compress it */
3290         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3291 							  revcharmap,
3292 							  next_alloc, depth+1));
3293 
3294         {
3295         /*
3296            * Inplace compress the table.*
3297 
3298            For sparse data sets the table constructed by the trie algorithm will
3299            be mostly 0/FAIL transitions or to put it another way mostly empty.
3300            (Note that leaf nodes will not contain any transitions.)
3301 
3302            This algorithm compresses the tables by eliminating most such
3303            transitions, at the cost of a modest bit of extra work during lookup:
3304 
3305            - Each states[] entry contains a .base field which indicates the
3306            index in the state[] array wheres its transition data is stored.
3307 
3308            - If .base is 0 there are no valid transitions from that node.
3309 
3310            - If .base is nonzero then charid is added to it to find an entry in
3311            the trans array.
3312 
3313            -If trans[states[state].base+charid].check!=state then the
3314            transition is taken to be a 0/Fail transition. Thus if there are fail
3315            transitions at the front of the node then the .base offset will point
3316            somewhere inside the previous nodes data (or maybe even into a node
3317            even earlier), but the .check field determines if the transition is
3318            valid.
3319 
3320            XXX - wrong maybe?
3321            The following process inplace converts the table to the compressed
3322            table: We first do not compress the root node 1,and mark all its
3323            .check pointers as 1 and set its .base pointer as 1 as well. This
3324            allows us to do a DFA construction from the compressed table later,
3325            and ensures that any .base pointers we calculate later are greater
3326            than 0.
3327 
3328            - We set 'pos' to indicate the first entry of the second node.
3329 
3330            - We then iterate over the columns of the node, finding the first and
3331            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3332            and set the .check pointers accordingly, and advance pos
3333            appropriately and repreat for the next node. Note that when we copy
3334            the next pointers we have to convert them from the original
3335            NODEIDX form to NODENUM form as the former is not valid post
3336            compression.
3337 
3338            - If a node has no transitions used we mark its base as 0 and do not
3339            advance the pos pointer.
3340 
3341            - If a node only has one transition we use a second pointer into the
3342            structure to fill in allocated fail transitions from other states.
3343            This pointer is independent of the main pointer and scans forward
3344            looking for null transitions that are allocated to a state. When it
3345            finds one it writes the single transition into the "hole".  If the
3346            pointer doesnt find one the single transition is appended as normal.
3347 
3348            - Once compressed we can Renew/realloc the structures to release the
3349            excess space.
3350 
3351            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3352            specifically Fig 3.47 and the associated pseudocode.
3353 
3354            demq
3355         */
3356         const U32 laststate = TRIE_NODENUM( next_alloc );
3357 	U32 state, charid;
3358         U32 pos = 0, zp=0;
3359         trie->statecount = laststate;
3360 
3361         for ( state = 1 ; state < laststate ; state++ ) {
3362             U8 flag = 0;
3363 	    const U32 stateidx = TRIE_NODEIDX( state );
3364 	    const U32 o_used = trie->trans[ stateidx ].check;
3365 	    U32 used = trie->trans[ stateidx ].check;
3366             trie->trans[ stateidx ].check = 0;
3367 
3368             for ( charid = 0;
3369                   used && charid < trie->uniquecharcount;
3370                   charid++ )
3371             {
3372                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3373                     if ( trie->trans[ stateidx + charid ].next ) {
3374                         if (o_used == 1) {
3375                             for ( ; zp < pos ; zp++ ) {
3376                                 if ( ! trie->trans[ zp ].next ) {
3377                                     break;
3378                                 }
3379                             }
3380                             trie->states[ state ].trans.base
3381                                                     = zp
3382                                                       + trie->uniquecharcount
3383                                                       - charid ;
3384                             trie->trans[ zp ].next
3385                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3386                                                              + charid ].next );
3387                             trie->trans[ zp ].check = state;
3388                             if ( ++zp > pos ) pos = zp;
3389                             break;
3390                         }
3391                         used--;
3392                     }
3393                     if ( !flag ) {
3394                         flag = 1;
3395                         trie->states[ state ].trans.base
3396                                        = pos + trie->uniquecharcount - charid ;
3397                     }
3398                     trie->trans[ pos ].next
3399                         = SAFE_TRIE_NODENUM(
3400                                        trie->trans[ stateidx + charid ].next );
3401                     trie->trans[ pos ].check = state;
3402                     pos++;
3403                 }
3404             }
3405         }
3406         trie->lasttrans = pos + 1;
3407         trie->states = (reg_trie_state *)
3408 	    PerlMemShared_realloc( trie->states, laststate
3409 				   * sizeof(reg_trie_state) );
3410         DEBUG_TRIE_COMPILE_MORE_r(
3411             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3412                 depth+1,
3413                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3414                        + 1 ),
3415                 (IV)next_alloc,
3416                 (IV)pos,
3417                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3418             );
3419 
3420         } /* end table compress */
3421     }
3422     DEBUG_TRIE_COMPILE_MORE_r(
3423             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3424                 depth+1,
3425                 (UV)trie->statecount,
3426                 (UV)trie->lasttrans)
3427     );
3428     /* resize the trans array to remove unused space */
3429     trie->trans = (reg_trie_trans *)
3430 	PerlMemShared_realloc( trie->trans, trie->lasttrans
3431 			       * sizeof(reg_trie_trans) );
3432 
3433     {   /* Modify the program and insert the new TRIE node */
3434         U8 nodetype =(U8)(flags & 0xFF);
3435         char *str=NULL;
3436 
3437 #ifdef DEBUGGING
3438         regnode *optimize = NULL;
3439 #ifdef RE_TRACK_PATTERN_OFFSETS
3440 
3441         U32 mjd_offset = 0;
3442         U32 mjd_nodelen = 0;
3443 #endif /* RE_TRACK_PATTERN_OFFSETS */
3444 #endif /* DEBUGGING */
3445         /*
3446            This means we convert either the first branch or the first Exact,
3447            depending on whether the thing following (in 'last') is a branch
3448            or not and whther first is the startbranch (ie is it a sub part of
3449            the alternation or is it the whole thing.)
3450            Assuming its a sub part we convert the EXACT otherwise we convert
3451            the whole branch sequence, including the first.
3452          */
3453         /* Find the node we are going to overwrite */
3454         if ( first != startbranch || OP( last ) == BRANCH ) {
3455             /* branch sub-chain */
3456             NEXT_OFF( first ) = (U16)(last - first);
3457 #ifdef RE_TRACK_PATTERN_OFFSETS
3458             DEBUG_r({
3459                 mjd_offset= Node_Offset((convert));
3460                 mjd_nodelen= Node_Length((convert));
3461             });
3462 #endif
3463             /* whole branch chain */
3464         }
3465 #ifdef RE_TRACK_PATTERN_OFFSETS
3466         else {
3467             DEBUG_r({
3468                 const  regnode *nop = NEXTOPER( convert );
3469                 mjd_offset= Node_Offset((nop));
3470                 mjd_nodelen= Node_Length((nop));
3471             });
3472         }
3473         DEBUG_OPTIMISE_r(
3474             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3475                 depth+1,
3476                 (UV)mjd_offset, (UV)mjd_nodelen)
3477         );
3478 #endif
3479         /* But first we check to see if there is a common prefix we can
3480            split out as an EXACT and put in front of the TRIE node.  */
3481         trie->startstate= 1;
3482         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3483             /* we want to find the first state that has more than
3484              * one transition, if that state is not the first state
3485              * then we have a common prefix which we can remove.
3486              */
3487             U32 state;
3488             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3489                 U32 ofs = 0;
3490                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3491                                        transition, -1 means none */
3492                 U32 count = 0;
3493                 const U32 base = trie->states[ state ].trans.base;
3494 
3495                 /* does this state terminate an alternation? */
3496                 if ( trie->states[state].wordnum )
3497                         count = 1;
3498 
3499                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3500                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3501                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3502                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3503                     {
3504                         if ( ++count > 1 ) {
3505                             /* we have more than one transition */
3506                             SV **tmp;
3507                             U8 *ch;
3508                             /* if this is the first state there is no common prefix
3509                              * to extract, so we can exit */
3510                             if ( state == 1 ) break;
3511                             tmp = av_fetch( revcharmap, ofs, 0);
3512                             ch = (U8*)SvPV_nolen_const( *tmp );
3513 
3514                             /* if we are on count 2 then we need to initialize the
3515                              * bitmap, and store the previous char if there was one
3516                              * in it*/
3517                             if ( count == 2 ) {
3518                                 /* clear the bitmap */
3519                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3520                                 DEBUG_OPTIMISE_r(
3521                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3522                                         depth+1,
3523                                         (UV)state));
3524                                 if (first_ofs >= 0) {
3525                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3526 				    const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3527 
3528                                     TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3529                                     DEBUG_OPTIMISE_r(
3530                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3531                                     );
3532 				}
3533 			    }
3534                             /* store the current firstchar in the bitmap */
3535                             TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3536                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3537 			}
3538                         first_ofs = ofs;
3539 		    }
3540                 }
3541                 if ( count == 1 ) {
3542                     /* This state has only one transition, its transition is part
3543                      * of a common prefix - we need to concatenate the char it
3544                      * represents to what we have so far. */
3545                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3546                     STRLEN len;
3547                     char *ch = SvPV( *tmp, len );
3548                     DEBUG_OPTIMISE_r({
3549                         SV *sv=sv_newmortal();
3550                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3551                             depth+1,
3552                             (UV)state, (UV)first_ofs,
3553                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3554 	                        PL_colors[0], PL_colors[1],
3555 	                        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3556 	                        PERL_PV_ESCAPE_FIRSTCHAR
3557                             )
3558                         );
3559                     });
3560                     if ( state==1 ) {
3561                         OP( convert ) = nodetype;
3562                         str=STRING(convert);
3563                         STR_LEN(convert)=0;
3564                     }
3565                     STR_LEN(convert) += len;
3566                     while (len--)
3567                         *str++ = *ch++;
3568 		} else {
3569 #ifdef DEBUGGING
3570 		    if (state>1)
3571                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3572 #endif
3573 		    break;
3574 		}
3575 	    }
3576 	    trie->prefixlen = (state-1);
3577             if (str) {
3578                 regnode *n = convert+NODE_SZ_STR(convert);
3579                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3580                 trie->startstate = state;
3581                 trie->minlen -= (state - 1);
3582                 trie->maxlen -= (state - 1);
3583 #ifdef DEBUGGING
3584                /* At least the UNICOS C compiler choked on this
3585                 * being argument to DEBUG_r(), so let's just have
3586                 * it right here. */
3587                if (
3588 #ifdef PERL_EXT_RE_BUILD
3589                    1
3590 #else
3591                    DEBUG_r_TEST
3592 #endif
3593                    ) {
3594                    regnode *fix = convert;
3595                    U32 word = trie->wordcount;
3596 #ifdef RE_TRACK_PATTERN_OFFSETS
3597                    mjd_nodelen++;
3598 #endif
3599                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3600                    while( ++fix < n ) {
3601                        Set_Node_Offset_Length(fix, 0, 0);
3602                    }
3603                    while (word--) {
3604                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3605                        if (tmp) {
3606                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3607                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3608                            else
3609                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3610                        }
3611                    }
3612                }
3613 #endif
3614                 if (trie->maxlen) {
3615                     convert = n;
3616 		} else {
3617                     NEXT_OFF(convert) = (U16)(tail - convert);
3618                     DEBUG_r(optimize= n);
3619                 }
3620             }
3621         }
3622         if (!jumper)
3623             jumper = last;
3624         if ( trie->maxlen ) {
3625 	    NEXT_OFF( convert ) = (U16)(tail - convert);
3626 	    ARG_SET( convert, data_slot );
3627 	    /* Store the offset to the first unabsorbed branch in
3628 	       jump[0], which is otherwise unused by the jump logic.
3629 	       We use this when dumping a trie and during optimisation. */
3630 	    if (trie->jump)
3631 	        trie->jump[0] = (U16)(nextbranch - convert);
3632 
3633             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3634 	     *   and there is a bitmap
3635 	     *   and the first "jump target" node we found leaves enough room
3636 	     * then convert the TRIE node into a TRIEC node, with the bitmap
3637 	     * embedded inline in the opcode - this is hypothetically faster.
3638 	     */
3639             if ( !trie->states[trie->startstate].wordnum
3640 		 && trie->bitmap
3641 		 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3642             {
3643                 OP( convert ) = TRIEC;
3644                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3645                 PerlMemShared_free(trie->bitmap);
3646                 trie->bitmap= NULL;
3647             } else
3648                 OP( convert ) = TRIE;
3649 
3650             /* store the type in the flags */
3651             convert->flags = nodetype;
3652             DEBUG_r({
3653             optimize = convert
3654                       + NODE_STEP_REGNODE
3655                       + regarglen[ OP( convert ) ];
3656             });
3657             /* XXX We really should free up the resource in trie now,
3658                    as we won't use them - (which resources?) dmq */
3659         }
3660         /* needed for dumping*/
3661         DEBUG_r(if (optimize) {
3662             regnode *opt = convert;
3663 
3664             while ( ++opt < optimize) {
3665                 Set_Node_Offset_Length(opt, 0, 0);
3666             }
3667             /*
3668                 Try to clean up some of the debris left after the
3669                 optimisation.
3670              */
3671             while( optimize < jumper ) {
3672                 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3673                 OP( optimize ) = OPTIMIZED;
3674                 Set_Node_Offset_Length(optimize, 0, 0);
3675                 optimize++;
3676             }
3677             Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3678         });
3679     } /* end node insert */
3680 
3681     /*  Finish populating the prev field of the wordinfo array.  Walk back
3682      *  from each accept state until we find another accept state, and if
3683      *  so, point the first word's .prev field at the second word. If the
3684      *  second already has a .prev field set, stop now. This will be the
3685      *  case either if we've already processed that word's accept state,
3686      *  or that state had multiple words, and the overspill words were
3687      *  already linked up earlier.
3688      */
3689     {
3690 	U16 word;
3691 	U32 state;
3692 	U16 prev;
3693 
3694 	for (word=1; word <= trie->wordcount; word++) {
3695 	    prev = 0;
3696 	    if (trie->wordinfo[word].prev)
3697 		continue;
3698 	    state = trie->wordinfo[word].accept;
3699 	    while (state) {
3700 		state = prev_states[state];
3701 		if (!state)
3702 		    break;
3703 		prev = trie->states[state].wordnum;
3704 		if (prev)
3705 		    break;
3706 	    }
3707 	    trie->wordinfo[word].prev = prev;
3708 	}
3709 	Safefree(prev_states);
3710     }
3711 
3712 
3713     /* and now dump out the compressed format */
3714     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3715 
3716     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3717 #ifdef DEBUGGING
3718     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3719     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3720 #else
3721     SvREFCNT_dec_NN(revcharmap);
3722 #endif
3723     return trie->jump
3724            ? MADE_JUMP_TRIE
3725            : trie->startstate>1
3726              ? MADE_EXACT_TRIE
3727              : MADE_TRIE;
3728 }
3729 
3730 STATIC regnode *
S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t * pRExC_state,regnode * source,U32 depth)3731 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3732 {
3733 /* The Trie is constructed and compressed now so we can build a fail array if
3734  * it's needed
3735 
3736    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3737    3.32 in the
3738    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3739    Ullman 1985/88
3740    ISBN 0-201-10088-6
3741 
3742    We find the fail state for each state in the trie, this state is the longest
3743    proper suffix of the current state's 'word' that is also a proper prefix of
3744    another word in our trie. State 1 represents the word '' and is thus the
3745    default fail state. This allows the DFA not to have to restart after its
3746    tried and failed a word at a given point, it simply continues as though it
3747    had been matching the other word in the first place.
3748    Consider
3749       'abcdgu'=~/abcdefg|cdgu/
3750    When we get to 'd' we are still matching the first word, we would encounter
3751    'g' which would fail, which would bring us to the state representing 'd' in
3752    the second word where we would try 'g' and succeed, proceeding to match
3753    'cdgu'.
3754  */
3755  /* add a fail transition */
3756     const U32 trie_offset = ARG(source);
3757     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3758     U32 *q;
3759     const U32 ucharcount = trie->uniquecharcount;
3760     const U32 numstates = trie->statecount;
3761     const U32 ubound = trie->lasttrans + ucharcount;
3762     U32 q_read = 0;
3763     U32 q_write = 0;
3764     U32 charid;
3765     U32 base = trie->states[ 1 ].trans.base;
3766     U32 *fail;
3767     reg_ac_data *aho;
3768     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3769     regnode *stclass;
3770     GET_RE_DEBUG_FLAGS_DECL;
3771 
3772     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3773     PERL_UNUSED_CONTEXT;
3774 #ifndef DEBUGGING
3775     PERL_UNUSED_ARG(depth);
3776 #endif
3777 
3778     if ( OP(source) == TRIE ) {
3779         struct regnode_1 *op = (struct regnode_1 *)
3780             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3781         StructCopy(source, op, struct regnode_1);
3782         stclass = (regnode *)op;
3783     } else {
3784         struct regnode_charclass *op = (struct regnode_charclass *)
3785             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3786         StructCopy(source, op, struct regnode_charclass);
3787         stclass = (regnode *)op;
3788     }
3789     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3790 
3791     ARG_SET( stclass, data_slot );
3792     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3793     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3794     aho->trie=trie_offset;
3795     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3796     Copy( trie->states, aho->states, numstates, reg_trie_state );
3797     Newx( q, numstates, U32);
3798     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3799     aho->refcount = 1;
3800     fail = aho->fail;
3801     /* initialize fail[0..1] to be 1 so that we always have
3802        a valid final fail state */
3803     fail[ 0 ] = fail[ 1 ] = 1;
3804 
3805     for ( charid = 0; charid < ucharcount ; charid++ ) {
3806 	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3807 	if ( newstate ) {
3808             q[ q_write ] = newstate;
3809             /* set to point at the root */
3810             fail[ q[ q_write++ ] ]=1;
3811         }
3812     }
3813     while ( q_read < q_write) {
3814 	const U32 cur = q[ q_read++ % numstates ];
3815         base = trie->states[ cur ].trans.base;
3816 
3817         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3818 	    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3819 	    if (ch_state) {
3820                 U32 fail_state = cur;
3821                 U32 fail_base;
3822                 do {
3823                     fail_state = fail[ fail_state ];
3824                     fail_base = aho->states[ fail_state ].trans.base;
3825                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3826 
3827                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3828                 fail[ ch_state ] = fail_state;
3829                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3830                 {
3831                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3832                 }
3833                 q[ q_write++ % numstates] = ch_state;
3834             }
3835         }
3836     }
3837     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3838        when we fail in state 1, this allows us to use the
3839        charclass scan to find a valid start char. This is based on the principle
3840        that theres a good chance the string being searched contains lots of stuff
3841        that cant be a start char.
3842      */
3843     fail[ 0 ] = fail[ 1 ] = 0;
3844     DEBUG_TRIE_COMPILE_r({
3845         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3846                       depth, (UV)numstates
3847         );
3848         for( q_read=1; q_read<numstates; q_read++ ) {
3849             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3850         }
3851         Perl_re_printf( aTHX_  "\n");
3852     });
3853     Safefree(q);
3854     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3855     return stclass;
3856 }
3857 
3858 
3859 /* The below joins as many adjacent EXACTish nodes as possible into a single
3860  * one.  The regop may be changed if the node(s) contain certain sequences that
3861  * require special handling.  The joining is only done if:
3862  * 1) there is room in the current conglomerated node to entirely contain the
3863  *    next one.
3864  * 2) they are compatible node types
3865  *
3866  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3867  * these get optimized out
3868  *
3869  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3870  * as possible, even if that means splitting an existing node so that its first
3871  * part is moved to the preceeding node.  This would maximise the efficiency of
3872  * memEQ during matching.
3873  *
3874  * If a node is to match under /i (folded), the number of characters it matches
3875  * can be different than its character length if it contains a multi-character
3876  * fold.  *min_subtract is set to the total delta number of characters of the
3877  * input nodes.
3878  *
3879  * And *unfolded_multi_char is set to indicate whether or not the node contains
3880  * an unfolded multi-char fold.  This happens when it won't be known until
3881  * runtime whether the fold is valid or not; namely
3882  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3883  *      target string being matched against turns out to be UTF-8 is that fold
3884  *      valid; or
3885  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
3886  *      runtime.
3887  * (Multi-char folds whose components are all above the Latin1 range are not
3888  * run-time locale dependent, and have already been folded by the time this
3889  * function is called.)
3890  *
3891  * This is as good a place as any to discuss the design of handling these
3892  * multi-character fold sequences.  It's been wrong in Perl for a very long
3893  * time.  There are three code points in Unicode whose multi-character folds
3894  * were long ago discovered to mess things up.  The previous designs for
3895  * dealing with these involved assigning a special node for them.  This
3896  * approach doesn't always work, as evidenced by this example:
3897  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3898  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3899  * would match just the \xDF, it won't be able to handle the case where a
3900  * successful match would have to cross the node's boundary.  The new approach
3901  * that hopefully generally solves the problem generates an EXACTFUP node
3902  * that is "sss" in this case.
3903  *
3904  * It turns out that there are problems with all multi-character folds, and not
3905  * just these three.  Now the code is general, for all such cases.  The
3906  * approach taken is:
3907  * 1)   This routine examines each EXACTFish node that could contain multi-
3908  *      character folded sequences.  Since a single character can fold into
3909  *      such a sequence, the minimum match length for this node is less than
3910  *      the number of characters in the node.  This routine returns in
3911  *      *min_subtract how many characters to subtract from the the actual
3912  *      length of the string to get a real minimum match length; it is 0 if
3913  *      there are no multi-char foldeds.  This delta is used by the caller to
3914  *      adjust the min length of the match, and the delta between min and max,
3915  *      so that the optimizer doesn't reject these possibilities based on size
3916  *      constraints.
3917  *
3918  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
3919  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
3920  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
3921  *      EXACTFU nodes.  The node type of such nodes is then changed to
3922  *      EXACTFUP, indicating it is problematic, and needs careful handling.
3923  *      (The procedures in step 1) above are sufficient to handle this case in
3924  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
3925  *      the only case where there is a possible fold length change in non-UTF-8
3926  *      patterns.  By reserving a special node type for problematic cases, the
3927  *      far more common regular EXACTFU nodes can be processed faster.
3928  *      regexec.c takes advantage of this.
3929  *
3930  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
3931  *      problematic cases.   These all only occur when the pattern is not
3932  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
3933  *      length change, it handles the situation where the string cannot be
3934  *      entirely folded.  The strings in an EXACTFish node are folded as much
3935  *      as possible during compilation in regcomp.c.  This saves effort in
3936  *      regex matching.  By using an EXACTFUP node when it is not possible to
3937  *      fully fold at compile time, regexec.c can know that everything in an
3938  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
3939  *      case where folding in EXACTFU nodes can't be done at compile time is
3940  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
3941  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
3942  *      handle two very different cases.  Alternatively, there could have been
3943  *      a node type where there are length changes, one for unfolded, and one
3944  *      for both.  If yet another special case needed to be created, the number
3945  *      of required node types would have to go to 7.  khw figures that even
3946  *      though there are plenty of node types to spare, that the maintenance
3947  *      cost wasn't worth the small speedup of doing it that way, especially
3948  *      since he thinks the MICRO SIGN is rarely encountered in practice.
3949  *
3950  *      There are other cases where folding isn't done at compile time, but
3951  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
3952  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
3953  *      changes.  Some folds in EXACTF depend on if the runtime target string
3954  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
3955  *      when no fold in it depends on the UTF-8ness of the target string.)
3956  *
3957  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3958  *      validity of the fold won't be known until runtime, and so must remain
3959  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
3960  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3961  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3962  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3963  *      The reason this is a problem is that the optimizer part of regexec.c
3964  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3965  *      that a character in the pattern corresponds to at most a single
3966  *      character in the target string.  (And I do mean character, and not byte
3967  *      here, unlike other parts of the documentation that have never been
3968  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
3969  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
3970  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
3971  *      EXACTFL nodes, violate the assumption, and they are the only instances
3972  *      where it is violated.  I'm reluctant to try to change the assumption,
3973  *      as the code involved is impenetrable to me (khw), so instead the code
3974  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
3975  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
3976  *      boolean indicating whether or not the node contains such a fold.  When
3977  *      it is true, the caller sets a flag that later causes the optimizer in
3978  *      this file to not set values for the floating and fixed string lengths,
3979  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3980  *      assumption.  Thus, there is no optimization based on string lengths for
3981  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3982  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
3983  *      assumption is wrong only in these cases is that all other non-UTF-8
3984  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3985  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3986  *      EXACTF nodes because we don't know at compile time if it actually
3987  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3988  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3989  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
3990  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3991  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3992  *      string would require the pattern to be forced into UTF-8, the overhead
3993  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3994  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3995  *      locale.)
3996  *
3997  *      Similarly, the code that generates tries doesn't currently handle
3998  *      not-already-folded multi-char folds, and it looks like a pain to change
3999  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
4000  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
4001  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
4002  *      using /iaa matching will be doing so almost entirely with ASCII
4003  *      strings, so this should rarely be encountered in practice */
4004 
4005 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
4006     if (PL_regkind[OP(scan)] == EXACT) \
4007         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
4008 
4009 STATIC U32
S_join_exact(pTHX_ RExC_state_t * pRExC_state,regnode * scan,UV * min_subtract,bool * unfolded_multi_char,U32 flags,regnode * val,U32 depth)4010 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4011                    UV *min_subtract, bool *unfolded_multi_char,
4012                    U32 flags, regnode *val, U32 depth)
4013 {
4014     /* Merge several consecutive EXACTish nodes into one. */
4015 
4016     regnode *n = regnext(scan);
4017     U32 stringok = 1;
4018     regnode *next = scan + NODE_SZ_STR(scan);
4019     U32 merged = 0;
4020     U32 stopnow = 0;
4021 #ifdef DEBUGGING
4022     regnode *stop = scan;
4023     GET_RE_DEBUG_FLAGS_DECL;
4024 #else
4025     PERL_UNUSED_ARG(depth);
4026 #endif
4027 
4028     PERL_ARGS_ASSERT_JOIN_EXACT;
4029 #ifndef EXPERIMENTAL_INPLACESCAN
4030     PERL_UNUSED_ARG(flags);
4031     PERL_UNUSED_ARG(val);
4032 #endif
4033     DEBUG_PEEP("join", scan, depth, 0);
4034 
4035     assert(PL_regkind[OP(scan)] == EXACT);
4036 
4037     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
4038      * EXACT ones that are mergeable to the current one. */
4039     while (    n
4040            && (    PL_regkind[OP(n)] == NOTHING
4041                || (stringok && PL_regkind[OP(n)] == EXACT))
4042            && NEXT_OFF(n)
4043            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4044     {
4045 
4046         if (OP(n) == TAIL || n > next)
4047             stringok = 0;
4048         if (PL_regkind[OP(n)] == NOTHING) {
4049             DEBUG_PEEP("skip:", n, depth, 0);
4050             NEXT_OFF(scan) += NEXT_OFF(n);
4051             next = n + NODE_STEP_REGNODE;
4052 #ifdef DEBUGGING
4053             if (stringok)
4054                 stop = n;
4055 #endif
4056             n = regnext(n);
4057         }
4058         else if (stringok) {
4059             const unsigned int oldl = STR_LEN(scan);
4060             regnode * const nnext = regnext(n);
4061 
4062             /* XXX I (khw) kind of doubt that this works on platforms (should
4063              * Perl ever run on one) where U8_MAX is above 255 because of lots
4064              * of other assumptions */
4065             /* Don't join if the sum can't fit into a single node */
4066             if (oldl + STR_LEN(n) > U8_MAX)
4067                 break;
4068 
4069             /* Joining something that requires UTF-8 with something that
4070              * doesn't, means the result requires UTF-8. */
4071             if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) {
4072                 OP(scan) = EXACT_ONLY8;
4073             }
4074             else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) {
4075                 ;   /* join is compatible, no need to change OP */
4076             }
4077             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) {
4078                 OP(scan) = EXACTFU_ONLY8;
4079             }
4080             else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) {
4081                 ;   /* join is compatible, no need to change OP */
4082             }
4083             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4084                 ;   /* join is compatible, no need to change OP */
4085             }
4086             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4087 
4088                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4089                   * which can join with EXACTFU ones.  We check for this case
4090                   * here.  These need to be resolved to either EXACTFU or
4091                   * EXACTF at joining time.  They have nothing in them that
4092                   * would forbid them from being the more desirable EXACTFU
4093                   * nodes except that they begin and/or end with a single [Ss].
4094                   * The reason this is problematic is because they could be
4095                   * joined in this loop with an adjacent node that ends and/or
4096                   * begins with [Ss] which would then form the sequence 'ss',
4097                   * which matches differently under /di than /ui, in which case
4098                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
4099                   * formed, the nodes get absorbed into any adjacent EXACTFU
4100                   * node.  And if the only adjacent node is EXACTF, they get
4101                   * absorbed into that, under the theory that a longer node is
4102                   * better than two shorter ones, even if one is EXACTFU.  Note
4103                   * that EXACTFU_ONLY8 is generated only for UTF-8 patterns,
4104                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
4105 
4106                 if (STRING(n)[STR_LEN(n)-1] == 's') {
4107 
4108                     /* Here the joined node would end with 's'.  If the node
4109                      * following the combination is an EXACTF one, it's better to
4110                      * join this trailing edge 's' node with that one, leaving the
4111                      * current one in 'scan' be the more desirable EXACTFU */
4112                     if (OP(nnext) == EXACTF) {
4113                         break;
4114                     }
4115 
4116                     OP(scan) = EXACTFU_S_EDGE;
4117 
4118                 }   /* Otherwise, the beginning 's' of the 2nd node just
4119                        becomes an interior 's' in 'scan' */
4120             }
4121             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4122                 ;   /* join is compatible, no need to change OP */
4123             }
4124             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4125 
4126                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4127                  * nodes.  But the latter nodes can be also joined with EXACTFU
4128                  * ones, and that is a better outcome, so if the node following
4129                  * 'n' is EXACTFU, quit now so that those two can be joined
4130                  * later */
4131                 if (OP(nnext) == EXACTFU) {
4132                     break;
4133                 }
4134 
4135                 /* The join is compatible, and the combined node will be
4136                  * EXACTF.  (These don't care if they begin or end with 's' */
4137             }
4138             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4139                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
4140                     && STRING(n)[0] == 's')
4141                 {
4142                     /* When combined, we have the sequence 'ss', which means we
4143                      * have to remain /di */
4144                     OP(scan) = EXACTF;
4145                 }
4146             }
4147             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4148                 if (STRING(n)[0] == 's') {
4149                     ;   /* Here the join is compatible and the combined node
4150                            starts with 's', no need to change OP */
4151                 }
4152                 else {  /* Now the trailing 's' is in the interior */
4153                     OP(scan) = EXACTFU;
4154                 }
4155             }
4156             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4157 
4158                 /* The join is compatible, and the combined node will be
4159                  * EXACTF.  (These don't care if they begin or end with 's' */
4160                 OP(scan) = EXACTF;
4161             }
4162             else if (OP(scan) != OP(n)) {
4163 
4164                 /* The only other compatible joinings are the same node type */
4165                 break;
4166             }
4167 
4168             DEBUG_PEEP("merg", n, depth, 0);
4169             merged++;
4170 
4171             NEXT_OFF(scan) += NEXT_OFF(n);
4172             STR_LEN(scan) += STR_LEN(n);
4173             next = n + NODE_SZ_STR(n);
4174             /* Now we can overwrite *n : */
4175             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4176 #ifdef DEBUGGING
4177             stop = next - 1;
4178 #endif
4179             n = nnext;
4180             if (stopnow) break;
4181         }
4182 
4183 #ifdef EXPERIMENTAL_INPLACESCAN
4184 	if (flags && !NEXT_OFF(n)) {
4185 	    DEBUG_PEEP("atch", val, depth, 0);
4186 	    if (reg_off_by_arg[OP(n)]) {
4187 		ARG_SET(n, val - n);
4188 	    }
4189 	    else {
4190 		NEXT_OFF(n) = val - n;
4191 	    }
4192 	    stopnow = 1;
4193 	}
4194 #endif
4195     }
4196 
4197     /* This temporary node can now be turned into EXACTFU, and must, as
4198      * regexec.c doesn't handle it */
4199     if (OP(scan) == EXACTFU_S_EDGE) {
4200         OP(scan) = EXACTFU;
4201     }
4202 
4203     *min_subtract = 0;
4204     *unfolded_multi_char = FALSE;
4205 
4206     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
4207      * can now analyze for sequences of problematic code points.  (Prior to
4208      * this final joining, sequences could have been split over boundaries, and
4209      * hence missed).  The sequences only happen in folding, hence for any
4210      * non-EXACT EXACTish node */
4211     if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
4212         U8* s0 = (U8*) STRING(scan);
4213         U8* s = s0;
4214         U8* s_end = s0 + STR_LEN(scan);
4215 
4216         int total_count_delta = 0;  /* Total delta number of characters that
4217                                        multi-char folds expand to */
4218 
4219 	/* One pass is made over the node's string looking for all the
4220 	 * possibilities.  To avoid some tests in the loop, there are two main
4221 	 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4222 	 * non-UTF-8 */
4223 	if (UTF) {
4224             U8* folded = NULL;
4225 
4226             if (OP(scan) == EXACTFL) {
4227                 U8 *d;
4228 
4229                 /* An EXACTFL node would already have been changed to another
4230                  * node type unless there is at least one character in it that
4231                  * is problematic; likely a character whose fold definition
4232                  * won't be known until runtime, and so has yet to be folded.
4233                  * For all but the UTF-8 locale, folds are 1-1 in length, but
4234                  * to handle the UTF-8 case, we need to create a temporary
4235                  * folded copy using UTF-8 locale rules in order to analyze it.
4236                  * This is because our macros that look to see if a sequence is
4237                  * a multi-char fold assume everything is folded (otherwise the
4238                  * tests in those macros would be too complicated and slow).
4239                  * Note that here, the non-problematic folds will have already
4240                  * been done, so we can just copy such characters.  We actually
4241                  * don't completely fold the EXACTFL string.  We skip the
4242                  * unfolded multi-char folds, as that would just create work
4243                  * below to figure out the size they already are */
4244 
4245                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4246                 d = folded;
4247                 while (s < s_end) {
4248                     STRLEN s_len = UTF8SKIP(s);
4249                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4250                         Copy(s, d, s_len, U8);
4251                         d += s_len;
4252                     }
4253                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
4254                         *unfolded_multi_char = TRUE;
4255                         Copy(s, d, s_len, U8);
4256                         d += s_len;
4257                     }
4258                     else if (isASCII(*s)) {
4259                         *(d++) = toFOLD(*s);
4260                     }
4261                     else {
4262                         STRLEN len;
4263                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4264                         d += len;
4265                     }
4266                     s += s_len;
4267                 }
4268 
4269                 /* Point the remainder of the routine to look at our temporary
4270                  * folded copy */
4271                 s = folded;
4272                 s_end = d;
4273             } /* End of creating folded copy of EXACTFL string */
4274 
4275             /* Examine the string for a multi-character fold sequence.  UTF-8
4276              * patterns have all characters pre-folded by the time this code is
4277              * executed */
4278             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4279                                      length sequence we are looking for is 2 */
4280 	    {
4281                 int count = 0;  /* How many characters in a multi-char fold */
4282                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4283                 if (! len) {    /* Not a multi-char fold: get next char */
4284                     s += UTF8SKIP(s);
4285                     continue;
4286                 }
4287 
4288                 { /* Here is a generic multi-char fold. */
4289                     U8* multi_end  = s + len;
4290 
4291                     /* Count how many characters are in it.  In the case of
4292                      * /aa, no folds which contain ASCII code points are
4293                      * allowed, so check for those, and skip if found. */
4294                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4295                         count = utf8_length(s, multi_end);
4296                         s = multi_end;
4297                     }
4298                     else {
4299                         while (s < multi_end) {
4300                             if (isASCII(*s)) {
4301                                 s++;
4302                                 goto next_iteration;
4303                             }
4304                             else {
4305                                 s += UTF8SKIP(s);
4306                             }
4307                             count++;
4308                         }
4309                     }
4310                 }
4311 
4312                 /* The delta is how long the sequence is minus 1 (1 is how long
4313                  * the character that folds to the sequence is) */
4314                 total_count_delta += count - 1;
4315               next_iteration: ;
4316 	    }
4317 
4318             /* We created a temporary folded copy of the string in EXACTFL
4319              * nodes.  Therefore we need to be sure it doesn't go below zero,
4320              * as the real string could be shorter */
4321             if (OP(scan) == EXACTFL) {
4322                 int total_chars = utf8_length((U8*) STRING(scan),
4323                                            (U8*) STRING(scan) + STR_LEN(scan));
4324                 if (total_count_delta > total_chars) {
4325                     total_count_delta = total_chars;
4326                 }
4327             }
4328 
4329             *min_subtract += total_count_delta;
4330             Safefree(folded);
4331 	}
4332 	else if (OP(scan) == EXACTFAA) {
4333 
4334             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
4335              * fold to the ASCII range (and there are no existing ones in the
4336              * upper latin1 range).  But, as outlined in the comments preceding
4337              * this function, we need to flag any occurrences of the sharp s.
4338              * This character forbids trie formation (because of added
4339              * complexity) */
4340 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4341    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4342                                       || UNICODE_DOT_DOT_VERSION > 0)
4343 	    while (s < s_end) {
4344                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4345                     OP(scan) = EXACTFAA_NO_TRIE;
4346                     *unfolded_multi_char = TRUE;
4347                     break;
4348                 }
4349                 s++;
4350             }
4351         }
4352 	else {
4353 
4354             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
4355              * folds that are all Latin1.  As explained in the comments
4356              * preceding this function, we look also for the sharp s in EXACTF
4357              * and EXACTFL nodes; it can be in the final position.  Otherwise
4358              * we can stop looking 1 byte earlier because have to find at least
4359              * two characters for a multi-fold */
4360 	    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4361                               ? s_end
4362                               : s_end -1;
4363 
4364 	    while (s < upper) {
4365                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4366                 if (! len) {    /* Not a multi-char fold. */
4367                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4368                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4369                     {
4370                         *unfolded_multi_char = TRUE;
4371                     }
4372                     s++;
4373                     continue;
4374                 }
4375 
4376                 if (len == 2
4377                     && isALPHA_FOLD_EQ(*s, 's')
4378                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4379                 {
4380 
4381                     /* EXACTF nodes need to know that the minimum length
4382                      * changed so that a sharp s in the string can match this
4383                      * ss in the pattern, but they remain EXACTF nodes, as they
4384                      * won't match this unless the target string is is UTF-8,
4385                      * which we don't know until runtime.  EXACTFL nodes can't
4386                      * transform into EXACTFU nodes */
4387                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4388                         OP(scan) = EXACTFUP;
4389                     }
4390 		}
4391 
4392                 *min_subtract += len - 1;
4393                 s += len;
4394 	    }
4395 #endif
4396 	}
4397 
4398         if (     STR_LEN(scan) == 1
4399             &&   isALPHA_A(* STRING(scan))
4400             &&  (         OP(scan) == EXACTFAA
4401                  || (     OP(scan) == EXACTFU
4402                      && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
4403         {
4404             U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
4405 
4406             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
4407              * with the mask set to the complement of the bit that differs
4408              * between upper and lower case, and the lowest code point of the
4409              * pair (which the '&' forces) */
4410             OP(scan) = ANYOFM;
4411             ARG_SET(scan, *STRING(scan) & mask);
4412             FLAGS(scan) = mask;
4413         }
4414     }
4415 
4416 #ifdef DEBUGGING
4417     /* Allow dumping but overwriting the collection of skipped
4418      * ops and/or strings with fake optimized ops */
4419     n = scan + NODE_SZ_STR(scan);
4420     while (n <= stop) {
4421 	OP(n) = OPTIMIZED;
4422 	FLAGS(n) = 0;
4423 	NEXT_OFF(n) = 0;
4424         n++;
4425     }
4426 #endif
4427     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4428     return stopnow;
4429 }
4430 
4431 /* REx optimizer.  Converts nodes into quicker variants "in place".
4432    Finds fixed substrings.  */
4433 
4434 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4435    to the position after last scanned or to NULL. */
4436 
4437 #define INIT_AND_WITHP \
4438     assert(!and_withp); \
4439     Newx(and_withp, 1, regnode_ssc); \
4440     SAVEFREEPV(and_withp)
4441 
4442 
4443 static void
S_unwind_scan_frames(pTHX_ const void * p)4444 S_unwind_scan_frames(pTHX_ const void *p)
4445 {
4446     scan_frame *f= (scan_frame *)p;
4447     do {
4448         scan_frame *n= f->next_frame;
4449         Safefree(f);
4450         f= n;
4451     } while (f);
4452 }
4453 
4454 /* Follow the next-chain of the current node and optimize away
4455    all the NOTHINGs from it.
4456  */
4457 STATIC void
S_rck_elide_nothing(pTHX_ regnode * node)4458 S_rck_elide_nothing(pTHX_ regnode *node)
4459 {
4460     dVAR;
4461 
4462     PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
4463 
4464     if (OP(node) != CURLYX) {
4465         const int max = (reg_off_by_arg[OP(node)]
4466                         ? I32_MAX
4467                           /* I32 may be smaller than U16 on CRAYs! */
4468                         : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4469         int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
4470         int noff;
4471         regnode *n = node;
4472 
4473         /* Skip NOTHING and LONGJMP. */
4474         while (
4475             (n = regnext(n))
4476             && (
4477                 (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4478                 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
4479             )
4480             && off + noff < max
4481         ) {
4482             off += noff;
4483         }
4484         if (reg_off_by_arg[OP(node)])
4485             ARG(node) = off;
4486         else
4487             NEXT_OFF(node) = off;
4488     }
4489     return;
4490 }
4491 
4492 /* the return from this sub is the minimum length that could possibly match */
4493 STATIC SSize_t
S_study_chunk(pTHX_ RExC_state_t * pRExC_state,regnode ** scanp,SSize_t * minlenp,SSize_t * deltap,regnode * last,scan_data_t * data,I32 stopparen,U32 recursed_depth,regnode_ssc * and_withp,U32 flags,U32 depth,bool was_mutate_ok)4494 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4495                         SSize_t *minlenp, SSize_t *deltap,
4496 			regnode *last,
4497 			scan_data_t *data,
4498 			I32 stopparen,
4499                         U32 recursed_depth,
4500 			regnode_ssc *and_withp,
4501 			U32 flags, U32 depth, bool was_mutate_ok)
4502 			/* scanp: Start here (read-write). */
4503 			/* deltap: Write maxlen-minlen here. */
4504 			/* last: Stop before this one. */
4505 			/* data: string data about the pattern */
4506 			/* stopparen: treat close N as END */
4507 			/* recursed: which subroutines have we recursed into */
4508 			/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4509 {
4510     dVAR;
4511     /* There must be at least this number of characters to match */
4512     SSize_t min = 0;
4513     I32 pars = 0, code;
4514     regnode *scan = *scanp, *next;
4515     SSize_t delta = 0;
4516     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4517     int is_inf_internal = 0;		/* The studied chunk is infinite */
4518     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4519     scan_data_t data_fake;
4520     SV *re_trie_maxbuff = NULL;
4521     regnode *first_non_open = scan;
4522     SSize_t stopmin = SSize_t_MAX;
4523     scan_frame *frame = NULL;
4524     GET_RE_DEBUG_FLAGS_DECL;
4525 
4526     PERL_ARGS_ASSERT_STUDY_CHUNK;
4527     RExC_study_started= 1;
4528 
4529     Zero(&data_fake, 1, scan_data_t);
4530 
4531     if ( depth == 0 ) {
4532         while (first_non_open && OP(first_non_open) == OPEN)
4533             first_non_open=regnext(first_non_open);
4534     }
4535 
4536 
4537   fake_study_recurse:
4538     DEBUG_r(
4539         RExC_study_chunk_recursed_count++;
4540     );
4541     DEBUG_OPTIMISE_MORE_r(
4542     {
4543         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4544             depth, (long)stopparen,
4545             (unsigned long)RExC_study_chunk_recursed_count,
4546             (unsigned long)depth, (unsigned long)recursed_depth,
4547             scan,
4548             last);
4549         if (recursed_depth) {
4550             U32 i;
4551             U32 j;
4552             for ( j = 0 ; j < recursed_depth ; j++ ) {
4553                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4554                     if (
4555                         PAREN_TEST(RExC_study_chunk_recursed +
4556                                    ( j * RExC_study_chunk_recursed_bytes), i )
4557                         && (
4558                             !j ||
4559                             !PAREN_TEST(RExC_study_chunk_recursed +
4560                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4561                         )
4562                     ) {
4563                         Perl_re_printf( aTHX_ " %d",(int)i);
4564                         break;
4565                     }
4566                 }
4567                 if ( j + 1 < recursed_depth ) {
4568                     Perl_re_printf( aTHX_  ",");
4569                 }
4570             }
4571         }
4572         Perl_re_printf( aTHX_ "\n");
4573     }
4574     );
4575     while ( scan && OP(scan) != END && scan < last ){
4576         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4577                                    node length to get a real minimum (because
4578                                    the folded version may be shorter) */
4579 	bool unfolded_multi_char = FALSE;
4580         /* avoid mutating ops if we are anywhere within the recursed or
4581          * enframed handling for a GOSUB: the outermost level will handle it.
4582          */
4583         bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
4584 	/* Peephole optimizer: */
4585         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4586         DEBUG_PEEP("Peep", scan, depth, flags);
4587 
4588 
4589         /* The reason we do this here is that we need to deal with things like
4590          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4591          * parsing code, as each (?:..) is handled by a different invocation of
4592          * reg() -- Yves
4593          */
4594         if (mutate_ok)
4595             JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4596 
4597         /* Follow the next-chain of the current node and optimize
4598            away all the NOTHINGs from it.
4599          */
4600         rck_elide_nothing(scan);
4601 
4602 	/* The principal pseudo-switch.  Cannot be a switch, since we
4603 	   look into several different things.  */
4604         if ( OP(scan) == DEFINEP ) {
4605             SSize_t minlen = 0;
4606             SSize_t deltanext = 0;
4607             SSize_t fake_last_close = 0;
4608             I32 f = SCF_IN_DEFINE;
4609 
4610             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4611             scan = regnext(scan);
4612             assert( OP(scan) == IFTHEN );
4613             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4614 
4615             data_fake.last_closep= &fake_last_close;
4616             minlen = *minlenp;
4617             next = regnext(scan);
4618             scan = NEXTOPER(NEXTOPER(scan));
4619             DEBUG_PEEP("scan", scan, depth, flags);
4620             DEBUG_PEEP("next", next, depth, flags);
4621 
4622             /* we suppose the run is continuous, last=next...
4623              * NOTE we dont use the return here! */
4624             /* DEFINEP study_chunk() recursion */
4625             (void)study_chunk(pRExC_state, &scan, &minlen,
4626                               &deltanext, next, &data_fake, stopparen,
4627                               recursed_depth, NULL, f, depth+1, mutate_ok);
4628 
4629             scan = next;
4630         } else
4631         if (
4632             OP(scan) == BRANCH  ||
4633             OP(scan) == BRANCHJ ||
4634             OP(scan) == IFTHEN
4635         ) {
4636 	    next = regnext(scan);
4637 	    code = OP(scan);
4638 
4639             /* The op(next)==code check below is to see if we
4640              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4641              * IFTHEN is special as it might not appear in pairs.
4642              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4643              * we dont handle it cleanly. */
4644 	    if (OP(next) == code || code == IFTHEN) {
4645                 /* NOTE - There is similar code to this block below for
4646                  * handling TRIE nodes on a re-study.  If you change stuff here
4647                  * check there too. */
4648 		SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4649 		regnode_ssc accum;
4650 		regnode * const startbranch=scan;
4651 
4652                 if (flags & SCF_DO_SUBSTR) {
4653                     /* Cannot merge strings after this. */
4654                     scan_commit(pRExC_state, data, minlenp, is_inf);
4655                 }
4656 
4657                 if (flags & SCF_DO_STCLASS)
4658 		    ssc_init_zero(pRExC_state, &accum);
4659 
4660 		while (OP(scan) == code) {
4661 		    SSize_t deltanext, minnext, fake;
4662 		    I32 f = 0;
4663 		    regnode_ssc this_class;
4664 
4665                     DEBUG_PEEP("Branch", scan, depth, flags);
4666 
4667 		    num++;
4668                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4669 		    if (data) {
4670 			data_fake.whilem_c = data->whilem_c;
4671 			data_fake.last_closep = data->last_closep;
4672 		    }
4673 		    else
4674 			data_fake.last_closep = &fake;
4675 
4676 		    data_fake.pos_delta = delta;
4677 		    next = regnext(scan);
4678 
4679                     scan = NEXTOPER(scan); /* everything */
4680                     if (code != BRANCH)    /* everything but BRANCH */
4681 			scan = NEXTOPER(scan);
4682 
4683 		    if (flags & SCF_DO_STCLASS) {
4684 			ssc_init(pRExC_state, &this_class);
4685 			data_fake.start_class = &this_class;
4686 			f = SCF_DO_STCLASS_AND;
4687 		    }
4688 		    if (flags & SCF_WHILEM_VISITED_POS)
4689 			f |= SCF_WHILEM_VISITED_POS;
4690 
4691 		    /* we suppose the run is continuous, last=next...*/
4692                     /* recurse study_chunk() for each BRANCH in an alternation */
4693 		    minnext = study_chunk(pRExC_state, &scan, minlenp,
4694                                       &deltanext, next, &data_fake, stopparen,
4695                                       recursed_depth, NULL, f, depth+1,
4696                                       mutate_ok);
4697 
4698 		    if (min1 > minnext)
4699 			min1 = minnext;
4700 		    if (deltanext == SSize_t_MAX) {
4701 			is_inf = is_inf_internal = 1;
4702 			max1 = SSize_t_MAX;
4703 		    } else if (max1 < minnext + deltanext)
4704 			max1 = minnext + deltanext;
4705 		    scan = next;
4706 		    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4707 			pars++;
4708 	            if (data_fake.flags & SCF_SEEN_ACCEPT) {
4709 	                if ( stopmin > minnext)
4710 	                    stopmin = min + min1;
4711 	                flags &= ~SCF_DO_SUBSTR;
4712 	                if (data)
4713 	                    data->flags |= SCF_SEEN_ACCEPT;
4714 	            }
4715 		    if (data) {
4716 			if (data_fake.flags & SF_HAS_EVAL)
4717 			    data->flags |= SF_HAS_EVAL;
4718 			data->whilem_c = data_fake.whilem_c;
4719 		    }
4720 		    if (flags & SCF_DO_STCLASS)
4721 			ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4722 		}
4723 		if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4724 		    min1 = 0;
4725 		if (flags & SCF_DO_SUBSTR) {
4726 		    data->pos_min += min1;
4727 		    if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4728 		        data->pos_delta = SSize_t_MAX;
4729 		    else
4730 		        data->pos_delta += max1 - min1;
4731 		    if (max1 != min1 || is_inf)
4732 			data->cur_is_floating = 1;
4733 		}
4734 		min += min1;
4735 		if (delta == SSize_t_MAX
4736 		 || SSize_t_MAX - delta - (max1 - min1) < 0)
4737 		    delta = SSize_t_MAX;
4738 		else
4739 		    delta += max1 - min1;
4740 		if (flags & SCF_DO_STCLASS_OR) {
4741 		    ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4742 		    if (min1) {
4743 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4744 			flags &= ~SCF_DO_STCLASS;
4745 		    }
4746 		}
4747 		else if (flags & SCF_DO_STCLASS_AND) {
4748 		    if (min1) {
4749 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4750 			flags &= ~SCF_DO_STCLASS;
4751 		    }
4752 		    else {
4753 			/* Switch to OR mode: cache the old value of
4754 			 * data->start_class */
4755 			INIT_AND_WITHP;
4756 			StructCopy(data->start_class, and_withp, regnode_ssc);
4757 			flags &= ~SCF_DO_STCLASS_AND;
4758 			StructCopy(&accum, data->start_class, regnode_ssc);
4759 			flags |= SCF_DO_STCLASS_OR;
4760 		    }
4761 		}
4762 
4763                 if (PERL_ENABLE_TRIE_OPTIMISATION
4764                     && OP(startbranch) == BRANCH
4765                     && mutate_ok
4766                 ) {
4767 		/* demq.
4768 
4769                    Assuming this was/is a branch we are dealing with: 'scan'
4770                    now points at the item that follows the branch sequence,
4771                    whatever it is. We now start at the beginning of the
4772                    sequence and look for subsequences of
4773 
4774 		   BRANCH->EXACT=>x1
4775 		   BRANCH->EXACT=>x2
4776 		   tail
4777 
4778                    which would be constructed from a pattern like
4779                    /A|LIST|OF|WORDS/
4780 
4781 		   If we can find such a subsequence we need to turn the first
4782 		   element into a trie and then add the subsequent branch exact
4783 		   strings to the trie.
4784 
4785 		   We have two cases
4786 
4787                      1. patterns where the whole set of branches can be
4788                         converted.
4789 
4790 		     2. patterns where only a subset can be converted.
4791 
4792 		   In case 1 we can replace the whole set with a single regop
4793 		   for the trie. In case 2 we need to keep the start and end
4794 		   branches so
4795 
4796 		     'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4797 		     becomes BRANCH TRIE; BRANCH X;
4798 
4799 		  There is an additional case, that being where there is a
4800 		  common prefix, which gets split out into an EXACT like node
4801 		  preceding the TRIE node.
4802 
4803 		  If x(1..n)==tail then we can do a simple trie, if not we make
4804 		  a "jump" trie, such that when we match the appropriate word
4805 		  we "jump" to the appropriate tail node. Essentially we turn
4806 		  a nested if into a case structure of sorts.
4807 
4808 		*/
4809 
4810 		    int made=0;
4811 		    if (!re_trie_maxbuff) {
4812 			re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4813 			if (!SvIOK(re_trie_maxbuff))
4814 			    sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4815 		    }
4816                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4817                         regnode *cur;
4818                         regnode *first = (regnode *)NULL;
4819                         regnode *last = (regnode *)NULL;
4820                         regnode *tail = scan;
4821                         U8 trietype = 0;
4822                         U32 count=0;
4823 
4824                         /* var tail is used because there may be a TAIL
4825                            regop in the way. Ie, the exacts will point to the
4826                            thing following the TAIL, but the last branch will
4827                            point at the TAIL. So we advance tail. If we
4828                            have nested (?:) we may have to move through several
4829                            tails.
4830                          */
4831 
4832                         while ( OP( tail ) == TAIL ) {
4833                             /* this is the TAIL generated by (?:) */
4834                             tail = regnext( tail );
4835                         }
4836 
4837 
4838                         DEBUG_TRIE_COMPILE_r({
4839                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4840                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4841                               depth+1,
4842                               "Looking for TRIE'able sequences. Tail node is ",
4843                               (UV) REGNODE_OFFSET(tail),
4844                               SvPV_nolen_const( RExC_mysv )
4845                             );
4846                         });
4847 
4848                         /*
4849 
4850                             Step through the branches
4851                                 cur represents each branch,
4852                                 noper is the first thing to be matched as part
4853                                       of that branch
4854                                 noper_next is the regnext() of that node.
4855 
4856                             We normally handle a case like this
4857                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4858                             support building with NOJUMPTRIE, which restricts
4859                             the trie logic to structures like /FOO|BAR/.
4860 
4861                             If noper is a trieable nodetype then the branch is
4862                             a possible optimization target. If we are building
4863                             under NOJUMPTRIE then we require that noper_next is
4864                             the same as scan (our current position in the regex
4865                             program).
4866 
4867                             Once we have two or more consecutive such branches
4868                             we can create a trie of the EXACT's contents and
4869                             stitch it in place into the program.
4870 
4871                             If the sequence represents all of the branches in
4872                             the alternation we replace the entire thing with a
4873                             single TRIE node.
4874 
4875                             Otherwise when it is a subsequence we need to
4876                             stitch it in place and replace only the relevant
4877                             branches. This means the first branch has to remain
4878                             as it is used by the alternation logic, and its
4879                             next pointer, and needs to be repointed at the item
4880                             on the branch chain following the last branch we
4881                             have optimized away.
4882 
4883                             This could be either a BRANCH, in which case the
4884                             subsequence is internal, or it could be the item
4885                             following the branch sequence in which case the
4886                             subsequence is at the end (which does not
4887                             necessarily mean the first node is the start of the
4888                             alternation).
4889 
4890                             TRIE_TYPE(X) is a define which maps the optype to a
4891                             trietype.
4892 
4893                                 optype          |  trietype
4894                                 ----------------+-----------
4895                                 NOTHING         | NOTHING
4896                                 EXACT           | EXACT
4897                                 EXACT_ONLY8     | EXACT
4898                                 EXACTFU         | EXACTFU
4899                                 EXACTFU_ONLY8   | EXACTFU
4900                                 EXACTFUP        | EXACTFU
4901                                 EXACTFAA        | EXACTFAA
4902                                 EXACTL          | EXACTL
4903                                 EXACTFLU8       | EXACTFLU8
4904 
4905 
4906                         */
4907 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4908                        ? NOTHING                                            \
4909                        : ( EXACT == (X) || EXACT_ONLY8 == (X) )             \
4910                          ? EXACT                                            \
4911                          : (     EXACTFU == (X)                             \
4912                               || EXACTFU_ONLY8 == (X)                       \
4913                               || EXACTFUP == (X) )                          \
4914                            ? EXACTFU                                        \
4915                            : ( EXACTFAA == (X) )                            \
4916                              ? EXACTFAA                                     \
4917                              : ( EXACTL == (X) )                            \
4918                                ? EXACTL                                     \
4919                                : ( EXACTFLU8 == (X) )                       \
4920                                  ? EXACTFLU8                                \
4921                                  : 0 )
4922 
4923                         /* dont use tail as the end marker for this traverse */
4924                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4925                             regnode * const noper = NEXTOPER( cur );
4926                             U8 noper_type = OP( noper );
4927                             U8 noper_trietype = TRIE_TYPE( noper_type );
4928 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4929                             regnode * const noper_next = regnext( noper );
4930                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4931                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4932 #endif
4933 
4934                             DEBUG_TRIE_COMPILE_r({
4935                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4936                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
4937                                    depth+1,
4938                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4939 
4940                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4941                                 Perl_re_printf( aTHX_  " -> %d:%s",
4942                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4943 
4944                                 if ( noper_next ) {
4945                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4946                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4947                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4948                                 }
4949                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4950                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4951 				   PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4952 				);
4953                             });
4954 
4955                             /* Is noper a trieable nodetype that can be merged
4956                              * with the current trie (if there is one)? */
4957                             if ( noper_trietype
4958                                   &&
4959                                   (
4960                                         ( noper_trietype == NOTHING )
4961                                         || ( trietype == NOTHING )
4962                                         || ( trietype == noper_trietype )
4963                                   )
4964 #ifdef NOJUMPTRIE
4965                                   && noper_next >= tail
4966 #endif
4967                                   && count < U16_MAX)
4968                             {
4969                                 /* Handle mergable triable node Either we are
4970                                  * the first node in a new trieable sequence,
4971                                  * in which case we do some bookkeeping,
4972                                  * otherwise we update the end pointer. */
4973                                 if ( !first ) {
4974                                     first = cur;
4975 				    if ( noper_trietype == NOTHING ) {
4976 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4977 					regnode * const noper_next = regnext( noper );
4978                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4979 					U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4980 #endif
4981 
4982                                         if ( noper_next_trietype ) {
4983 					    trietype = noper_next_trietype;
4984                                         } else if (noper_next_type)  {
4985                                             /* a NOTHING regop is 1 regop wide.
4986                                              * We need at least two for a trie
4987                                              * so we can't merge this in */
4988                                             first = NULL;
4989                                         }
4990                                     } else {
4991                                         trietype = noper_trietype;
4992                                     }
4993                                 } else {
4994                                     if ( trietype == NOTHING )
4995                                         trietype = noper_trietype;
4996                                     last = cur;
4997                                 }
4998 				if (first)
4999 				    count++;
5000                             } /* end handle mergable triable node */
5001                             else {
5002                                 /* handle unmergable node -
5003                                  * noper may either be a triable node which can
5004                                  * not be tried together with the current trie,
5005                                  * or a non triable node */
5006                                 if ( last ) {
5007                                     /* If last is set and trietype is not
5008                                      * NOTHING then we have found at least two
5009                                      * triable branch sequences in a row of a
5010                                      * similar trietype so we can turn them
5011                                      * into a trie. If/when we allow NOTHING to
5012                                      * start a trie sequence this condition
5013                                      * will be required, and it isn't expensive
5014                                      * so we leave it in for now. */
5015                                     if ( trietype && trietype != NOTHING )
5016                                         make_trie( pRExC_state,
5017                                                 startbranch, first, cur, tail,
5018                                                 count, trietype, depth+1 );
5019                                     last = NULL; /* note: we clear/update
5020                                                     first, trietype etc below,
5021                                                     so we dont do it here */
5022                                 }
5023                                 if ( noper_trietype
5024 #ifdef NOJUMPTRIE
5025                                      && noper_next >= tail
5026 #endif
5027                                 ){
5028                                     /* noper is triable, so we can start a new
5029                                      * trie sequence */
5030                                     count = 1;
5031                                     first = cur;
5032                                     trietype = noper_trietype;
5033                                 } else if (first) {
5034                                     /* if we already saw a first but the
5035                                      * current node is not triable then we have
5036                                      * to reset the first information. */
5037                                     count = 0;
5038                                     first = NULL;
5039                                     trietype = 0;
5040                                 }
5041                             } /* end handle unmergable node */
5042                         } /* loop over branches */
5043                         DEBUG_TRIE_COMPILE_r({
5044                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5045                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
5046                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5047                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5048                                REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
5049                                PL_reg_name[trietype]
5050                             );
5051 
5052                         });
5053                         if ( last && trietype ) {
5054                             if ( trietype != NOTHING ) {
5055                                 /* the last branch of the sequence was part of
5056                                  * a trie, so we have to construct it here
5057                                  * outside of the loop */
5058                                 made= make_trie( pRExC_state, startbranch,
5059                                                  first, scan, tail, count,
5060                                                  trietype, depth+1 );
5061 #ifdef TRIE_STUDY_OPT
5062                                 if ( ((made == MADE_EXACT_TRIE &&
5063                                      startbranch == first)
5064                                      || ( first_non_open == first )) &&
5065                                      depth==0 ) {
5066                                     flags |= SCF_TRIE_RESTUDY;
5067                                     if ( startbranch == first
5068                                          && scan >= tail )
5069                                     {
5070                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5071                                     }
5072                                 }
5073 #endif
5074                             } else {
5075                                 /* at this point we know whatever we have is a
5076                                  * NOTHING sequence/branch AND if 'startbranch'
5077                                  * is 'first' then we can turn the whole thing
5078                                  * into a NOTHING
5079                                  */
5080                                 if ( startbranch == first ) {
5081                                     regnode *opt;
5082                                     /* the entire thing is a NOTHING sequence,
5083                                      * something like this: (?:|) So we can
5084                                      * turn it into a plain NOTHING op. */
5085                                     DEBUG_TRIE_COMPILE_r({
5086                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5087                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5088                                           depth+1,
5089                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5090 
5091                                     });
5092                                     OP(startbranch)= NOTHING;
5093                                     NEXT_OFF(startbranch)= tail - startbranch;
5094                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
5095                                         OP(opt)= OPTIMIZED;
5096                                 }
5097                             }
5098                         } /* end if ( last) */
5099                     } /* TRIE_MAXBUF is non zero */
5100 
5101                 } /* do trie */
5102 
5103 	    }
5104 	    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
5105 		scan = NEXTOPER(NEXTOPER(scan));
5106 	    } else			/* single branch is optimized. */
5107 		scan = NEXTOPER(scan);
5108 	    continue;
5109         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5110             I32 paren = 0;
5111             regnode *start = NULL;
5112             regnode *end = NULL;
5113             U32 my_recursed_depth= recursed_depth;
5114 
5115             if (OP(scan) != SUSPEND) { /* GOSUB */
5116                 /* Do setup, note this code has side effects beyond
5117                  * the rest of this block. Specifically setting
5118                  * RExC_recurse[] must happen at least once during
5119                  * study_chunk(). */
5120                 paren = ARG(scan);
5121                 RExC_recurse[ARG2L(scan)] = scan;
5122                 start = REGNODE_p(RExC_open_parens[paren]);
5123                 end   = REGNODE_p(RExC_close_parens[paren]);
5124 
5125                 /* NOTE we MUST always execute the above code, even
5126                  * if we do nothing with a GOSUB */
5127                 if (
5128                     ( flags & SCF_IN_DEFINE )
5129                     ||
5130                     (
5131                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5132                         &&
5133                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5134                     )
5135                 ) {
5136                     /* no need to do anything here if we are in a define. */
5137                     /* or we are after some kind of infinite construct
5138                      * so we can skip recursing into this item.
5139                      * Since it is infinite we will not change the maxlen
5140                      * or delta, and if we miss something that might raise
5141                      * the minlen it will merely pessimise a little.
5142                      *
5143                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5144                      * might result in a minlen of 1 and not of 4,
5145                      * but this doesn't make us mismatch, just try a bit
5146                      * harder than we should.
5147                      * */
5148                     scan= regnext(scan);
5149                     continue;
5150                 }
5151 
5152                 if (
5153                     !recursed_depth
5154                     ||
5155                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
5156                 ) {
5157                     /* it is quite possible that there are more efficient ways
5158                      * to do this. We maintain a bitmap per level of recursion
5159                      * of which patterns we have entered so we can detect if a
5160                      * pattern creates a possible infinite loop. When we
5161                      * recurse down a level we copy the previous levels bitmap
5162                      * down. When we are at recursion level 0 we zero the top
5163                      * level bitmap. It would be nice to implement a different
5164                      * more efficient way of doing this. In particular the top
5165                      * level bitmap may be unnecessary.
5166                      */
5167                     if (!recursed_depth) {
5168                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5169                     } else {
5170                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
5171                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
5172                              RExC_study_chunk_recursed_bytes, U8);
5173                     }
5174                     /* we havent recursed into this paren yet, so recurse into it */
5175                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5176                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
5177                     my_recursed_depth= recursed_depth + 1;
5178                 } else {
5179                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5180                     /* some form of infinite recursion, assume infinite length
5181                      * */
5182                     if (flags & SCF_DO_SUBSTR) {
5183                         scan_commit(pRExC_state, data, minlenp, is_inf);
5184                         data->cur_is_floating = 1;
5185                     }
5186                     is_inf = is_inf_internal = 1;
5187                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5188                         ssc_anything(data->start_class);
5189                     flags &= ~SCF_DO_STCLASS;
5190 
5191                     start= NULL; /* reset start so we dont recurse later on. */
5192 	        }
5193             } else {
5194 	        paren = stopparen;
5195                 start = scan + 2;
5196 	        end = regnext(scan);
5197 	    }
5198             if (start) {
5199                 scan_frame *newframe;
5200                 assert(end);
5201                 if (!RExC_frame_last) {
5202                     Newxz(newframe, 1, scan_frame);
5203                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5204                     RExC_frame_head= newframe;
5205                     RExC_frame_count++;
5206                 } else if (!RExC_frame_last->next_frame) {
5207                     Newxz(newframe, 1, scan_frame);
5208                     RExC_frame_last->next_frame= newframe;
5209                     newframe->prev_frame= RExC_frame_last;
5210                     RExC_frame_count++;
5211                 } else {
5212                     newframe= RExC_frame_last->next_frame;
5213                 }
5214                 RExC_frame_last= newframe;
5215 
5216                 newframe->next_regnode = regnext(scan);
5217                 newframe->last_regnode = last;
5218                 newframe->stopparen = stopparen;
5219                 newframe->prev_recursed_depth = recursed_depth;
5220                 newframe->this_prev_frame= frame;
5221                 newframe->in_gosub = (
5222                     (frame && frame->in_gosub) || OP(scan) == GOSUB
5223                 );
5224 
5225                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5226                 DEBUG_PEEP("fnew", scan, depth, flags);
5227 
5228 	        frame = newframe;
5229 	        scan =  start;
5230 	        stopparen = paren;
5231 	        last = end;
5232                 depth = depth + 1;
5233                 recursed_depth= my_recursed_depth;
5234 
5235 	        continue;
5236 	    }
5237 	}
5238 	else if (   OP(scan) == EXACT
5239                  || OP(scan) == EXACT_ONLY8
5240                  || OP(scan) == EXACTL)
5241         {
5242 	    SSize_t l = STR_LEN(scan);
5243 	    UV uc;
5244             assert(l);
5245 	    if (UTF) {
5246 		const U8 * const s = (U8*)STRING(scan);
5247 		uc = utf8_to_uvchr_buf(s, s + l, NULL);
5248 		l = utf8_length(s, s + l);
5249 	    } else {
5250 		uc = *((U8*)STRING(scan));
5251 	    }
5252 	    min += l;
5253 	    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5254 		/* The code below prefers earlier match for fixed
5255 		   offset, later match for variable offset.  */
5256 		if (data->last_end == -1) { /* Update the start info. */
5257 		    data->last_start_min = data->pos_min;
5258  		    data->last_start_max = is_inf
5259  			? SSize_t_MAX : data->pos_min + data->pos_delta;
5260 		}
5261 		sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
5262 		if (UTF)
5263 		    SvUTF8_on(data->last_found);
5264 		{
5265 		    SV * const sv = data->last_found;
5266 		    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5267 			mg_find(sv, PERL_MAGIC_utf8) : NULL;
5268 		    if (mg && mg->mg_len >= 0)
5269 			mg->mg_len += utf8_length((U8*)STRING(scan),
5270                                               (U8*)STRING(scan)+STR_LEN(scan));
5271 		}
5272 		data->last_end = data->pos_min + l;
5273 		data->pos_min += l; /* As in the first entry. */
5274 		data->flags &= ~SF_BEFORE_EOL;
5275 	    }
5276 
5277             /* ANDing the code point leaves at most it, and not in locale, and
5278              * can't match null string */
5279 	    if (flags & SCF_DO_STCLASS_AND) {
5280                 ssc_cp_and(data->start_class, uc);
5281                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5282                 ssc_clear_locale(data->start_class);
5283 	    }
5284 	    else if (flags & SCF_DO_STCLASS_OR) {
5285                 ssc_add_cp(data->start_class, uc);
5286 		ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5287 
5288                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5289                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5290 	    }
5291 	    flags &= ~SCF_DO_STCLASS;
5292 	}
5293         else if (PL_regkind[OP(scan)] == EXACT) {
5294             /* But OP != EXACT!, so is EXACTFish */
5295 	    SSize_t l = STR_LEN(scan);
5296             const U8 * s = (U8*)STRING(scan);
5297 
5298 	    /* Search for fixed substrings supports EXACT only. */
5299 	    if (flags & SCF_DO_SUBSTR) {
5300 		assert(data);
5301                 scan_commit(pRExC_state, data, minlenp, is_inf);
5302 	    }
5303 	    if (UTF) {
5304 		l = utf8_length(s, s + l);
5305 	    }
5306 	    if (unfolded_multi_char) {
5307                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5308 	    }
5309 	    min += l - min_subtract;
5310             assert (min >= 0);
5311             delta += min_subtract;
5312 	    if (flags & SCF_DO_SUBSTR) {
5313 		data->pos_min += l - min_subtract;
5314 		if (data->pos_min < 0) {
5315                     data->pos_min = 0;
5316                 }
5317                 data->pos_delta += min_subtract;
5318 		if (min_subtract) {
5319 		    data->cur_is_floating = 1; /* float */
5320 		}
5321 	    }
5322 
5323             if (flags & SCF_DO_STCLASS) {
5324                 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
5325 
5326                 assert(EXACTF_invlist);
5327                 if (flags & SCF_DO_STCLASS_AND) {
5328                     if (OP(scan) != EXACTFL)
5329                         ssc_clear_locale(data->start_class);
5330                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5331                     ANYOF_POSIXL_ZERO(data->start_class);
5332                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5333                 }
5334                 else {  /* SCF_DO_STCLASS_OR */
5335                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
5336                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5337 
5338                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5339                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5340                 }
5341                 flags &= ~SCF_DO_STCLASS;
5342                 SvREFCNT_dec(EXACTF_invlist);
5343             }
5344 	}
5345 	else if (REGNODE_VARIES(OP(scan))) {
5346 	    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5347 	    I32 fl = 0, f = flags;
5348 	    regnode * const oscan = scan;
5349 	    regnode_ssc this_class;
5350 	    regnode_ssc *oclass = NULL;
5351 	    I32 next_is_eval = 0;
5352 
5353 	    switch (PL_regkind[OP(scan)]) {
5354 	    case WHILEM:		/* End of (?:...)* . */
5355 		scan = NEXTOPER(scan);
5356 		goto finish;
5357 	    case PLUS:
5358 		if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5359 		    next = NEXTOPER(scan);
5360 		    if (   OP(next) == EXACT
5361                         || OP(next) == EXACT_ONLY8
5362                         || OP(next) == EXACTL
5363                         || (flags & SCF_DO_STCLASS))
5364                     {
5365 			mincount = 1;
5366 			maxcount = REG_INFTY;
5367 			next = regnext(scan);
5368 			scan = NEXTOPER(scan);
5369 			goto do_curly;
5370 		    }
5371 		}
5372 		if (flags & SCF_DO_SUBSTR)
5373 		    data->pos_min++;
5374 		min++;
5375 		/* FALLTHROUGH */
5376 	    case STAR:
5377                 next = NEXTOPER(scan);
5378 
5379                 /* This temporary node can now be turned into EXACTFU, and
5380                  * must, as regexec.c doesn't handle it */
5381                 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
5382                     OP(next) = EXACTFU;
5383                 }
5384 
5385                 if (     STR_LEN(next) == 1
5386                     &&   isALPHA_A(* STRING(next))
5387                     && (         OP(next) == EXACTFAA
5388                         || (     OP(next) == EXACTFU
5389                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5390                     &&   mutate_ok
5391                 ) {
5392                     /* These differ in just one bit */
5393                     U8 mask = ~ ('A' ^ 'a');
5394 
5395                     assert(isALPHA_A(* STRING(next)));
5396 
5397                     /* Then replace it by an ANYOFM node, with
5398                     * the mask set to the complement of the
5399                     * bit that differs between upper and lower
5400                     * case, and the lowest code point of the
5401                     * pair (which the '&' forces) */
5402                     OP(next) = ANYOFM;
5403                     ARG_SET(next, *STRING(next) & mask);
5404                     FLAGS(next) = mask;
5405                 }
5406 
5407 		if (flags & SCF_DO_STCLASS) {
5408 		    mincount = 0;
5409 		    maxcount = REG_INFTY;
5410 		    next = regnext(scan);
5411 		    scan = NEXTOPER(scan);
5412 		    goto do_curly;
5413 		}
5414 		if (flags & SCF_DO_SUBSTR) {
5415                     scan_commit(pRExC_state, data, minlenp, is_inf);
5416                     /* Cannot extend fixed substrings */
5417 		    data->cur_is_floating = 1; /* float */
5418 		}
5419                 is_inf = is_inf_internal = 1;
5420                 scan = regnext(scan);
5421 		goto optimize_curly_tail;
5422 	    case CURLY:
5423 	        if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5424 	            && (scan->flags == stopparen))
5425 		{
5426 		    mincount = 1;
5427 		    maxcount = 1;
5428 		} else {
5429 		    mincount = ARG1(scan);
5430 		    maxcount = ARG2(scan);
5431 		}
5432 		next = regnext(scan);
5433 		if (OP(scan) == CURLYX) {
5434 		    I32 lp = (data ? *(data->last_closep) : 0);
5435 		    scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5436 		}
5437 		scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5438 		next_is_eval = (OP(scan) == EVAL);
5439 	      do_curly:
5440 		if (flags & SCF_DO_SUBSTR) {
5441                     if (mincount == 0)
5442                         scan_commit(pRExC_state, data, minlenp, is_inf);
5443                     /* Cannot extend fixed substrings */
5444 		    pos_before = data->pos_min;
5445 		}
5446 		if (data) {
5447 		    fl = data->flags;
5448 		    data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5449 		    if (is_inf)
5450 			data->flags |= SF_IS_INF;
5451 		}
5452 		if (flags & SCF_DO_STCLASS) {
5453 		    ssc_init(pRExC_state, &this_class);
5454 		    oclass = data->start_class;
5455 		    data->start_class = &this_class;
5456 		    f |= SCF_DO_STCLASS_AND;
5457 		    f &= ~SCF_DO_STCLASS_OR;
5458 		}
5459 	        /* Exclude from super-linear cache processing any {n,m}
5460 		   regops for which the combination of input pos and regex
5461 		   pos is not enough information to determine if a match
5462 		   will be possible.
5463 
5464 		   For example, in the regex /foo(bar\s*){4,8}baz/ with the
5465 		   regex pos at the \s*, the prospects for a match depend not
5466 		   only on the input position but also on how many (bar\s*)
5467 		   repeats into the {4,8} we are. */
5468                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5469 		    f &= ~SCF_WHILEM_VISITED_POS;
5470 
5471 		/* This will finish on WHILEM, setting scan, or on NULL: */
5472                 /* recurse study_chunk() on loop bodies */
5473 		minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5474                                   last, data, stopparen, recursed_depth, NULL,
5475                                   (mincount == 0
5476                                    ? (f & ~SCF_DO_SUBSTR)
5477                                    : f)
5478                                   , depth+1, mutate_ok);
5479 
5480 		if (flags & SCF_DO_STCLASS)
5481 		    data->start_class = oclass;
5482 		if (mincount == 0 || minnext == 0) {
5483 		    if (flags & SCF_DO_STCLASS_OR) {
5484 			ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5485 		    }
5486 		    else if (flags & SCF_DO_STCLASS_AND) {
5487 			/* Switch to OR mode: cache the old value of
5488 			 * data->start_class */
5489 			INIT_AND_WITHP;
5490 			StructCopy(data->start_class, and_withp, regnode_ssc);
5491 			flags &= ~SCF_DO_STCLASS_AND;
5492 			StructCopy(&this_class, data->start_class, regnode_ssc);
5493 			flags |= SCF_DO_STCLASS_OR;
5494                         ANYOF_FLAGS(data->start_class)
5495                                                 |= SSC_MATCHES_EMPTY_STRING;
5496 		    }
5497 		} else {		/* Non-zero len */
5498 		    if (flags & SCF_DO_STCLASS_OR) {
5499 			ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5500 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5501 		    }
5502 		    else if (flags & SCF_DO_STCLASS_AND)
5503 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5504 		    flags &= ~SCF_DO_STCLASS;
5505 		}
5506 		if (!scan) 		/* It was not CURLYX, but CURLY. */
5507 		    scan = next;
5508 		if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5509 		    /* ? quantifier ok, except for (?{ ... }) */
5510 		    && (next_is_eval || !(mincount == 0 && maxcount == 1))
5511 		    && (minnext == 0) && (deltanext == 0)
5512 		    && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5513                     && maxcount <= REG_INFTY/3) /* Complement check for big
5514                                                    count */
5515 		{
5516 		    _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5517                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5518                             "Quantifier unexpected on zero-length expression "
5519                             "in regex m/%" UTF8f "/",
5520 			     UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5521 				  RExC_precomp)));
5522                 }
5523 
5524                 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
5525                     || min >= SSize_t_MAX - minnext * mincount )
5526                 {
5527                     FAIL("Regexp out of space");
5528                 }
5529 
5530 		min += minnext * mincount;
5531 		is_inf_internal |= deltanext == SSize_t_MAX
5532                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5533 		is_inf |= is_inf_internal;
5534                 if (is_inf) {
5535 		    delta = SSize_t_MAX;
5536                 } else {
5537 		    delta += (minnext + deltanext) * maxcount
5538                              - minnext * mincount;
5539                 }
5540 		/* Try powerful optimization CURLYX => CURLYN. */
5541 		if (  OP(oscan) == CURLYX && data
5542 		      && data->flags & SF_IN_PAR
5543 		      && !(data->flags & SF_HAS_EVAL)
5544 		      && !deltanext && minnext == 1
5545                       && mutate_ok
5546                 ) {
5547 		    /* Try to optimize to CURLYN.  */
5548 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5549 		    regnode * const nxt1 = nxt;
5550 #ifdef DEBUGGING
5551 		    regnode *nxt2;
5552 #endif
5553 
5554 		    /* Skip open. */
5555 		    nxt = regnext(nxt);
5556 		    if (!REGNODE_SIMPLE(OP(nxt))
5557 			&& !(PL_regkind[OP(nxt)] == EXACT
5558 			     && STR_LEN(nxt) == 1))
5559 			goto nogo;
5560 #ifdef DEBUGGING
5561 		    nxt2 = nxt;
5562 #endif
5563 		    nxt = regnext(nxt);
5564 		    if (OP(nxt) != CLOSE)
5565 			goto nogo;
5566 		    if (RExC_open_parens) {
5567 
5568                         /*open->CURLYM*/
5569                         RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5570 
5571                         /*close->while*/
5572                         RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5573 		    }
5574 		    /* Now we know that nxt2 is the only contents: */
5575 		    oscan->flags = (U8)ARG(nxt);
5576 		    OP(oscan) = CURLYN;
5577 		    OP(nxt1) = NOTHING;	/* was OPEN. */
5578 
5579 #ifdef DEBUGGING
5580 		    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5581 		    NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5582 		    NEXT_OFF(nxt2) = 0;	/* just for consistency with CURLY. */
5583 		    OP(nxt) = OPTIMIZED;	/* was CLOSE. */
5584 		    OP(nxt + 1) = OPTIMIZED; /* was count. */
5585 		    NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5586 #endif
5587 		}
5588 	      nogo:
5589 
5590 		/* Try optimization CURLYX => CURLYM. */
5591 		if (  OP(oscan) == CURLYX && data
5592 		      && !(data->flags & SF_HAS_PAR)
5593 		      && !(data->flags & SF_HAS_EVAL)
5594 		      && !deltanext	/* atom is fixed width */
5595 		      && minnext != 0	/* CURLYM can't handle zero width */
5596                          /* Nor characters whose fold at run-time may be
5597                           * multi-character */
5598                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5599                       && mutate_ok
5600 		) {
5601 		    /* XXXX How to optimize if data == 0? */
5602 		    /* Optimize to a simpler form.  */
5603 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5604 		    regnode *nxt2;
5605 
5606 		    OP(oscan) = CURLYM;
5607 		    while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5608 			    && (OP(nxt2) != WHILEM))
5609 			nxt = nxt2;
5610 		    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5611 		    /* Need to optimize away parenths. */
5612 		    if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5613 			/* Set the parenth number.  */
5614 			regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5615 
5616 			oscan->flags = (U8)ARG(nxt);
5617 			if (RExC_open_parens) {
5618                              /*open->CURLYM*/
5619                             RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5620 
5621                             /*close->NOTHING*/
5622                             RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5623                                                          + 1;
5624 			}
5625 			OP(nxt1) = OPTIMIZED;	/* was OPEN. */
5626 			OP(nxt) = OPTIMIZED;	/* was CLOSE. */
5627 
5628 #ifdef DEBUGGING
5629 			OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5630 			OP(nxt + 1) = OPTIMIZED; /* was count. */
5631 			NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5632 			NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5633 #endif
5634 #if 0
5635 			while ( nxt1 && (OP(nxt1) != WHILEM)) {
5636 			    regnode *nnxt = regnext(nxt1);
5637 			    if (nnxt == nxt) {
5638 				if (reg_off_by_arg[OP(nxt1)])
5639 				    ARG_SET(nxt1, nxt2 - nxt1);
5640 				else if (nxt2 - nxt1 < U16_MAX)
5641 				    NEXT_OFF(nxt1) = nxt2 - nxt1;
5642 				else
5643 				    OP(nxt) = NOTHING;	/* Cannot beautify */
5644 			    }
5645 			    nxt1 = nnxt;
5646 			}
5647 #endif
5648 			/* Optimize again: */
5649                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
5650 			study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5651                                     NULL, stopparen, recursed_depth, NULL, 0,
5652                                     depth+1, mutate_ok);
5653 		    }
5654 		    else
5655 			oscan->flags = 0;
5656 		}
5657 		else if ((OP(oscan) == CURLYX)
5658 			 && (flags & SCF_WHILEM_VISITED_POS)
5659 			 /* See the comment on a similar expression above.
5660 			    However, this time it's not a subexpression
5661 			    we care about, but the expression itself. */
5662 			 && (maxcount == REG_INFTY)
5663 			 && data) {
5664 		    /* This stays as CURLYX, we can put the count/of pair. */
5665 		    /* Find WHILEM (as in regexec.c) */
5666 		    regnode *nxt = oscan + NEXT_OFF(oscan);
5667 
5668 		    if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5669 			nxt += ARG(nxt);
5670                     nxt = PREVOPER(nxt);
5671                     if (nxt->flags & 0xf) {
5672                         /* we've already set whilem count on this node */
5673                     } else if (++data->whilem_c < 16) {
5674                         assert(data->whilem_c <= RExC_whilem_seen);
5675                         nxt->flags = (U8)(data->whilem_c
5676                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5677                     }
5678 		}
5679 		if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5680 		    pars++;
5681 		if (flags & SCF_DO_SUBSTR) {
5682 		    SV *last_str = NULL;
5683                     STRLEN last_chrs = 0;
5684 		    int counted = mincount != 0;
5685 
5686                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5687                                                                   string. */
5688 			SSize_t b = pos_before >= data->last_start_min
5689 			    ? pos_before : data->last_start_min;
5690 			STRLEN l;
5691 			const char * const s = SvPV_const(data->last_found, l);
5692 			SSize_t old = b - data->last_start_min;
5693                         assert(old >= 0);
5694 
5695 			if (UTF)
5696 			    old = utf8_hop_forward((U8*)s, old,
5697                                                (U8 *) SvEND(data->last_found))
5698                                 - (U8*)s;
5699 			l -= old;
5700 			/* Get the added string: */
5701 			last_str = newSVpvn_utf8(s  + old, l, UTF);
5702                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5703                                             (U8*)(s + old + l)) : l;
5704 			if (deltanext == 0 && pos_before == b) {
5705 			    /* What was added is a constant string */
5706 			    if (mincount > 1) {
5707 
5708 				SvGROW(last_str, (mincount * l) + 1);
5709 				repeatcpy(SvPVX(last_str) + l,
5710 					  SvPVX_const(last_str), l,
5711                                           mincount - 1);
5712 				SvCUR_set(last_str, SvCUR(last_str) * mincount);
5713 				/* Add additional parts. */
5714 				SvCUR_set(data->last_found,
5715 					  SvCUR(data->last_found) - l);
5716 				sv_catsv(data->last_found, last_str);
5717 				{
5718 				    SV * sv = data->last_found;
5719 				    MAGIC *mg =
5720 					SvUTF8(sv) && SvMAGICAL(sv) ?
5721 					mg_find(sv, PERL_MAGIC_utf8) : NULL;
5722 				    if (mg && mg->mg_len >= 0)
5723 					mg->mg_len += last_chrs * (mincount-1);
5724 				}
5725                                 last_chrs *= mincount;
5726 				data->last_end += l * (mincount - 1);
5727 			    }
5728 			} else {
5729 			    /* start offset must point into the last copy */
5730 			    data->last_start_min += minnext * (mincount - 1);
5731 			    data->last_start_max =
5732                               is_inf
5733                                ? SSize_t_MAX
5734 			       : data->last_start_max +
5735                                  (maxcount - 1) * (minnext + data->pos_delta);
5736 			}
5737 		    }
5738 		    /* It is counted once already... */
5739 		    data->pos_min += minnext * (mincount - counted);
5740 #if 0
5741 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5742                               " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5743                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5744     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5745     (UV)mincount);
5746 if (deltanext != SSize_t_MAX)
5747 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5748     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5749           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5750 #endif
5751 		    if (deltanext == SSize_t_MAX
5752                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5753 		        data->pos_delta = SSize_t_MAX;
5754 		    else
5755 		        data->pos_delta += - counted * deltanext +
5756 			(minnext + deltanext) * maxcount - minnext * mincount;
5757 		    if (mincount != maxcount) {
5758 			 /* Cannot extend fixed substrings found inside
5759 			    the group.  */
5760                         scan_commit(pRExC_state, data, minlenp, is_inf);
5761 			if (mincount && last_str) {
5762 			    SV * const sv = data->last_found;
5763 			    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5764 				mg_find(sv, PERL_MAGIC_utf8) : NULL;
5765 
5766 			    if (mg)
5767 				mg->mg_len = -1;
5768 			    sv_setsv(sv, last_str);
5769 			    data->last_end = data->pos_min;
5770 			    data->last_start_min = data->pos_min - last_chrs;
5771 			    data->last_start_max = is_inf
5772 				? SSize_t_MAX
5773 				: data->pos_min + data->pos_delta - last_chrs;
5774 			}
5775 			data->cur_is_floating = 1; /* float */
5776 		    }
5777 		    SvREFCNT_dec(last_str);
5778 		}
5779 		if (data && (fl & SF_HAS_EVAL))
5780 		    data->flags |= SF_HAS_EVAL;
5781 	      optimize_curly_tail:
5782 		rck_elide_nothing(oscan);
5783 		continue;
5784 
5785 	    default:
5786 #ifdef DEBUGGING
5787                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5788                                                                     OP(scan));
5789 #endif
5790             case REF:
5791             case CLUMP:
5792 		if (flags & SCF_DO_SUBSTR) {
5793                     /* Cannot expect anything... */
5794                     scan_commit(pRExC_state, data, minlenp, is_inf);
5795 		    data->cur_is_floating = 1; /* float */
5796 		}
5797 		is_inf = is_inf_internal = 1;
5798 		if (flags & SCF_DO_STCLASS_OR) {
5799                     if (OP(scan) == CLUMP) {
5800                         /* Actually is any start char, but very few code points
5801                          * aren't start characters */
5802                         ssc_match_all_cp(data->start_class);
5803                     }
5804                     else {
5805                         ssc_anything(data->start_class);
5806                     }
5807                 }
5808 		flags &= ~SCF_DO_STCLASS;
5809 		break;
5810 	    }
5811 	}
5812 	else if (OP(scan) == LNBREAK) {
5813 	    if (flags & SCF_DO_STCLASS) {
5814     	        if (flags & SCF_DO_STCLASS_AND) {
5815                     ssc_intersection(data->start_class,
5816                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5817                     ssc_clear_locale(data->start_class);
5818                     ANYOF_FLAGS(data->start_class)
5819                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5820                 }
5821                 else if (flags & SCF_DO_STCLASS_OR) {
5822                     ssc_union(data->start_class,
5823                               PL_XPosix_ptrs[_CC_VERTSPACE],
5824                               FALSE);
5825 		    ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5826 
5827                     /* See commit msg for
5828                      * 749e076fceedeb708a624933726e7989f2302f6a */
5829                     ANYOF_FLAGS(data->start_class)
5830                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5831                 }
5832 		flags &= ~SCF_DO_STCLASS;
5833             }
5834 	    min++;
5835             if (delta != SSize_t_MAX)
5836                 delta++;    /* Because of the 2 char string cr-lf */
5837             if (flags & SCF_DO_SUBSTR) {
5838                 /* Cannot expect anything... */
5839                 scan_commit(pRExC_state, data, minlenp, is_inf);
5840     	        data->pos_min += 1;
5841                 if (data->pos_delta != SSize_t_MAX) {
5842                     data->pos_delta += 1;
5843                 }
5844 		data->cur_is_floating = 1; /* float */
5845     	    }
5846 	}
5847 	else if (REGNODE_SIMPLE(OP(scan))) {
5848 
5849 	    if (flags & SCF_DO_SUBSTR) {
5850                 scan_commit(pRExC_state, data, minlenp, is_inf);
5851 		data->pos_min++;
5852 	    }
5853 	    min++;
5854 	    if (flags & SCF_DO_STCLASS) {
5855                 bool invert = 0;
5856                 SV* my_invlist = NULL;
5857                 U8 namedclass;
5858 
5859                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5860                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5861 
5862 		/* Some of the logic below assumes that switching
5863 		   locale on will only add false positives. */
5864 		switch (OP(scan)) {
5865 
5866 		default:
5867 #ifdef DEBUGGING
5868                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5869                                                                      OP(scan));
5870 #endif
5871 		case SANY:
5872 		    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5873 			ssc_match_all_cp(data->start_class);
5874 		    break;
5875 
5876 		case REG_ANY:
5877                     {
5878                         SV* REG_ANY_invlist = _new_invlist(2);
5879                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5880                                                             '\n');
5881                         if (flags & SCF_DO_STCLASS_OR) {
5882                             ssc_union(data->start_class,
5883                                       REG_ANY_invlist,
5884                                       TRUE /* TRUE => invert, hence all but \n
5885                                             */
5886                                       );
5887                         }
5888                         else if (flags & SCF_DO_STCLASS_AND) {
5889                             ssc_intersection(data->start_class,
5890                                              REG_ANY_invlist,
5891                                              TRUE  /* TRUE => invert */
5892                                              );
5893                             ssc_clear_locale(data->start_class);
5894                         }
5895                         SvREFCNT_dec_NN(REG_ANY_invlist);
5896 		    }
5897 		    break;
5898 
5899                 case ANYOFD:
5900                 case ANYOFL:
5901                 case ANYOFPOSIXL:
5902                 case ANYOFH:
5903                 case ANYOF:
5904 		    if (flags & SCF_DO_STCLASS_AND)
5905 			ssc_and(pRExC_state, data->start_class,
5906                                 (regnode_charclass *) scan);
5907 		    else
5908 			ssc_or(pRExC_state, data->start_class,
5909                                                           (regnode_charclass *) scan);
5910 		    break;
5911 
5912                 case NANYOFM:
5913                 case ANYOFM:
5914                   {
5915                     SV* cp_list = get_ANYOFM_contents(scan);
5916 
5917                     if (flags & SCF_DO_STCLASS_OR) {
5918                         ssc_union(data->start_class, cp_list, invert);
5919                     }
5920                     else if (flags & SCF_DO_STCLASS_AND) {
5921                         ssc_intersection(data->start_class, cp_list, invert);
5922                     }
5923 
5924                     SvREFCNT_dec_NN(cp_list);
5925                     break;
5926                   }
5927 
5928 		case NPOSIXL:
5929                     invert = 1;
5930                     /* FALLTHROUGH */
5931 
5932 		case POSIXL:
5933                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5934                     if (flags & SCF_DO_STCLASS_AND) {
5935                         bool was_there = cBOOL(
5936                                           ANYOF_POSIXL_TEST(data->start_class,
5937                                                                  namedclass));
5938                         ANYOF_POSIXL_ZERO(data->start_class);
5939                         if (was_there) {    /* Do an AND */
5940                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5941                         }
5942                         /* No individual code points can now match */
5943                         data->start_class->invlist
5944                                                 = sv_2mortal(_new_invlist(0));
5945                     }
5946                     else {
5947                         int complement = namedclass + ((invert) ? -1 : 1);
5948 
5949                         assert(flags & SCF_DO_STCLASS_OR);
5950 
5951                         /* If the complement of this class was already there,
5952                          * the result is that they match all code points,
5953                          * (\d + \D == everything).  Remove the classes from
5954                          * future consideration.  Locale is not relevant in
5955                          * this case */
5956                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5957                             ssc_match_all_cp(data->start_class);
5958                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5959                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5960                         }
5961                         else {  /* The usual case; just add this class to the
5962                                    existing set */
5963                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5964                         }
5965                     }
5966                     break;
5967 
5968                 case NPOSIXA:   /* For these, we always know the exact set of
5969                                    what's matched */
5970                     invert = 1;
5971                     /* FALLTHROUGH */
5972 		case POSIXA:
5973                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
5974                     goto join_posix_and_ascii;
5975 
5976 		case NPOSIXD:
5977 		case NPOSIXU:
5978                     invert = 1;
5979                     /* FALLTHROUGH */
5980 		case POSIXD:
5981 		case POSIXU:
5982                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
5983 
5984                     /* NPOSIXD matches all upper Latin1 code points unless the
5985                      * target string being matched is UTF-8, which is
5986                      * unknowable until match time.  Since we are going to
5987                      * invert, we want to get rid of all of them so that the
5988                      * inversion will match all */
5989                     if (OP(scan) == NPOSIXD) {
5990                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5991                                           &my_invlist);
5992                     }
5993 
5994                   join_posix_and_ascii:
5995 
5996                     if (flags & SCF_DO_STCLASS_AND) {
5997                         ssc_intersection(data->start_class, my_invlist, invert);
5998                         ssc_clear_locale(data->start_class);
5999                     }
6000                     else {
6001                         assert(flags & SCF_DO_STCLASS_OR);
6002                         ssc_union(data->start_class, my_invlist, invert);
6003                     }
6004                     SvREFCNT_dec(my_invlist);
6005 		}
6006 		if (flags & SCF_DO_STCLASS_OR)
6007 		    ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6008 		flags &= ~SCF_DO_STCLASS;
6009 	    }
6010 	}
6011 	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
6012 	    data->flags |= (OP(scan) == MEOL
6013 			    ? SF_BEFORE_MEOL
6014 			    : SF_BEFORE_SEOL);
6015             scan_commit(pRExC_state, data, minlenp, is_inf);
6016 
6017 	}
6018 	else if (  PL_regkind[OP(scan)] == BRANCHJ
6019 		 /* Lookbehind, or need to calculate parens/evals/stclass: */
6020 		   && (scan->flags || data || (flags & SCF_DO_STCLASS))
6021 		   && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6022         {
6023             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6024                 || OP(scan) == UNLESSM )
6025             {
6026                 /* Negative Lookahead/lookbehind
6027                    In this case we can't do fixed string optimisation.
6028                 */
6029 
6030                 SSize_t deltanext, minnext, fake = 0;
6031                 regnode *nscan;
6032                 regnode_ssc intrnl;
6033                 int f = 0;
6034 
6035                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6036                 if (data) {
6037                     data_fake.whilem_c = data->whilem_c;
6038                     data_fake.last_closep = data->last_closep;
6039 		}
6040                 else
6041                     data_fake.last_closep = &fake;
6042 		data_fake.pos_delta = delta;
6043                 if ( flags & SCF_DO_STCLASS && !scan->flags
6044                      && OP(scan) == IFMATCH ) { /* Lookahead */
6045                     ssc_init(pRExC_state, &intrnl);
6046                     data_fake.start_class = &intrnl;
6047                     f |= SCF_DO_STCLASS_AND;
6048 		}
6049                 if (flags & SCF_WHILEM_VISITED_POS)
6050                     f |= SCF_WHILEM_VISITED_POS;
6051                 next = regnext(scan);
6052                 nscan = NEXTOPER(NEXTOPER(scan));
6053 
6054                 /* recurse study_chunk() for lookahead body */
6055                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6056                                       last, &data_fake, stopparen,
6057                                       recursed_depth, NULL, f, depth+1,
6058                                       mutate_ok);
6059                 if (scan->flags) {
6060                     if (   deltanext < 0
6061                         || deltanext > (I32) U8_MAX
6062                         || minnext > (I32)U8_MAX
6063                         || minnext + deltanext > (I32)U8_MAX)
6064                     {
6065 			FAIL2("Lookbehind longer than %" UVuf " not implemented",
6066                               (UV)U8_MAX);
6067                     }
6068 
6069                     /* The 'next_off' field has been repurposed to count the
6070                      * additional starting positions to try beyond the initial
6071                      * one.  (This leaves it at 0 for non-variable length
6072                      * matches to avoid breakage for those not using this
6073                      * extension) */
6074                     if (deltanext) {
6075                         scan->next_off = deltanext;
6076                         ckWARNexperimental(RExC_parse,
6077                             WARN_EXPERIMENTAL__VLB,
6078                             "Variable length lookbehind is experimental");
6079                     }
6080                     scan->flags = (U8)minnext + deltanext;
6081                 }
6082                 if (data) {
6083                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6084                         pars++;
6085                     if (data_fake.flags & SF_HAS_EVAL)
6086                         data->flags |= SF_HAS_EVAL;
6087                     data->whilem_c = data_fake.whilem_c;
6088                 }
6089                 if (f & SCF_DO_STCLASS_AND) {
6090 		    if (flags & SCF_DO_STCLASS_OR) {
6091 			/* OR before, AND after: ideally we would recurse with
6092 			 * data_fake to get the AND applied by study of the
6093 			 * remainder of the pattern, and then derecurse;
6094 			 * *** HACK *** for now just treat as "no information".
6095 			 * See [perl #56690].
6096 			 */
6097 			ssc_init(pRExC_state, data->start_class);
6098 		    }  else {
6099                         /* AND before and after: combine and continue.  These
6100                          * assertions are zero-length, so can match an EMPTY
6101                          * string */
6102 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6103                         ANYOF_FLAGS(data->start_class)
6104                                                    |= SSC_MATCHES_EMPTY_STRING;
6105 		    }
6106                 }
6107 	    }
6108 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6109             else {
6110                 /* Positive Lookahead/lookbehind
6111                    In this case we can do fixed string optimisation,
6112                    but we must be careful about it. Note in the case of
6113                    lookbehind the positions will be offset by the minimum
6114                    length of the pattern, something we won't know about
6115                    until after the recurse.
6116                 */
6117                 SSize_t deltanext, fake = 0;
6118                 regnode *nscan;
6119                 regnode_ssc intrnl;
6120                 int f = 0;
6121                 /* We use SAVEFREEPV so that when the full compile
6122                     is finished perl will clean up the allocated
6123                     minlens when it's all done. This way we don't
6124                     have to worry about freeing them when we know
6125                     they wont be used, which would be a pain.
6126                  */
6127                 SSize_t *minnextp;
6128                 Newx( minnextp, 1, SSize_t );
6129                 SAVEFREEPV(minnextp);
6130 
6131                 if (data) {
6132                     StructCopy(data, &data_fake, scan_data_t);
6133                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6134                         f |= SCF_DO_SUBSTR;
6135                         if (scan->flags)
6136                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6137                         data_fake.last_found=newSVsv(data->last_found);
6138                     }
6139                 }
6140                 else
6141                     data_fake.last_closep = &fake;
6142                 data_fake.flags = 0;
6143                 data_fake.substrs[0].flags = 0;
6144                 data_fake.substrs[1].flags = 0;
6145 		data_fake.pos_delta = delta;
6146                 if (is_inf)
6147 	            data_fake.flags |= SF_IS_INF;
6148                 if ( flags & SCF_DO_STCLASS && !scan->flags
6149                      && OP(scan) == IFMATCH ) { /* Lookahead */
6150                     ssc_init(pRExC_state, &intrnl);
6151                     data_fake.start_class = &intrnl;
6152                     f |= SCF_DO_STCLASS_AND;
6153                 }
6154                 if (flags & SCF_WHILEM_VISITED_POS)
6155                     f |= SCF_WHILEM_VISITED_POS;
6156                 next = regnext(scan);
6157                 nscan = NEXTOPER(NEXTOPER(scan));
6158 
6159                 /* positive lookahead study_chunk() recursion */
6160                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6161                                         &deltanext, last, &data_fake,
6162                                         stopparen, recursed_depth, NULL,
6163                                         f, depth+1, mutate_ok);
6164                 if (scan->flags) {
6165                     assert(0);  /* This code has never been tested since this
6166                                    is normally not compiled */
6167                     if (   deltanext < 0
6168                         || deltanext > (I32) U8_MAX
6169                         || *minnextp > (I32)U8_MAX
6170                         || *minnextp + deltanext > (I32)U8_MAX)
6171                     {
6172 			FAIL2("Lookbehind longer than %" UVuf " not implemented",
6173                               (UV)U8_MAX);
6174                     }
6175 
6176                     if (deltanext) {
6177                         scan->next_off = deltanext;
6178                     }
6179                     scan->flags = (U8)*minnextp + deltanext;
6180                 }
6181 
6182                 *minnextp += min;
6183 
6184                 if (f & SCF_DO_STCLASS_AND) {
6185                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6186                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6187                 }
6188                 if (data) {
6189                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6190                         pars++;
6191                     if (data_fake.flags & SF_HAS_EVAL)
6192                         data->flags |= SF_HAS_EVAL;
6193                     data->whilem_c = data_fake.whilem_c;
6194                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6195                         int i;
6196                         if (RExC_rx->minlen<*minnextp)
6197                             RExC_rx->minlen=*minnextp;
6198                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6199                         SvREFCNT_dec_NN(data_fake.last_found);
6200 
6201                         for (i = 0; i < 2; i++) {
6202                             if (data_fake.substrs[i].minlenp != minlenp) {
6203                                 data->substrs[i].min_offset =
6204                                             data_fake.substrs[i].min_offset;
6205                                 data->substrs[i].max_offset =
6206                                             data_fake.substrs[i].max_offset;
6207                                 data->substrs[i].minlenp =
6208                                             data_fake.substrs[i].minlenp;
6209                                 data->substrs[i].lookbehind += scan->flags;
6210                             }
6211                         }
6212                     }
6213                 }
6214 	    }
6215 #endif
6216 	}
6217 
6218 	else if (OP(scan) == OPEN) {
6219 	    if (stopparen != (I32)ARG(scan))
6220 	        pars++;
6221 	}
6222 	else if (OP(scan) == CLOSE) {
6223 	    if (stopparen == (I32)ARG(scan)) {
6224 	        break;
6225 	    }
6226 	    if ((I32)ARG(scan) == is_par) {
6227 		next = regnext(scan);
6228 
6229 		if ( next && (OP(next) != WHILEM) && next < last)
6230 		    is_par = 0;		/* Disable optimization */
6231 	    }
6232 	    if (data)
6233 		*(data->last_closep) = ARG(scan);
6234 	}
6235 	else if (OP(scan) == EVAL) {
6236 		if (data)
6237 		    data->flags |= SF_HAS_EVAL;
6238 	}
6239 	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6240 	    if (flags & SCF_DO_SUBSTR) {
6241                 scan_commit(pRExC_state, data, minlenp, is_inf);
6242 		flags &= ~SCF_DO_SUBSTR;
6243 	    }
6244 	    if (data && OP(scan)==ACCEPT) {
6245 	        data->flags |= SCF_SEEN_ACCEPT;
6246 	        if (stopmin > min)
6247 	            stopmin = min;
6248 	    }
6249 	}
6250 	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6251 	{
6252 		if (flags & SCF_DO_SUBSTR) {
6253                     scan_commit(pRExC_state, data, minlenp, is_inf);
6254 		    data->cur_is_floating = 1; /* float */
6255 		}
6256 		is_inf = is_inf_internal = 1;
6257 		if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6258 		    ssc_anything(data->start_class);
6259 		flags &= ~SCF_DO_STCLASS;
6260 	}
6261 	else if (OP(scan) == GPOS) {
6262             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6263 	        !(delta || is_inf || (data && data->pos_delta)))
6264 	    {
6265                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6266                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
6267 	        if (RExC_rx->gofs < (STRLEN)min)
6268 		    RExC_rx->gofs = min;
6269             } else {
6270                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6271                 RExC_rx->gofs = 0;
6272             }
6273 	}
6274 #ifdef TRIE_STUDY_OPT
6275 #ifdef FULL_TRIE_STUDY
6276         else if (PL_regkind[OP(scan)] == TRIE) {
6277             /* NOTE - There is similar code to this block above for handling
6278                BRANCH nodes on the initial study.  If you change stuff here
6279                check there too. */
6280             regnode *trie_node= scan;
6281             regnode *tail= regnext(scan);
6282             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6283             SSize_t max1 = 0, min1 = SSize_t_MAX;
6284             regnode_ssc accum;
6285 
6286             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6287                 /* Cannot merge strings after this. */
6288                 scan_commit(pRExC_state, data, minlenp, is_inf);
6289             }
6290             if (flags & SCF_DO_STCLASS)
6291                 ssc_init_zero(pRExC_state, &accum);
6292 
6293             if (!trie->jump) {
6294                 min1= trie->minlen;
6295                 max1= trie->maxlen;
6296             } else {
6297                 const regnode *nextbranch= NULL;
6298                 U32 word;
6299 
6300                 for ( word=1 ; word <= trie->wordcount ; word++)
6301                 {
6302                     SSize_t deltanext=0, minnext=0, f = 0, fake;
6303                     regnode_ssc this_class;
6304 
6305                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6306                     if (data) {
6307                         data_fake.whilem_c = data->whilem_c;
6308                         data_fake.last_closep = data->last_closep;
6309                     }
6310                     else
6311                         data_fake.last_closep = &fake;
6312 		    data_fake.pos_delta = delta;
6313                     if (flags & SCF_DO_STCLASS) {
6314                         ssc_init(pRExC_state, &this_class);
6315                         data_fake.start_class = &this_class;
6316                         f = SCF_DO_STCLASS_AND;
6317                     }
6318                     if (flags & SCF_WHILEM_VISITED_POS)
6319                         f |= SCF_WHILEM_VISITED_POS;
6320 
6321                     if (trie->jump[word]) {
6322                         if (!nextbranch)
6323                             nextbranch = trie_node + trie->jump[0];
6324                         scan= trie_node + trie->jump[word];
6325                         /* We go from the jump point to the branch that follows
6326                            it. Note this means we need the vestigal unused
6327                            branches even though they arent otherwise used. */
6328                         /* optimise study_chunk() for TRIE */
6329                         minnext = study_chunk(pRExC_state, &scan, minlenp,
6330                             &deltanext, (regnode *)nextbranch, &data_fake,
6331                             stopparen, recursed_depth, NULL, f, depth+1,
6332                             mutate_ok);
6333                     }
6334                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6335                         nextbranch= regnext((regnode*)nextbranch);
6336 
6337                     if (min1 > (SSize_t)(minnext + trie->minlen))
6338                         min1 = minnext + trie->minlen;
6339                     if (deltanext == SSize_t_MAX) {
6340                         is_inf = is_inf_internal = 1;
6341                         max1 = SSize_t_MAX;
6342                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6343                         max1 = minnext + deltanext + trie->maxlen;
6344 
6345                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6346                         pars++;
6347                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
6348                         if ( stopmin > min + min1)
6349 	                    stopmin = min + min1;
6350 	                flags &= ~SCF_DO_SUBSTR;
6351 	                if (data)
6352 	                    data->flags |= SCF_SEEN_ACCEPT;
6353 	            }
6354                     if (data) {
6355                         if (data_fake.flags & SF_HAS_EVAL)
6356                             data->flags |= SF_HAS_EVAL;
6357                         data->whilem_c = data_fake.whilem_c;
6358                     }
6359                     if (flags & SCF_DO_STCLASS)
6360                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6361                 }
6362             }
6363             if (flags & SCF_DO_SUBSTR) {
6364                 data->pos_min += min1;
6365                 data->pos_delta += max1 - min1;
6366                 if (max1 != min1 || is_inf)
6367                     data->cur_is_floating = 1; /* float */
6368             }
6369             min += min1;
6370             if (delta != SSize_t_MAX) {
6371                 if (SSize_t_MAX - (max1 - min1) >= delta)
6372                     delta += max1 - min1;
6373                 else
6374                     delta = SSize_t_MAX;
6375             }
6376             if (flags & SCF_DO_STCLASS_OR) {
6377                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6378                 if (min1) {
6379                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6380                     flags &= ~SCF_DO_STCLASS;
6381                 }
6382             }
6383             else if (flags & SCF_DO_STCLASS_AND) {
6384                 if (min1) {
6385                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6386                     flags &= ~SCF_DO_STCLASS;
6387                 }
6388                 else {
6389                     /* Switch to OR mode: cache the old value of
6390                      * data->start_class */
6391 		    INIT_AND_WITHP;
6392                     StructCopy(data->start_class, and_withp, regnode_ssc);
6393                     flags &= ~SCF_DO_STCLASS_AND;
6394                     StructCopy(&accum, data->start_class, regnode_ssc);
6395                     flags |= SCF_DO_STCLASS_OR;
6396                 }
6397             }
6398             scan= tail;
6399             continue;
6400         }
6401 #else
6402 	else if (PL_regkind[OP(scan)] == TRIE) {
6403 	    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6404 	    U8*bang=NULL;
6405 
6406 	    min += trie->minlen;
6407 	    delta += (trie->maxlen - trie->minlen);
6408 	    flags &= ~SCF_DO_STCLASS; /* xxx */
6409             if (flags & SCF_DO_SUBSTR) {
6410                 /* Cannot expect anything... */
6411                 scan_commit(pRExC_state, data, minlenp, is_inf);
6412     	        data->pos_min += trie->minlen;
6413     	        data->pos_delta += (trie->maxlen - trie->minlen);
6414 		if (trie->maxlen != trie->minlen)
6415 		    data->cur_is_floating = 1; /* float */
6416     	    }
6417     	    if (trie->jump) /* no more substrings -- for now /grr*/
6418                flags &= ~SCF_DO_SUBSTR;
6419 	}
6420 #endif /* old or new */
6421 #endif /* TRIE_STUDY_OPT */
6422 
6423 	/* Else: zero-length, ignore. */
6424 	scan = regnext(scan);
6425     }
6426 
6427   finish:
6428     if (frame) {
6429         /* we need to unwind recursion. */
6430         depth = depth - 1;
6431 
6432         DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6433         DEBUG_PEEP("fend", scan, depth, flags);
6434 
6435         /* restore previous context */
6436         last = frame->last_regnode;
6437         scan = frame->next_regnode;
6438         stopparen = frame->stopparen;
6439         recursed_depth = frame->prev_recursed_depth;
6440 
6441         RExC_frame_last = frame->prev_frame;
6442         frame = frame->this_prev_frame;
6443         goto fake_study_recurse;
6444     }
6445 
6446     assert(!frame);
6447     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6448 
6449     *scanp = scan;
6450     *deltap = is_inf_internal ? SSize_t_MAX : delta;
6451 
6452     if (flags & SCF_DO_SUBSTR && is_inf)
6453 	data->pos_delta = SSize_t_MAX - data->pos_min;
6454     if (is_par > (I32)U8_MAX)
6455 	is_par = 0;
6456     if (is_par && pars==1 && data) {
6457 	data->flags |= SF_IN_PAR;
6458 	data->flags &= ~SF_HAS_PAR;
6459     }
6460     else if (pars && data) {
6461 	data->flags |= SF_HAS_PAR;
6462 	data->flags &= ~SF_IN_PAR;
6463     }
6464     if (flags & SCF_DO_STCLASS_OR)
6465 	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6466     if (flags & SCF_TRIE_RESTUDY)
6467         data->flags |= 	SCF_TRIE_RESTUDY;
6468 
6469     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6470 
6471     {
6472         SSize_t final_minlen= min < stopmin ? min : stopmin;
6473 
6474         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6475             if (final_minlen > SSize_t_MAX - delta)
6476                 RExC_maxlen = SSize_t_MAX;
6477             else if (RExC_maxlen < final_minlen + delta)
6478                 RExC_maxlen = final_minlen + delta;
6479         }
6480         return final_minlen;
6481     }
6482     NOT_REACHED; /* NOTREACHED */
6483 }
6484 
6485 STATIC U32
S_add_data(RExC_state_t * const pRExC_state,const char * const s,const U32 n)6486 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6487 {
6488     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6489 
6490     PERL_ARGS_ASSERT_ADD_DATA;
6491 
6492     Renewc(RExC_rxi->data,
6493 	   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6494 	   char, struct reg_data);
6495     if(count)
6496 	Renew(RExC_rxi->data->what, count + n, U8);
6497     else
6498 	Newx(RExC_rxi->data->what, n, U8);
6499     RExC_rxi->data->count = count + n;
6500     Copy(s, RExC_rxi->data->what + count, n, U8);
6501     return count;
6502 }
6503 
6504 /*XXX: todo make this not included in a non debugging perl, but appears to be
6505  * used anyway there, in 'use re' */
6506 #ifndef PERL_IN_XSUB_RE
6507 void
Perl_reginitcolors(pTHX)6508 Perl_reginitcolors(pTHX)
6509 {
6510     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6511     if (s) {
6512 	char *t = savepv(s);
6513 	int i = 0;
6514 	PL_colors[0] = t;
6515 	while (++i < 6) {
6516 	    t = strchr(t, '\t');
6517 	    if (t) {
6518 		*t = '\0';
6519 		PL_colors[i] = ++t;
6520 	    }
6521 	    else
6522 		PL_colors[i] = t = (char *)"";
6523 	}
6524     } else {
6525 	int i = 0;
6526 	while (i < 6)
6527 	    PL_colors[i++] = (char *)"";
6528     }
6529     PL_colorset = 1;
6530 }
6531 #endif
6532 
6533 
6534 #ifdef TRIE_STUDY_OPT
6535 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6536     STMT_START {                                            \
6537         if (                                                \
6538               (data.flags & SCF_TRIE_RESTUDY)               \
6539               && ! restudied++                              \
6540         ) {                                                 \
6541             dOsomething;                                    \
6542             goto reStudy;                                   \
6543         }                                                   \
6544     } STMT_END
6545 #else
6546 #define CHECK_RESTUDY_GOTO_butfirst
6547 #endif
6548 
6549 /*
6550  * pregcomp - compile a regular expression into internal code
6551  *
6552  * Decides which engine's compiler to call based on the hint currently in
6553  * scope
6554  */
6555 
6556 #ifndef PERL_IN_XSUB_RE
6557 
6558 /* return the currently in-scope regex engine (or the default if none)  */
6559 
6560 regexp_engine const *
Perl_current_re_engine(pTHX)6561 Perl_current_re_engine(pTHX)
6562 {
6563     if (IN_PERL_COMPILETIME) {
6564 	HV * const table = GvHV(PL_hintgv);
6565 	SV **ptr;
6566 
6567 	if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6568 	    return &PL_core_reg_engine;
6569 	ptr = hv_fetchs(table, "regcomp", FALSE);
6570 	if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6571 	    return &PL_core_reg_engine;
6572 	return INT2PTR(regexp_engine*, SvIV(*ptr));
6573     }
6574     else {
6575 	SV *ptr;
6576 	if (!PL_curcop->cop_hints_hash)
6577 	    return &PL_core_reg_engine;
6578 	ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6579 	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6580 	    return &PL_core_reg_engine;
6581 	return INT2PTR(regexp_engine*, SvIV(ptr));
6582     }
6583 }
6584 
6585 
6586 REGEXP *
Perl_pregcomp(pTHX_ SV * const pattern,const U32 flags)6587 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6588 {
6589     regexp_engine const *eng = current_re_engine();
6590     GET_RE_DEBUG_FLAGS_DECL;
6591 
6592     PERL_ARGS_ASSERT_PREGCOMP;
6593 
6594     /* Dispatch a request to compile a regexp to correct regexp engine. */
6595     DEBUG_COMPILE_r({
6596         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6597 			PTR2UV(eng));
6598     });
6599     return CALLREGCOMP_ENG(eng, pattern, flags);
6600 }
6601 #endif
6602 
6603 /* public(ish) entry point for the perl core's own regex compiling code.
6604  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6605  * pattern rather than a list of OPs, and uses the internal engine rather
6606  * than the current one */
6607 
6608 REGEXP *
Perl_re_compile(pTHX_ SV * const pattern,U32 rx_flags)6609 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6610 {
6611     SV *pat = pattern; /* defeat constness! */
6612     PERL_ARGS_ASSERT_RE_COMPILE;
6613     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6614 #ifdef PERL_IN_XSUB_RE
6615                                 &my_reg_engine,
6616 #else
6617                                 &PL_core_reg_engine,
6618 #endif
6619                                 NULL, NULL, rx_flags, 0);
6620 }
6621 
6622 
6623 static void
S_free_codeblocks(pTHX_ struct reg_code_blocks * cbs)6624 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6625 {
6626     int n;
6627 
6628     if (--cbs->refcnt > 0)
6629         return;
6630     for (n = 0; n < cbs->count; n++) {
6631         REGEXP *rx = cbs->cb[n].src_regex;
6632         if (rx) {
6633             cbs->cb[n].src_regex = NULL;
6634             SvREFCNT_dec_NN(rx);
6635         }
6636     }
6637     Safefree(cbs->cb);
6638     Safefree(cbs);
6639 }
6640 
6641 
6642 static struct reg_code_blocks *
S_alloc_code_blocks(pTHX_ int ncode)6643 S_alloc_code_blocks(pTHX_  int ncode)
6644 {
6645      struct reg_code_blocks *cbs;
6646     Newx(cbs, 1, struct reg_code_blocks);
6647     cbs->count = ncode;
6648     cbs->refcnt = 1;
6649     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6650     if (ncode)
6651         Newx(cbs->cb, ncode, struct reg_code_block);
6652     else
6653         cbs->cb = NULL;
6654     return cbs;
6655 }
6656 
6657 
6658 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6659  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6660  * point to the realloced string and length.
6661  *
6662  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6663  * stuff added */
6664 
6665 static void
S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,char ** pat_p,STRLEN * plen_p,int num_code_blocks)6666 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6667 		    char **pat_p, STRLEN *plen_p, int num_code_blocks)
6668 {
6669     U8 *const src = (U8*)*pat_p;
6670     U8 *dst, *d;
6671     int n=0;
6672     STRLEN s = 0;
6673     bool do_end = 0;
6674     GET_RE_DEBUG_FLAGS_DECL;
6675 
6676     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6677         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6678 
6679     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6680     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6681     d = dst;
6682 
6683     while (s < *plen_p) {
6684         append_utf8_from_native_byte(src[s], &d);
6685 
6686         if (n < num_code_blocks) {
6687             assert(pRExC_state->code_blocks);
6688             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6689                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6690                 assert(*(d - 1) == '(');
6691                 do_end = 1;
6692             }
6693             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6694                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6695                 assert(*(d - 1) == ')');
6696                 do_end = 0;
6697                 n++;
6698             }
6699         }
6700         s++;
6701     }
6702     *d = '\0';
6703     *plen_p = d - dst;
6704     *pat_p = (char*) dst;
6705     SAVEFREEPV(*pat_p);
6706     RExC_orig_utf8 = RExC_utf8 = 1;
6707 }
6708 
6709 
6710 
6711 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6712  * while recording any code block indices, and handling overloading,
6713  * nested qr// objects etc.  If pat is null, it will allocate a new
6714  * string, or just return the first arg, if there's only one.
6715  *
6716  * Returns the malloced/updated pat.
6717  * patternp and pat_count is the array of SVs to be concatted;
6718  * oplist is the optional list of ops that generated the SVs;
6719  * recompile_p is a pointer to a boolean that will be set if
6720  *   the regex will need to be recompiled.
6721  * delim, if non-null is an SV that will be inserted between each element
6722  */
6723 
6724 static SV*
S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,SV * pat,SV ** const patternp,int pat_count,OP * oplist,bool * recompile_p,SV * delim)6725 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6726                 SV *pat, SV ** const patternp, int pat_count,
6727                 OP *oplist, bool *recompile_p, SV *delim)
6728 {
6729     SV **svp;
6730     int n = 0;
6731     bool use_delim = FALSE;
6732     bool alloced = FALSE;
6733 
6734     /* if we know we have at least two args, create an empty string,
6735      * then concatenate args to that. For no args, return an empty string */
6736     if (!pat && pat_count != 1) {
6737         pat = newSVpvs("");
6738         SAVEFREESV(pat);
6739         alloced = TRUE;
6740     }
6741 
6742     for (svp = patternp; svp < patternp + pat_count; svp++) {
6743         SV *sv;
6744         SV *rx  = NULL;
6745         STRLEN orig_patlen = 0;
6746         bool code = 0;
6747         SV *msv = use_delim ? delim : *svp;
6748         if (!msv) msv = &PL_sv_undef;
6749 
6750         /* if we've got a delimiter, we go round the loop twice for each
6751          * svp slot (except the last), using the delimiter the second
6752          * time round */
6753         if (use_delim) {
6754             svp--;
6755             use_delim = FALSE;
6756         }
6757         else if (delim)
6758             use_delim = TRUE;
6759 
6760         if (SvTYPE(msv) == SVt_PVAV) {
6761             /* we've encountered an interpolated array within
6762              * the pattern, e.g. /...@a..../. Expand the list of elements,
6763              * then recursively append elements.
6764              * The code in this block is based on S_pushav() */
6765 
6766             AV *const av = (AV*)msv;
6767             const SSize_t maxarg = AvFILL(av) + 1;
6768             SV **array;
6769 
6770             if (oplist) {
6771                 assert(oplist->op_type == OP_PADAV
6772                     || oplist->op_type == OP_RV2AV);
6773                 oplist = OpSIBLING(oplist);
6774             }
6775 
6776             if (SvRMAGICAL(av)) {
6777                 SSize_t i;
6778 
6779                 Newx(array, maxarg, SV*);
6780                 SAVEFREEPV(array);
6781                 for (i=0; i < maxarg; i++) {
6782                     SV ** const svp = av_fetch(av, i, FALSE);
6783                     array[i] = svp ? *svp : &PL_sv_undef;
6784                 }
6785             }
6786             else
6787                 array = AvARRAY(av);
6788 
6789             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6790                                 array, maxarg, NULL, recompile_p,
6791                                 /* $" */
6792                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6793 
6794             continue;
6795         }
6796 
6797 
6798         /* we make the assumption here that each op in the list of
6799          * op_siblings maps to one SV pushed onto the stack,
6800          * except for code blocks, with have both an OP_NULL and
6801          * and OP_CONST.
6802          * This allows us to match up the list of SVs against the
6803          * list of OPs to find the next code block.
6804          *
6805          * Note that       PUSHMARK PADSV PADSV ..
6806          * is optimised to
6807          *                 PADRANGE PADSV  PADSV  ..
6808          * so the alignment still works. */
6809 
6810         if (oplist) {
6811             if (oplist->op_type == OP_NULL
6812                 && (oplist->op_flags & OPf_SPECIAL))
6813             {
6814                 assert(n < pRExC_state->code_blocks->count);
6815                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6816                 pRExC_state->code_blocks->cb[n].block = oplist;
6817                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6818                 n++;
6819                 code = 1;
6820                 oplist = OpSIBLING(oplist); /* skip CONST */
6821                 assert(oplist);
6822             }
6823             oplist = OpSIBLING(oplist);;
6824         }
6825 
6826 	/* apply magic and QR overloading to arg */
6827 
6828         SvGETMAGIC(msv);
6829         if (SvROK(msv) && SvAMAGIC(msv)) {
6830             SV *sv = AMG_CALLunary(msv, regexp_amg);
6831             if (sv) {
6832                 if (SvROK(sv))
6833                     sv = SvRV(sv);
6834                 if (SvTYPE(sv) != SVt_REGEXP)
6835                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6836                 msv = sv;
6837             }
6838         }
6839 
6840         /* try concatenation overload ... */
6841         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6842                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6843         {
6844             sv_setsv(pat, sv);
6845             /* overloading involved: all bets are off over literal
6846              * code. Pretend we haven't seen it */
6847             if (n)
6848                 pRExC_state->code_blocks->count -= n;
6849             n = 0;
6850         }
6851         else  {
6852             /* ... or failing that, try "" overload */
6853             while (SvAMAGIC(msv)
6854                     && (sv = AMG_CALLunary(msv, string_amg))
6855                     && sv != msv
6856                     &&  !(   SvROK(msv)
6857                           && SvROK(sv)
6858                           && SvRV(msv) == SvRV(sv))
6859             ) {
6860                 msv = sv;
6861                 SvGETMAGIC(msv);
6862             }
6863             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6864                 msv = SvRV(msv);
6865 
6866             if (pat) {
6867                 /* this is a partially unrolled
6868                  *     sv_catsv_nomg(pat, msv);
6869                  * that allows us to adjust code block indices if
6870                  * needed */
6871                 STRLEN dlen;
6872                 char *dst = SvPV_force_nomg(pat, dlen);
6873                 orig_patlen = dlen;
6874                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6875                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6876                     sv_setpvn(pat, dst, dlen);
6877                     SvUTF8_on(pat);
6878                 }
6879                 sv_catsv_nomg(pat, msv);
6880                 rx = msv;
6881             }
6882             else {
6883                 /* We have only one SV to process, but we need to verify
6884                  * it is properly null terminated or we will fail asserts
6885                  * later. In theory we probably shouldn't get such SV's,
6886                  * but if we do we should handle it gracefully. */
6887                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6888                     /* not a string, or a string with a trailing null */
6889                     pat = msv;
6890                 } else {
6891                     /* a string with no trailing null, we need to copy it
6892                      * so it has a trailing null */
6893                     pat = sv_2mortal(newSVsv(msv));
6894                 }
6895             }
6896 
6897             if (code)
6898                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6899         }
6900 
6901         /* extract any code blocks within any embedded qr//'s */
6902         if (rx && SvTYPE(rx) == SVt_REGEXP
6903             && RX_ENGINE((REGEXP*)rx)->op_comp)
6904         {
6905 
6906             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6907             if (ri->code_blocks && ri->code_blocks->count) {
6908                 int i;
6909                 /* the presence of an embedded qr// with code means
6910                  * we should always recompile: the text of the
6911                  * qr// may not have changed, but it may be a
6912                  * different closure than last time */
6913                 *recompile_p = 1;
6914                 if (pRExC_state->code_blocks) {
6915                     int new_count = pRExC_state->code_blocks->count
6916                             + ri->code_blocks->count;
6917                     Renew(pRExC_state->code_blocks->cb,
6918                             new_count, struct reg_code_block);
6919                     pRExC_state->code_blocks->count = new_count;
6920                 }
6921                 else
6922                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6923                                                     ri->code_blocks->count);
6924 
6925                 for (i=0; i < ri->code_blocks->count; i++) {
6926                     struct reg_code_block *src, *dst;
6927                     STRLEN offset =  orig_patlen
6928                         + ReANY((REGEXP *)rx)->pre_prefix;
6929                     assert(n < pRExC_state->code_blocks->count);
6930                     src = &ri->code_blocks->cb[i];
6931                     dst = &pRExC_state->code_blocks->cb[n];
6932                     dst->start	    = src->start + offset;
6933                     dst->end	    = src->end   + offset;
6934                     dst->block	    = src->block;
6935                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6936                                             src->src_regex
6937                                                 ? src->src_regex
6938                                                 : (REGEXP*)rx);
6939                     n++;
6940                 }
6941             }
6942         }
6943     }
6944     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6945     if (alloced)
6946         SvSETMAGIC(pat);
6947 
6948     return pat;
6949 }
6950 
6951 
6952 
6953 /* see if there are any run-time code blocks in the pattern.
6954  * False positives are allowed */
6955 
6956 static bool
S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)6957 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6958 		    char *pat, STRLEN plen)
6959 {
6960     int n = 0;
6961     STRLEN s;
6962 
6963     PERL_UNUSED_CONTEXT;
6964 
6965     for (s = 0; s < plen; s++) {
6966 	if (   pRExC_state->code_blocks
6967             && n < pRExC_state->code_blocks->count
6968 	    && s == pRExC_state->code_blocks->cb[n].start)
6969 	{
6970 	    s = pRExC_state->code_blocks->cb[n].end;
6971 	    n++;
6972 	    continue;
6973 	}
6974 	/* TODO ideally should handle [..], (#..), /#.../x to reduce false
6975 	 * positives here */
6976 	if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6977 	    (pat[s+2] == '{'
6978                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6979 	)
6980 	    return 1;
6981     }
6982     return 0;
6983 }
6984 
6985 /* Handle run-time code blocks. We will already have compiled any direct
6986  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6987  * copy of it, but with any literal code blocks blanked out and
6988  * appropriate chars escaped; then feed it into
6989  *
6990  *    eval "qr'modified_pattern'"
6991  *
6992  * For example,
6993  *
6994  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6995  *
6996  * becomes
6997  *
6998  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6999  *
7000  * After eval_sv()-ing that, grab any new code blocks from the returned qr
7001  * and merge them with any code blocks of the original regexp.
7002  *
7003  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7004  * instead, just save the qr and return FALSE; this tells our caller that
7005  * the original pattern needs upgrading to utf8.
7006  */
7007 
7008 static bool
S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,char * pat,STRLEN plen)7009 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7010     char *pat, STRLEN plen)
7011 {
7012     SV *qr;
7013 
7014     GET_RE_DEBUG_FLAGS_DECL;
7015 
7016     if (pRExC_state->runtime_code_qr) {
7017 	/* this is the second time we've been called; this should
7018 	 * only happen if the main pattern got upgraded to utf8
7019 	 * during compilation; re-use the qr we compiled first time
7020 	 * round (which should be utf8 too)
7021 	 */
7022 	qr = pRExC_state->runtime_code_qr;
7023 	pRExC_state->runtime_code_qr = NULL;
7024 	assert(RExC_utf8 && SvUTF8(qr));
7025     }
7026     else {
7027 	int n = 0;
7028 	STRLEN s;
7029 	char *p, *newpat;
7030 	int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7031 	SV *sv, *qr_ref;
7032 	dSP;
7033 
7034 	/* determine how many extra chars we need for ' and \ escaping */
7035 	for (s = 0; s < plen; s++) {
7036 	    if (pat[s] == '\'' || pat[s] == '\\')
7037 		newlen++;
7038 	}
7039 
7040 	Newx(newpat, newlen, char);
7041 	p = newpat;
7042 	*p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7043 
7044 	for (s = 0; s < plen; s++) {
7045 	    if (   pRExC_state->code_blocks
7046 	        && n < pRExC_state->code_blocks->count
7047 		&& s == pRExC_state->code_blocks->cb[n].start)
7048 	    {
7049 		/* blank out literal code block so that they aren't
7050                  * recompiled: eg change from/to:
7051                  *     /(?{xyz})/
7052                  *     /(?=====)/
7053                  * and
7054                  *     /(??{xyz})/
7055                  *     /(?======)/
7056                  * and
7057                  *     /(?(?{xyz}))/
7058                  *     /(?(?=====))/
7059                 */
7060 		assert(pat[s]   == '(');
7061 		assert(pat[s+1] == '?');
7062                 *p++ = '(';
7063                 *p++ = '?';
7064                 s += 2;
7065 		while (s < pRExC_state->code_blocks->cb[n].end) {
7066 		    *p++ = '=';
7067 		    s++;
7068 		}
7069                 *p++ = ')';
7070 		n++;
7071 		continue;
7072 	    }
7073 	    if (pat[s] == '\'' || pat[s] == '\\')
7074 		*p++ = '\\';
7075 	    *p++ = pat[s];
7076 	}
7077 	*p++ = '\'';
7078 	if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7079 	    *p++ = 'x';
7080             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7081                 *p++ = 'x';
7082             }
7083         }
7084 	*p++ = '\0';
7085 	DEBUG_COMPILE_r({
7086             Perl_re_printf( aTHX_
7087 		"%sre-parsing pattern for runtime code:%s %s\n",
7088 		PL_colors[4], PL_colors[5], newpat);
7089 	});
7090 
7091 	sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7092 	Safefree(newpat);
7093 
7094 	ENTER;
7095 	SAVETMPS;
7096 	save_re_context();
7097 	PUSHSTACKi(PERLSI_REQUIRE);
7098         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7099          * parsing qr''; normally only q'' does this. It also alters
7100          * hints handling */
7101 	eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7102 	SvREFCNT_dec_NN(sv);
7103 	SPAGAIN;
7104 	qr_ref = POPs;
7105 	PUTBACK;
7106 	{
7107 	    SV * const errsv = ERRSV;
7108 	    if (SvTRUE_NN(errsv))
7109                 /* use croak_sv ? */
7110 		Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7111 	}
7112 	assert(SvROK(qr_ref));
7113 	qr = SvRV(qr_ref);
7114 	assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7115 	/* the leaving below frees the tmp qr_ref.
7116 	 * Give qr a life of its own */
7117 	SvREFCNT_inc(qr);
7118 	POPSTACK;
7119 	FREETMPS;
7120 	LEAVE;
7121 
7122     }
7123 
7124     if (!RExC_utf8 && SvUTF8(qr)) {
7125 	/* first time through; the pattern got upgraded; save the
7126 	 * qr for the next time through */
7127 	assert(!pRExC_state->runtime_code_qr);
7128 	pRExC_state->runtime_code_qr = qr;
7129 	return 0;
7130     }
7131 
7132 
7133     /* extract any code blocks within the returned qr//  */
7134 
7135 
7136     /* merge the main (r1) and run-time (r2) code blocks into one */
7137     {
7138 	RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7139 	struct reg_code_block *new_block, *dst;
7140 	RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7141 	int i1 = 0, i2 = 0;
7142         int r1c, r2c;
7143 
7144 	if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7145 	{
7146 	    SvREFCNT_dec_NN(qr);
7147 	    return 1;
7148 	}
7149 
7150         if (!r1->code_blocks)
7151             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7152 
7153         r1c = r1->code_blocks->count;
7154         r2c = r2->code_blocks->count;
7155 
7156 	Newx(new_block, r1c + r2c, struct reg_code_block);
7157 
7158 	dst = new_block;
7159 
7160 	while (i1 < r1c || i2 < r2c) {
7161 	    struct reg_code_block *src;
7162 	    bool is_qr = 0;
7163 
7164 	    if (i1 == r1c) {
7165 		src = &r2->code_blocks->cb[i2++];
7166 		is_qr = 1;
7167 	    }
7168 	    else if (i2 == r2c)
7169 		src = &r1->code_blocks->cb[i1++];
7170 	    else if (  r1->code_blocks->cb[i1].start
7171 	             < r2->code_blocks->cb[i2].start)
7172 	    {
7173 		src = &r1->code_blocks->cb[i1++];
7174 		assert(src->end < r2->code_blocks->cb[i2].start);
7175 	    }
7176 	    else {
7177 		assert(  r1->code_blocks->cb[i1].start
7178 		       > r2->code_blocks->cb[i2].start);
7179 		src = &r2->code_blocks->cb[i2++];
7180 		is_qr = 1;
7181 		assert(src->end < r1->code_blocks->cb[i1].start);
7182 	    }
7183 
7184 	    assert(pat[src->start] == '(');
7185 	    assert(pat[src->end]   == ')');
7186 	    dst->start	    = src->start;
7187 	    dst->end	    = src->end;
7188 	    dst->block	    = src->block;
7189 	    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7190 				    : src->src_regex;
7191 	    dst++;
7192 	}
7193 	r1->code_blocks->count += r2c;
7194 	Safefree(r1->code_blocks->cb);
7195 	r1->code_blocks->cb = new_block;
7196     }
7197 
7198     SvREFCNT_dec_NN(qr);
7199     return 1;
7200 }
7201 
7202 
7203 STATIC bool
S_setup_longest(pTHX_ RExC_state_t * pRExC_state,struct reg_substr_datum * rsd,struct scan_data_substrs * sub,STRLEN longest_length)7204 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7205                       struct reg_substr_datum  *rsd,
7206                       struct scan_data_substrs *sub,
7207                       STRLEN longest_length)
7208 {
7209     /* This is the common code for setting up the floating and fixed length
7210      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
7211      * as to whether succeeded or not */
7212 
7213     I32 t;
7214     SSize_t ml;
7215     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
7216     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7217 
7218     if (! (longest_length
7219            || (eol /* Can't have SEOL and MULTI */
7220                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7221           )
7222             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7223         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7224     {
7225         return FALSE;
7226     }
7227 
7228     /* copy the information about the longest from the reg_scan_data
7229         over to the program. */
7230     if (SvUTF8(sub->str)) {
7231         rsd->substr      = NULL;
7232         rsd->utf8_substr = sub->str;
7233     } else {
7234         rsd->substr      = sub->str;
7235         rsd->utf8_substr = NULL;
7236     }
7237     /* end_shift is how many chars that must be matched that
7238         follow this item. We calculate it ahead of time as once the
7239         lookbehind offset is added in we lose the ability to correctly
7240         calculate it.*/
7241     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7242     rsd->end_shift = ml - sub->min_offset
7243         - longest_length
7244             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7245              * intead? - DAPM
7246             + (SvTAIL(sub->str) != 0)
7247             */
7248         + sub->lookbehind;
7249 
7250     t = (eol/* Can't have SEOL and MULTI */
7251          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7252     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7253 
7254     return TRUE;
7255 }
7256 
7257 STATIC void
S_set_regex_pv(pTHX_ RExC_state_t * pRExC_state,REGEXP * Rx)7258 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7259 {
7260     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7261      * properly wrapped with the right modifiers */
7262 
7263     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7264     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7265                                                 != REGEX_DEPENDS_CHARSET);
7266 
7267     /* The caret is output if there are any defaults: if not all the STD
7268         * flags are set, or if no character set specifier is needed */
7269     bool has_default =
7270                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7271                 || ! has_charset);
7272     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7273                                                 == REG_RUN_ON_COMMENT_SEEN);
7274     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7275                         >> RXf_PMf_STD_PMMOD_SHIFT);
7276     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7277     char *p;
7278     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7279 
7280     /* We output all the necessary flags; we never output a minus, as all
7281         * those are defaults, so are
7282         * covered by the caret */
7283     const STRLEN wraplen = pat_len + has_p + has_runon
7284         + has_default       /* If needs a caret */
7285         + PL_bitcount[reganch] /* 1 char for each set standard flag */
7286 
7287             /* If needs a character set specifier */
7288         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7289         + (sizeof("(?:)") - 1);
7290 
7291     PERL_ARGS_ASSERT_SET_REGEX_PV;
7292 
7293     /* make sure PL_bitcount bounds not exceeded */
7294     assert(sizeof(STD_PAT_MODS) <= 8);
7295 
7296     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7297     SvPOK_on(Rx);
7298     if (RExC_utf8)
7299         SvFLAGS(Rx) |= SVf_UTF8;
7300     *p++='('; *p++='?';
7301 
7302     /* If a default, cover it using the caret */
7303     if (has_default) {
7304         *p++= DEFAULT_PAT_MOD;
7305     }
7306     if (has_charset) {
7307         STRLEN len;
7308         const char* name;
7309 
7310         name = get_regex_charset_name(RExC_rx->extflags, &len);
7311         if strEQ(name, DEPENDS_PAT_MODS) {  /* /d under UTF-8 => /u */
7312             assert(RExC_utf8);
7313             name = UNICODE_PAT_MODS;
7314             len = sizeof(UNICODE_PAT_MODS) - 1;
7315         }
7316         Copy(name, p, len, char);
7317         p += len;
7318     }
7319     if (has_p)
7320         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7321     {
7322         char ch;
7323         while((ch = *fptr++)) {
7324             if(reganch & 1)
7325                 *p++ = ch;
7326             reganch >>= 1;
7327         }
7328     }
7329 
7330     *p++ = ':';
7331     Copy(RExC_precomp, p, pat_len, char);
7332     assert ((RX_WRAPPED(Rx) - p) < 16);
7333     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7334     p += pat_len;
7335 
7336     /* Adding a trailing \n causes this to compile properly:
7337             my $R = qr / A B C # D E/x; /($R)/
7338         Otherwise the parens are considered part of the comment */
7339     if (has_runon)
7340         *p++ = '\n';
7341     *p++ = ')';
7342     *p = 0;
7343     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7344 }
7345 
7346 /*
7347  * Perl_re_op_compile - the perl internal RE engine's function to compile a
7348  * regular expression into internal code.
7349  * The pattern may be passed either as:
7350  *    a list of SVs (patternp plus pat_count)
7351  *    a list of OPs (expr)
7352  * If both are passed, the SV list is used, but the OP list indicates
7353  * which SVs are actually pre-compiled code blocks
7354  *
7355  * The SVs in the list have magic and qr overloading applied to them (and
7356  * the list may be modified in-place with replacement SVs in the latter
7357  * case).
7358  *
7359  * If the pattern hasn't changed from old_re, then old_re will be
7360  * returned.
7361  *
7362  * eng is the current engine. If that engine has an op_comp method, then
7363  * handle directly (i.e. we assume that op_comp was us); otherwise, just
7364  * do the initial concatenation of arguments and pass on to the external
7365  * engine.
7366  *
7367  * If is_bare_re is not null, set it to a boolean indicating whether the
7368  * arg list reduced (after overloading) to a single bare regex which has
7369  * been returned (i.e. /$qr/).
7370  *
7371  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7372  *
7373  * pm_flags contains the PMf_* flags, typically based on those from the
7374  * pm_flags field of the related PMOP. Currently we're only interested in
7375  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
7376  *
7377  * For many years this code had an initial sizing pass that calculated
7378  * (sometimes incorrectly, leading to security holes) the size needed for the
7379  * compiled pattern.  That was changed by commit
7380  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7381  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
7382  * references to this sizing pass.
7383  *
7384  * Now, an initial crude guess as to the size needed is made, based on the
7385  * length of the pattern.  Patches welcome to improve that guess.  That amount
7386  * of space is malloc'd and then immediately freed, and then clawed back node
7387  * by node.  This design is to minimze, to the extent possible, memory churn
7388  * when doing the the reallocs.
7389  *
7390  * A separate parentheses counting pass may be needed in some cases.
7391  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
7392  * of these cases.
7393  *
7394  * The existence of a sizing pass necessitated design decisions that are no
7395  * longer needed.  There are potential areas of simplification.
7396  *
7397  * Beware that the optimization-preparation code in here knows about some
7398  * of the structure of the compiled regexp.  [I'll say.]
7399  */
7400 
7401 REGEXP *
Perl_re_op_compile(pTHX_ SV ** const patternp,int pat_count,OP * expr,const regexp_engine * eng,REGEXP * old_re,bool * is_bare_re,const U32 orig_rx_flags,const U32 pm_flags)7402 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7403 		    OP *expr, const regexp_engine* eng, REGEXP *old_re,
7404 		     bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7405 {
7406     dVAR;
7407     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
7408     STRLEN plen;
7409     char *exp;
7410     regnode *scan;
7411     I32 flags;
7412     SSize_t minlen = 0;
7413     U32 rx_flags;
7414     SV *pat;
7415     SV** new_patternp = patternp;
7416 
7417     /* these are all flags - maybe they should be turned
7418      * into a single int with different bit masks */
7419     I32 sawlookahead = 0;
7420     I32 sawplus = 0;
7421     I32 sawopen = 0;
7422     I32 sawminmod = 0;
7423 
7424     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7425     bool recompile = 0;
7426     bool runtime_code = 0;
7427     scan_data_t data;
7428     RExC_state_t RExC_state;
7429     RExC_state_t * const pRExC_state = &RExC_state;
7430 #ifdef TRIE_STUDY_OPT
7431     int restudied = 0;
7432     RExC_state_t copyRExC_state;
7433 #endif
7434     GET_RE_DEBUG_FLAGS_DECL;
7435 
7436     PERL_ARGS_ASSERT_RE_OP_COMPILE;
7437 
7438     DEBUG_r(if (!PL_colorset) reginitcolors());
7439 
7440     /* Initialize these here instead of as-needed, as is quick and avoids
7441      * having to test them each time otherwise */
7442     if (! PL_InBitmap) {
7443 #ifdef DEBUGGING
7444         char * dump_len_string;
7445 #endif
7446 
7447         /* This is calculated here, because the Perl program that generates the
7448          * static global ones doesn't currently have access to
7449          * NUM_ANYOF_CODE_POINTS */
7450 	PL_InBitmap = _new_invlist(2);
7451 	PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
7452                                                     NUM_ANYOF_CODE_POINTS - 1);
7453 #ifdef DEBUGGING
7454         dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
7455         if (   ! dump_len_string
7456             || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
7457         {
7458             PL_dump_re_max_len = 60;    /* A reasonable default */
7459         }
7460 #endif
7461     }
7462 
7463     pRExC_state->warn_text = NULL;
7464     pRExC_state->unlexed_names = NULL;
7465     pRExC_state->code_blocks = NULL;
7466 
7467     if (is_bare_re)
7468 	*is_bare_re = FALSE;
7469 
7470     if (expr && (expr->op_type == OP_LIST ||
7471 		(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7472 	/* allocate code_blocks if needed */
7473 	OP *o;
7474 	int ncode = 0;
7475 
7476 	for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7477 	    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7478 		ncode++; /* count of DO blocks */
7479 
7480 	if (ncode)
7481             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7482     }
7483 
7484     if (!pat_count) {
7485         /* compile-time pattern with just OP_CONSTs and DO blocks */
7486 
7487         int n;
7488         OP *o;
7489 
7490         /* find how many CONSTs there are */
7491         assert(expr);
7492         n = 0;
7493         if (expr->op_type == OP_CONST)
7494             n = 1;
7495         else
7496             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7497                 if (o->op_type == OP_CONST)
7498                     n++;
7499             }
7500 
7501         /* fake up an SV array */
7502 
7503         assert(!new_patternp);
7504         Newx(new_patternp, n, SV*);
7505         SAVEFREEPV(new_patternp);
7506         pat_count = n;
7507 
7508         n = 0;
7509         if (expr->op_type == OP_CONST)
7510             new_patternp[n] = cSVOPx_sv(expr);
7511         else
7512             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7513                 if (o->op_type == OP_CONST)
7514                     new_patternp[n++] = cSVOPo_sv;
7515             }
7516 
7517     }
7518 
7519     DEBUG_PARSE_r(Perl_re_printf( aTHX_
7520         "Assembling pattern from %d elements%s\n", pat_count,
7521             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7522 
7523     /* set expr to the first arg op */
7524 
7525     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7526          && expr->op_type != OP_CONST)
7527     {
7528             expr = cLISTOPx(expr)->op_first;
7529             assert(   expr->op_type == OP_PUSHMARK
7530                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7531                    || expr->op_type == OP_PADRANGE);
7532             expr = OpSIBLING(expr);
7533     }
7534 
7535     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7536                         expr, &recompile, NULL);
7537 
7538     /* handle bare (possibly after overloading) regex: foo =~ $re */
7539     {
7540         SV *re = pat;
7541         if (SvROK(re))
7542             re = SvRV(re);
7543         if (SvTYPE(re) == SVt_REGEXP) {
7544             if (is_bare_re)
7545                 *is_bare_re = TRUE;
7546             SvREFCNT_inc(re);
7547             DEBUG_PARSE_r(Perl_re_printf( aTHX_
7548                 "Precompiled pattern%s\n",
7549                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7550 
7551             return (REGEXP*)re;
7552         }
7553     }
7554 
7555     exp = SvPV_nomg(pat, plen);
7556 
7557     if (!eng->op_comp) {
7558 	if ((SvUTF8(pat) && IN_BYTES)
7559 		|| SvGMAGICAL(pat) || SvAMAGIC(pat))
7560 	{
7561 	    /* make a temporary copy; either to convert to bytes,
7562 	     * or to avoid repeating get-magic / overloaded stringify */
7563 	    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7564 					(IN_BYTES ? 0 : SvUTF8(pat)));
7565 	}
7566 	return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7567     }
7568 
7569     /* ignore the utf8ness if the pattern is 0 length */
7570     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7571     RExC_uni_semantics = 0;
7572     RExC_contains_locale = 0;
7573     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7574     RExC_in_script_run = 0;
7575     RExC_study_started = 0;
7576     pRExC_state->runtime_code_qr = NULL;
7577     RExC_frame_head= NULL;
7578     RExC_frame_last= NULL;
7579     RExC_frame_count= 0;
7580     RExC_latest_warn_offset = 0;
7581     RExC_use_BRANCHJ = 0;
7582     RExC_total_parens = 0;
7583     RExC_open_parens = NULL;
7584     RExC_close_parens = NULL;
7585     RExC_paren_names = NULL;
7586     RExC_size = 0;
7587     RExC_seen_d_op = FALSE;
7588 #ifdef DEBUGGING
7589     RExC_paren_name_list = NULL;
7590 #endif
7591 
7592     DEBUG_r({
7593         RExC_mysv1= sv_newmortal();
7594         RExC_mysv2= sv_newmortal();
7595     });
7596 
7597     DEBUG_COMPILE_r({
7598             SV *dsv= sv_newmortal();
7599             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7600             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
7601                           PL_colors[4], PL_colors[5], s);
7602         });
7603 
7604     /* we jump here if we have to recompile, e.g., from upgrading the pattern
7605      * to utf8 */
7606 
7607     if ((pm_flags & PMf_USE_RE_EVAL)
7608 		/* this second condition covers the non-regex literal case,
7609 		 * i.e.  $foo =~ '(?{})'. */
7610 		|| (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7611     )
7612 	runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7613 
7614   redo_parse:
7615     /* return old regex if pattern hasn't changed */
7616     /* XXX: note in the below we have to check the flags as well as the
7617      * pattern.
7618      *
7619      * Things get a touch tricky as we have to compare the utf8 flag
7620      * independently from the compile flags.  */
7621 
7622     if (   old_re
7623         && !recompile
7624         && !!RX_UTF8(old_re) == !!RExC_utf8
7625         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7626 	&& RX_PRECOMP(old_re)
7627 	&& RX_PRELEN(old_re) == plen
7628         && memEQ(RX_PRECOMP(old_re), exp, plen)
7629 	&& !runtime_code /* with runtime code, always recompile */ )
7630     {
7631         return old_re;
7632     }
7633 
7634     /* Allocate the pattern's SV */
7635     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7636     RExC_rx = ReANY(Rx);
7637     if ( RExC_rx == NULL )
7638         FAIL("Regexp out of space");
7639 
7640     rx_flags = orig_rx_flags;
7641 
7642     if (   (UTF || RExC_uni_semantics)
7643         && initial_charset == REGEX_DEPENDS_CHARSET)
7644     {
7645 
7646 	/* Set to use unicode semantics if the pattern is in utf8 and has the
7647 	 * 'depends' charset specified, as it means unicode when utf8  */
7648 	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7649         RExC_uni_semantics = 1;
7650     }
7651 
7652     RExC_pm_flags = pm_flags;
7653 
7654     if (runtime_code) {
7655         assert(TAINTING_get || !TAINT_get);
7656 	if (TAINT_get)
7657 	    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7658 
7659 	if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7660 	    /* whoops, we have a non-utf8 pattern, whilst run-time code
7661 	     * got compiled as utf8. Try again with a utf8 pattern */
7662             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7663                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7664             goto redo_parse;
7665 	}
7666     }
7667     assert(!pRExC_state->runtime_code_qr);
7668 
7669     RExC_sawback = 0;
7670 
7671     RExC_seen = 0;
7672     RExC_maxlen = 0;
7673     RExC_in_lookbehind = 0;
7674     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7675 #ifdef EBCDIC
7676     RExC_recode_x_to_native = 0;
7677 #endif
7678     RExC_in_multi_char_class = 0;
7679 
7680     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7681     RExC_precomp_end = RExC_end = exp + plen;
7682     RExC_nestroot = 0;
7683     RExC_whilem_seen = 0;
7684     RExC_end_op = NULL;
7685     RExC_recurse = NULL;
7686     RExC_study_chunk_recursed = NULL;
7687     RExC_study_chunk_recursed_bytes= 0;
7688     RExC_recurse_count = 0;
7689     pRExC_state->code_index = 0;
7690 
7691     /* Initialize the string in the compiled pattern.  This is so that there is
7692      * something to output if necessary */
7693     set_regex_pv(pRExC_state, Rx);
7694 
7695     DEBUG_PARSE_r({
7696         Perl_re_printf( aTHX_
7697             "Starting parse and generation\n");
7698         RExC_lastnum=0;
7699         RExC_lastparse=NULL;
7700     });
7701 
7702     /* Allocate space and zero-initialize. Note, the two step process
7703        of zeroing when in debug mode, thus anything assigned has to
7704        happen after that */
7705     if (!  RExC_size) {
7706 
7707         /* On the first pass of the parse, we guess how big this will be.  Then
7708          * we grow in one operation to that amount and then give it back.  As
7709          * we go along, we re-allocate what we need.
7710          *
7711          * XXX Currently the guess is essentially that the pattern will be an
7712          * EXACT node with one byte input, one byte output.  This is crude, and
7713          * better heuristics are welcome.
7714          *
7715          * On any subsequent passes, we guess what we actually computed in the
7716          * latest earlier pass.  Such a pass probably didn't complete so is
7717          * missing stuff.  We could improve those guesses by knowing where the
7718          * parse stopped, and use the length so far plus apply the above
7719          * assumption to what's left. */
7720         RExC_size = STR_SZ(RExC_end - RExC_start);
7721     }
7722 
7723     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7724     if ( RExC_rxi == NULL )
7725         FAIL("Regexp out of space");
7726 
7727     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7728     RXi_SET( RExC_rx, RExC_rxi );
7729 
7730     /* We start from 0 (over from 0 in the case this is a reparse.  The first
7731      * node parsed will give back any excess memory we have allocated so far).
7732      * */
7733     RExC_size = 0;
7734 
7735     /* non-zero initialization begins here */
7736     RExC_rx->engine= eng;
7737     RExC_rx->extflags = rx_flags;
7738     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7739 
7740     if (pm_flags & PMf_IS_QR) {
7741 	RExC_rxi->code_blocks = pRExC_state->code_blocks;
7742         if (RExC_rxi->code_blocks) {
7743             RExC_rxi->code_blocks->refcnt++;
7744         }
7745     }
7746 
7747     RExC_rx->intflags = 0;
7748 
7749     RExC_flags = rx_flags;	/* don't let top level (?i) bleed */
7750     RExC_parse = exp;
7751 
7752     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7753      * code makes sure the final byte is an uncounted NUL.  But should this
7754      * ever not be the case, lots of things could read beyond the end of the
7755      * buffer: loops like
7756      *      while(isFOO(*RExC_parse)) RExC_parse++;
7757      *      strchr(RExC_parse, "foo");
7758      * etc.  So it is worth noting. */
7759     assert(*RExC_end == '\0');
7760 
7761     RExC_naughty = 0;
7762     RExC_npar = 1;
7763     RExC_parens_buf_size = 0;
7764     RExC_emit_start = RExC_rxi->program;
7765     pRExC_state->code_index = 0;
7766 
7767     *((char*) RExC_emit_start) = (char) REG_MAGIC;
7768     RExC_emit = 1;
7769 
7770     /* Do the parse */
7771     if (reg(pRExC_state, 0, &flags, 1)) {
7772 
7773         /* Success!, But we may need to redo the parse knowing how many parens
7774          * there actually are */
7775         if (IN_PARENS_PASS) {
7776             flags |= RESTART_PARSE;
7777         }
7778 
7779         /* We have that number in RExC_npar */
7780         RExC_total_parens = RExC_npar;
7781 
7782         /* XXX For backporting, use long jumps if there is any possibility of
7783          * overflow */
7784         if (RExC_size > U16_MAX && ! RExC_use_BRANCHJ) {
7785             RExC_use_BRANCHJ = TRUE;
7786             flags |= RESTART_PARSE;
7787         }
7788     }
7789     else if (! MUST_RESTART(flags)) {
7790 	ReREFCNT_dec(Rx);
7791         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7792     }
7793 
7794     /* Here, we either have success, or we have to redo the parse for some reason */
7795     if (MUST_RESTART(flags)) {
7796 
7797         /* It's possible to write a regexp in ascii that represents Unicode
7798         codepoints outside of the byte range, such as via \x{100}. If we
7799         detect such a sequence we have to convert the entire pattern to utf8
7800         and then recompile, as our sizing calculation will have been based
7801         on 1 byte == 1 character, but we will need to use utf8 to encode
7802         at least some part of the pattern, and therefore must convert the whole
7803         thing.
7804         -- dmq */
7805         if (flags & NEED_UTF8) {
7806 
7807             /* We have stored the offset of the final warning output so far.
7808              * That must be adjusted.  Any variant characters between the start
7809              * of the pattern and this warning count for 2 bytes in the final,
7810              * so just add them again */
7811             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7812                 RExC_latest_warn_offset +=
7813                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
7814                                                 + RExC_latest_warn_offset);
7815             }
7816             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7817             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7818             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7819         }
7820         else {
7821             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7822         }
7823 
7824         if (ALL_PARENS_COUNTED) {
7825             /* Make enough room for all the known parens, and zero it */
7826             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7827             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7828             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
7829 
7830             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7831             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7832         }
7833         else { /* Parse did not complete.  Reinitialize the parentheses
7834                   structures */
7835             RExC_total_parens = 0;
7836             if (RExC_open_parens) {
7837                 Safefree(RExC_open_parens);
7838                 RExC_open_parens = NULL;
7839             }
7840             if (RExC_close_parens) {
7841                 Safefree(RExC_close_parens);
7842                 RExC_close_parens = NULL;
7843             }
7844         }
7845 
7846         /* Clean up what we did in this parse */
7847         SvREFCNT_dec_NN(RExC_rx_sv);
7848 
7849         goto redo_parse;
7850     }
7851 
7852     /* Here, we have successfully parsed and generated the pattern's program
7853      * for the regex engine.  We are ready to finish things up and look for
7854      * optimizations. */
7855 
7856     /* Update the string to compile, with correct modifiers, etc */
7857     set_regex_pv(pRExC_state, Rx);
7858 
7859     RExC_rx->nparens = RExC_total_parens - 1;
7860 
7861     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7862     if (RExC_whilem_seen > 15)
7863         RExC_whilem_seen = 15;
7864 
7865     DEBUG_PARSE_r({
7866         Perl_re_printf( aTHX_
7867             "Required size %" IVdf " nodes\n", (IV)RExC_size);
7868         RExC_lastnum=0;
7869         RExC_lastparse=NULL;
7870     });
7871 
7872 #ifdef RE_TRACK_PATTERN_OFFSETS
7873     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7874                           "%s %" UVuf " bytes for offset annotations.\n",
7875                           RExC_offsets ? "Got" : "Couldn't get",
7876                           (UV)((RExC_offsets[0] * 2 + 1))));
7877     DEBUG_OFFSETS_r(if (RExC_offsets) {
7878         const STRLEN len = RExC_offsets[0];
7879         STRLEN i;
7880         GET_RE_DEBUG_FLAGS_DECL;
7881         Perl_re_printf( aTHX_
7882                       "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7883         for (i = 1; i <= len; i++) {
7884             if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7885                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
7886                 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7887         }
7888         Perl_re_printf( aTHX_  "\n");
7889     });
7890 
7891 #else
7892     SetProgLen(RExC_rxi,RExC_size);
7893 #endif
7894 
7895     DEBUG_OPTIMISE_r(
7896         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
7897     );
7898 
7899     /* XXXX To minimize changes to RE engine we always allocate
7900        3-units-long substrs field. */
7901     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
7902     if (RExC_recurse_count) {
7903         Newx(RExC_recurse, RExC_recurse_count, regnode *);
7904         SAVEFREEPV(RExC_recurse);
7905     }
7906 
7907     if (RExC_seen & REG_RECURSE_SEEN) {
7908         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
7909          * So its 1 if there are no parens. */
7910         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
7911                                          ((RExC_total_parens & 0x07) != 0);
7912         Newx(RExC_study_chunk_recursed,
7913              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7914         SAVEFREEPV(RExC_study_chunk_recursed);
7915     }
7916 
7917   reStudy:
7918     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7919     DEBUG_r(
7920         RExC_study_chunk_recursed_count= 0;
7921     );
7922     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
7923     if (RExC_study_chunk_recursed) {
7924         Zero(RExC_study_chunk_recursed,
7925              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7926     }
7927 
7928 
7929 #ifdef TRIE_STUDY_OPT
7930     if (!restudied) {
7931         StructCopy(&zero_scan_data, &data, scan_data_t);
7932         copyRExC_state = RExC_state;
7933     } else {
7934         U32 seen=RExC_seen;
7935         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7936 
7937         RExC_state = copyRExC_state;
7938         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7939             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7940         else
7941             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7942 	StructCopy(&zero_scan_data, &data, scan_data_t);
7943     }
7944 #else
7945     StructCopy(&zero_scan_data, &data, scan_data_t);
7946 #endif
7947 
7948     /* Dig out information for optimizations. */
7949     RExC_rx->extflags = RExC_flags; /* was pm_op */
7950     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7951 
7952     if (UTF)
7953 	SvUTF8_on(Rx);	/* Unicode in it? */
7954     RExC_rxi->regstclass = NULL;
7955     if (RExC_naughty >= TOO_NAUGHTY)	/* Probably an expensive pattern. */
7956 	RExC_rx->intflags |= PREGf_NAUGHTY;
7957     scan = RExC_rxi->program + 1;		/* First BRANCH. */
7958 
7959     /* testing for BRANCH here tells us whether there is "must appear"
7960        data in the pattern. If there is then we can use it for optimisations */
7961     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7962                                                   */
7963 	SSize_t fake;
7964 	STRLEN longest_length[2];
7965 	regnode_ssc ch_class; /* pointed to by data */
7966 	int stclass_flag;
7967 	SSize_t last_close = 0; /* pointed to by data */
7968         regnode *first= scan;
7969         regnode *first_next= regnext(first);
7970         int i;
7971 
7972 	/*
7973 	 * Skip introductions and multiplicators >= 1
7974 	 * so that we can extract the 'meat' of the pattern that must
7975 	 * match in the large if() sequence following.
7976 	 * NOTE that EXACT is NOT covered here, as it is normally
7977 	 * picked up by the optimiser separately.
7978 	 *
7979 	 * This is unfortunate as the optimiser isnt handling lookahead
7980 	 * properly currently.
7981 	 *
7982 	 */
7983 	while ((OP(first) == OPEN && (sawopen = 1)) ||
7984 	       /* An OR of *one* alternative - should not happen now. */
7985 	    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7986 	    /* for now we can't handle lookbehind IFMATCH*/
7987 	    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7988 	    (OP(first) == PLUS) ||
7989 	    (OP(first) == MINMOD) ||
7990 	       /* An {n,m} with n>0 */
7991 	    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7992 	    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7993 	{
7994 		/*
7995 		 * the only op that could be a regnode is PLUS, all the rest
7996 		 * will be regnode_1 or regnode_2.
7997 		 *
7998                  * (yves doesn't think this is true)
7999 		 */
8000 		if (OP(first) == PLUS)
8001 		    sawplus = 1;
8002                 else {
8003                     if (OP(first) == MINMOD)
8004                         sawminmod = 1;
8005 		    first += regarglen[OP(first)];
8006                 }
8007 		first = NEXTOPER(first);
8008 		first_next= regnext(first);
8009 	}
8010 
8011 	/* Starting-point info. */
8012       again:
8013         DEBUG_PEEP("first:", first, 0, 0);
8014         /* Ignore EXACT as we deal with it later. */
8015 	if (PL_regkind[OP(first)] == EXACT) {
8016 	    if (   OP(first) == EXACT
8017                 || OP(first) == EXACT_ONLY8
8018                 || OP(first) == EXACTL)
8019             {
8020 		NOOP;	/* Empty, get anchored substr later. */
8021             }
8022 	    else
8023 		RExC_rxi->regstclass = first;
8024 	}
8025 #ifdef TRIE_STCLASS
8026 	else if (PL_regkind[OP(first)] == TRIE &&
8027 	        ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8028 	{
8029             /* this can happen only on restudy */
8030             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8031 	}
8032 #endif
8033 	else if (REGNODE_SIMPLE(OP(first)))
8034 	    RExC_rxi->regstclass = first;
8035 	else if (PL_regkind[OP(first)] == BOUND ||
8036 		 PL_regkind[OP(first)] == NBOUND)
8037 	    RExC_rxi->regstclass = first;
8038 	else if (PL_regkind[OP(first)] == BOL) {
8039             RExC_rx->intflags |= (OP(first) == MBOL
8040                            ? PREGf_ANCH_MBOL
8041                            : PREGf_ANCH_SBOL);
8042 	    first = NEXTOPER(first);
8043 	    goto again;
8044 	}
8045 	else if (OP(first) == GPOS) {
8046             RExC_rx->intflags |= PREGf_ANCH_GPOS;
8047 	    first = NEXTOPER(first);
8048 	    goto again;
8049 	}
8050 	else if ((!sawopen || !RExC_sawback) &&
8051             !sawlookahead &&
8052 	    (OP(first) == STAR &&
8053 	    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8054             !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8055 	{
8056 	    /* turn .* into ^.* with an implied $*=1 */
8057 	    const int type =
8058 		(OP(NEXTOPER(first)) == REG_ANY)
8059                     ? PREGf_ANCH_MBOL
8060                     : PREGf_ANCH_SBOL;
8061             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8062 	    first = NEXTOPER(first);
8063 	    goto again;
8064 	}
8065         if (sawplus && !sawminmod && !sawlookahead
8066             && (!sawopen || !RExC_sawback)
8067 	    && !pRExC_state->code_blocks) /* May examine pos and $& */
8068 	    /* x+ must match at the 1st pos of run of x's */
8069 	    RExC_rx->intflags |= PREGf_SKIP;
8070 
8071 	/* Scan is after the zeroth branch, first is atomic matcher. */
8072 #ifdef TRIE_STUDY_OPT
8073 	DEBUG_PARSE_r(
8074 	    if (!restudied)
8075                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8076 			      (IV)(first - scan + 1))
8077         );
8078 #else
8079 	DEBUG_PARSE_r(
8080             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8081 	        (IV)(first - scan + 1))
8082         );
8083 #endif
8084 
8085 
8086 	/*
8087 	* If there's something expensive in the r.e., find the
8088 	* longest literal string that must appear and make it the
8089 	* regmust.  Resolve ties in favor of later strings, since
8090 	* the regstart check works with the beginning of the r.e.
8091 	* and avoiding duplication strengthens checking.  Not a
8092 	* strong reason, but sufficient in the absence of others.
8093 	* [Now we resolve ties in favor of the earlier string if
8094 	* it happens that c_offset_min has been invalidated, since the
8095 	* earlier string may buy us something the later one won't.]
8096 	*/
8097 
8098 	data.substrs[0].str = newSVpvs("");
8099 	data.substrs[1].str = newSVpvs("");
8100 	data.last_found = newSVpvs("");
8101 	data.cur_is_floating = 0; /* initially any found substring is fixed */
8102 	ENTER_with_name("study_chunk");
8103 	SAVEFREESV(data.substrs[0].str);
8104 	SAVEFREESV(data.substrs[1].str);
8105 	SAVEFREESV(data.last_found);
8106 	first = scan;
8107 	if (!RExC_rxi->regstclass) {
8108 	    ssc_init(pRExC_state, &ch_class);
8109 	    data.start_class = &ch_class;
8110 	    stclass_flag = SCF_DO_STCLASS_AND;
8111 	} else				/* XXXX Check for BOUND? */
8112 	    stclass_flag = 0;
8113 	data.last_closep = &last_close;
8114 
8115         DEBUG_RExC_seen();
8116         /*
8117          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8118          * (NO top level branches)
8119          */
8120 	minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8121                              scan + RExC_size, /* Up to end */
8122             &data, -1, 0, NULL,
8123             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8124                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8125             0, TRUE);
8126 
8127 
8128         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8129 
8130 
8131 	if ( RExC_total_parens == 1 && !data.cur_is_floating
8132 	     && data.last_start_min == 0 && data.last_end > 0
8133 	     && !RExC_seen_zerolen
8134              && !(RExC_seen & REG_VERBARG_SEEN)
8135              && !(RExC_seen & REG_GPOS_SEEN)
8136         ){
8137 	    RExC_rx->extflags |= RXf_CHECK_ALL;
8138         }
8139 	scan_commit(pRExC_state, &data,&minlen, 0);
8140 
8141 
8142         /* XXX this is done in reverse order because that's the way the
8143          * code was before it was parameterised. Don't know whether it
8144          * actually needs doing in reverse order. DAPM */
8145         for (i = 1; i >= 0; i--) {
8146             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8147 
8148             if (   !(   i
8149                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
8150                      &&    data.substrs[0].min_offset
8151                         == data.substrs[1].min_offset
8152                      &&    SvCUR(data.substrs[0].str)
8153                         == SvCUR(data.substrs[1].str)
8154                     )
8155                 && S_setup_longest (aTHX_ pRExC_state,
8156                                         &(RExC_rx->substrs->data[i]),
8157                                         &(data.substrs[i]),
8158                                         longest_length[i]))
8159             {
8160                 RExC_rx->substrs->data[i].min_offset =
8161                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
8162 
8163                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8164                 /* Don't offset infinity */
8165                 if (data.substrs[i].max_offset < SSize_t_MAX)
8166                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8167                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8168             }
8169             else {
8170                 RExC_rx->substrs->data[i].substr      = NULL;
8171                 RExC_rx->substrs->data[i].utf8_substr = NULL;
8172                 longest_length[i] = 0;
8173             }
8174         }
8175 
8176 	LEAVE_with_name("study_chunk");
8177 
8178 	if (RExC_rxi->regstclass
8179 	    && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8180 	    RExC_rxi->regstclass = NULL;
8181 
8182 	if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8183               || RExC_rx->substrs->data[0].min_offset)
8184 	    && stclass_flag
8185             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8186 	    && is_ssc_worth_it(pRExC_state, data.start_class))
8187 	{
8188 	    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8189 
8190             ssc_finalize(pRExC_state, data.start_class);
8191 
8192 	    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8193 	    StructCopy(data.start_class,
8194 		       (regnode_ssc*)RExC_rxi->data->data[n],
8195 		       regnode_ssc);
8196 	    RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8197 	    RExC_rx->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
8198 	    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8199                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8200                       Perl_re_printf( aTHX_
8201 				    "synthetic stclass \"%s\".\n",
8202 				    SvPVX_const(sv));});
8203             data.start_class = NULL;
8204 	}
8205 
8206         /* A temporary algorithm prefers floated substr to fixed one of
8207          * same length to dig more info. */
8208 	i = (longest_length[0] <= longest_length[1]);
8209         RExC_rx->substrs->check_ix = i;
8210         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
8211         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
8212         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
8213         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8214         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8215         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8216             RExC_rx->intflags |= PREGf_NOSCAN;
8217 
8218 	if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8219 	    RExC_rx->extflags |= RXf_USE_INTUIT;
8220 	    if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8221 		RExC_rx->extflags |= RXf_INTUIT_TAIL;
8222 	}
8223 
8224 	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8225 	if ( (STRLEN)minlen < longest_length[1] )
8226             minlen= longest_length[1];
8227         if ( (STRLEN)minlen < longest_length[0] )
8228             minlen= longest_length[0];
8229         */
8230     }
8231     else {
8232 	/* Several toplevels. Best we can is to set minlen. */
8233 	SSize_t fake;
8234 	regnode_ssc ch_class;
8235 	SSize_t last_close = 0;
8236 
8237         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
8238 
8239 	scan = RExC_rxi->program + 1;
8240 	ssc_init(pRExC_state, &ch_class);
8241 	data.start_class = &ch_class;
8242 	data.last_closep = &last_close;
8243 
8244         DEBUG_RExC_seen();
8245         /*
8246          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8247          * (patterns WITH top level branches)
8248          */
8249 	minlen = study_chunk(pRExC_state,
8250             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8251             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8252                                                       ? SCF_TRIE_DOING_RESTUDY
8253                                                       : 0),
8254             0, TRUE);
8255 
8256         CHECK_RESTUDY_GOTO_butfirst(NOOP);
8257 
8258 	RExC_rx->check_substr = NULL;
8259         RExC_rx->check_utf8 = NULL;
8260         RExC_rx->substrs->data[0].substr      = NULL;
8261         RExC_rx->substrs->data[0].utf8_substr = NULL;
8262         RExC_rx->substrs->data[1].substr      = NULL;
8263         RExC_rx->substrs->data[1].utf8_substr = NULL;
8264 
8265         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8266 	    && is_ssc_worth_it(pRExC_state, data.start_class))
8267         {
8268 	    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8269 
8270             ssc_finalize(pRExC_state, data.start_class);
8271 
8272 	    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8273 	    StructCopy(data.start_class,
8274 		       (regnode_ssc*)RExC_rxi->data->data[n],
8275 		       regnode_ssc);
8276 	    RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8277 	    RExC_rx->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
8278 	    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8279                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8280                       Perl_re_printf( aTHX_
8281 				    "synthetic stclass \"%s\".\n",
8282 				    SvPVX_const(sv));});
8283             data.start_class = NULL;
8284 	}
8285     }
8286 
8287     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8288         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8289         RExC_rx->maxlen = REG_INFTY;
8290     }
8291     else {
8292         RExC_rx->maxlen = RExC_maxlen;
8293     }
8294 
8295     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8296        the "real" pattern. */
8297     DEBUG_OPTIMISE_r({
8298         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8299                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8300     });
8301     RExC_rx->minlenret = minlen;
8302     if (RExC_rx->minlen < minlen)
8303         RExC_rx->minlen = minlen;
8304 
8305     if (RExC_seen & REG_RECURSE_SEEN ) {
8306         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8307         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8308     }
8309     if (RExC_seen & REG_GPOS_SEEN)
8310         RExC_rx->intflags |= PREGf_GPOS_SEEN;
8311     if (RExC_seen & REG_LOOKBEHIND_SEEN)
8312         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8313                                                 lookbehind */
8314     if (pRExC_state->code_blocks)
8315 	RExC_rx->extflags |= RXf_EVAL_SEEN;
8316     if (RExC_seen & REG_VERBARG_SEEN)
8317     {
8318 	RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8319         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8320     }
8321     if (RExC_seen & REG_CUTGROUP_SEEN)
8322 	RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8323     if (pm_flags & PMf_USE_RE_EVAL)
8324 	RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8325     if (RExC_paren_names)
8326         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8327     else
8328         RXp_PAREN_NAMES(RExC_rx) = NULL;
8329 
8330     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8331      * so it can be used in pp.c */
8332     if (RExC_rx->intflags & PREGf_ANCH)
8333         RExC_rx->extflags |= RXf_IS_ANCHORED;
8334 
8335 
8336     {
8337         /* this is used to identify "special" patterns that might result
8338          * in Perl NOT calling the regex engine and instead doing the match "itself",
8339          * particularly special cases in split//. By having the regex compiler
8340          * do this pattern matching at a regop level (instead of by inspecting the pattern)
8341          * we avoid weird issues with equivalent patterns resulting in different behavior,
8342          * AND we allow non Perl engines to get the same optimizations by the setting the
8343          * flags appropriately - Yves */
8344         regnode *first = RExC_rxi->program + 1;
8345         U8 fop = OP(first);
8346         regnode *next = regnext(first);
8347         U8 nop = OP(next);
8348 
8349         if (PL_regkind[fop] == NOTHING && nop == END)
8350             RExC_rx->extflags |= RXf_NULL;
8351         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8352             /* when fop is SBOL first->flags will be true only when it was
8353              * produced by parsing /\A/, and not when parsing /^/. This is
8354              * very important for the split code as there we want to
8355              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8356              * See rt #122761 for more details. -- Yves */
8357             RExC_rx->extflags |= RXf_START_ONLY;
8358         else if (fop == PLUS
8359                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8360                  && nop == END)
8361             RExC_rx->extflags |= RXf_WHITE;
8362         else if ( RExC_rx->extflags & RXf_SPLIT
8363                   && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
8364                   && STR_LEN(first) == 1
8365                   && *(STRING(first)) == ' '
8366                   && nop == END )
8367             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8368 
8369     }
8370 
8371     if (RExC_contains_locale) {
8372         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8373     }
8374 
8375 #ifdef DEBUGGING
8376     if (RExC_paren_names) {
8377         RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8378         RExC_rxi->data->data[RExC_rxi->name_list_idx]
8379                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
8380     } else
8381 #endif
8382     RExC_rxi->name_list_idx = 0;
8383 
8384     while ( RExC_recurse_count > 0 ) {
8385         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8386         /*
8387          * This data structure is set up in study_chunk() and is used
8388          * to calculate the distance between a GOSUB regopcode and
8389          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8390          * it refers to.
8391          *
8392          * If for some reason someone writes code that optimises
8393          * away a GOSUB opcode then the assert should be changed to
8394          * an if(scan) to guard the ARG2L_SET() - Yves
8395          *
8396          */
8397         assert(scan && OP(scan) == GOSUB);
8398         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8399     }
8400 
8401     Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8402     /* assume we don't need to swap parens around before we match */
8403     DEBUG_TEST_r({
8404         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8405             (unsigned long)RExC_study_chunk_recursed_count);
8406     });
8407     DEBUG_DUMP_r({
8408         DEBUG_RExC_seen();
8409         Perl_re_printf( aTHX_ "Final program:\n");
8410         regdump(RExC_rx);
8411     });
8412 
8413     if (RExC_open_parens) {
8414         Safefree(RExC_open_parens);
8415         RExC_open_parens = NULL;
8416     }
8417     if (RExC_close_parens) {
8418         Safefree(RExC_close_parens);
8419         RExC_close_parens = NULL;
8420     }
8421 
8422 #ifdef USE_ITHREADS
8423     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8424      * by setting the regexp SV to readonly-only instead. If the
8425      * pattern's been recompiled, the USEDness should remain. */
8426     if (old_re && SvREADONLY(old_re))
8427         SvREADONLY_on(Rx);
8428 #endif
8429     return Rx;
8430 }
8431 
8432 
8433 SV*
Perl_reg_named_buff(pTHX_ REGEXP * const rx,SV * const key,SV * const value,const U32 flags)8434 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8435                     const U32 flags)
8436 {
8437     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8438 
8439     PERL_UNUSED_ARG(value);
8440 
8441     if (flags & RXapif_FETCH) {
8442         return reg_named_buff_fetch(rx, key, flags);
8443     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8444         Perl_croak_no_modify();
8445         return NULL;
8446     } else if (flags & RXapif_EXISTS) {
8447         return reg_named_buff_exists(rx, key, flags)
8448             ? &PL_sv_yes
8449             : &PL_sv_no;
8450     } else if (flags & RXapif_REGNAMES) {
8451         return reg_named_buff_all(rx, flags);
8452     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8453         return reg_named_buff_scalar(rx, flags);
8454     } else {
8455         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8456         return NULL;
8457     }
8458 }
8459 
8460 SV*
Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx,const SV * const lastkey,const U32 flags)8461 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8462                          const U32 flags)
8463 {
8464     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8465     PERL_UNUSED_ARG(lastkey);
8466 
8467     if (flags & RXapif_FIRSTKEY)
8468         return reg_named_buff_firstkey(rx, flags);
8469     else if (flags & RXapif_NEXTKEY)
8470         return reg_named_buff_nextkey(rx, flags);
8471     else {
8472         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8473                                             (int)flags);
8474         return NULL;
8475     }
8476 }
8477 
8478 SV*
Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r,SV * const namesv,const U32 flags)8479 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8480 			  const U32 flags)
8481 {
8482     SV *ret;
8483     struct regexp *const rx = ReANY(r);
8484 
8485     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8486 
8487     if (rx && RXp_PAREN_NAMES(rx)) {
8488         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8489         if (he_str) {
8490             IV i;
8491             SV* sv_dat=HeVAL(he_str);
8492             I32 *nums=(I32*)SvPVX(sv_dat);
8493             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8494             for ( i=0; i<SvIVX(sv_dat); i++ ) {
8495                 if ((I32)(rx->nparens) >= nums[i]
8496                     && rx->offs[nums[i]].start != -1
8497                     && rx->offs[nums[i]].end != -1)
8498                 {
8499                     ret = newSVpvs("");
8500                     CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8501                     if (!retarray)
8502                         return ret;
8503                 } else {
8504                     if (retarray)
8505                         ret = newSVsv(&PL_sv_undef);
8506                 }
8507                 if (retarray)
8508                     av_push(retarray, ret);
8509             }
8510             if (retarray)
8511                 return newRV_noinc(MUTABLE_SV(retarray));
8512         }
8513     }
8514     return NULL;
8515 }
8516 
8517 bool
Perl_reg_named_buff_exists(pTHX_ REGEXP * const r,SV * const key,const U32 flags)8518 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8519                            const U32 flags)
8520 {
8521     struct regexp *const rx = ReANY(r);
8522 
8523     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8524 
8525     if (rx && RXp_PAREN_NAMES(rx)) {
8526         if (flags & RXapif_ALL) {
8527             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8528         } else {
8529 	    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8530             if (sv) {
8531 		SvREFCNT_dec_NN(sv);
8532                 return TRUE;
8533             } else {
8534                 return FALSE;
8535             }
8536         }
8537     } else {
8538         return FALSE;
8539     }
8540 }
8541 
8542 SV*
Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r,const U32 flags)8543 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8544 {
8545     struct regexp *const rx = ReANY(r);
8546 
8547     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8548 
8549     if ( rx && RXp_PAREN_NAMES(rx) ) {
8550 	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
8551 
8552 	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8553     } else {
8554 	return FALSE;
8555     }
8556 }
8557 
8558 SV*
Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r,const U32 flags)8559 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8560 {
8561     struct regexp *const rx = ReANY(r);
8562     GET_RE_DEBUG_FLAGS_DECL;
8563 
8564     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8565 
8566     if (rx && RXp_PAREN_NAMES(rx)) {
8567         HV *hv = RXp_PAREN_NAMES(rx);
8568         HE *temphe;
8569         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8570             IV i;
8571             IV parno = 0;
8572             SV* sv_dat = HeVAL(temphe);
8573             I32 *nums = (I32*)SvPVX(sv_dat);
8574             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8575                 if ((I32)(rx->lastparen) >= nums[i] &&
8576                     rx->offs[nums[i]].start != -1 &&
8577                     rx->offs[nums[i]].end != -1)
8578                 {
8579                     parno = nums[i];
8580                     break;
8581                 }
8582             }
8583             if (parno || flags & RXapif_ALL) {
8584 		return newSVhek(HeKEY_hek(temphe));
8585             }
8586         }
8587     }
8588     return NULL;
8589 }
8590 
8591 SV*
Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r,const U32 flags)8592 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8593 {
8594     SV *ret;
8595     AV *av;
8596     SSize_t length;
8597     struct regexp *const rx = ReANY(r);
8598 
8599     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8600 
8601     if (rx && RXp_PAREN_NAMES(rx)) {
8602         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8603             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8604         } else if (flags & RXapif_ONE) {
8605             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8606             av = MUTABLE_AV(SvRV(ret));
8607             length = av_tindex(av);
8608 	    SvREFCNT_dec_NN(ret);
8609             return newSViv(length + 1);
8610         } else {
8611             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8612                                                 (int)flags);
8613             return NULL;
8614         }
8615     }
8616     return &PL_sv_undef;
8617 }
8618 
8619 SV*
Perl_reg_named_buff_all(pTHX_ REGEXP * const r,const U32 flags)8620 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8621 {
8622     struct regexp *const rx = ReANY(r);
8623     AV *av = newAV();
8624 
8625     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8626 
8627     if (rx && RXp_PAREN_NAMES(rx)) {
8628         HV *hv= RXp_PAREN_NAMES(rx);
8629         HE *temphe;
8630         (void)hv_iterinit(hv);
8631         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8632             IV i;
8633             IV parno = 0;
8634             SV* sv_dat = HeVAL(temphe);
8635             I32 *nums = (I32*)SvPVX(sv_dat);
8636             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8637                 if ((I32)(rx->lastparen) >= nums[i] &&
8638                     rx->offs[nums[i]].start != -1 &&
8639                     rx->offs[nums[i]].end != -1)
8640                 {
8641                     parno = nums[i];
8642                     break;
8643                 }
8644             }
8645             if (parno || flags & RXapif_ALL) {
8646                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8647             }
8648         }
8649     }
8650 
8651     return newRV_noinc(MUTABLE_SV(av));
8652 }
8653 
8654 void
Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r,const I32 paren,SV * const sv)8655 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8656 			     SV * const sv)
8657 {
8658     struct regexp *const rx = ReANY(r);
8659     char *s = NULL;
8660     SSize_t i = 0;
8661     SSize_t s1, t1;
8662     I32 n = paren;
8663 
8664     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8665 
8666     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8667            || n == RX_BUFF_IDX_CARET_FULLMATCH
8668            || n == RX_BUFF_IDX_CARET_POSTMATCH
8669        )
8670     {
8671         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8672         if (!keepcopy) {
8673             /* on something like
8674              *    $r = qr/.../;
8675              *    /$qr/p;
8676              * the KEEPCOPY is set on the PMOP rather than the regex */
8677             if (PL_curpm && r == PM_GETRE(PL_curpm))
8678                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8679         }
8680         if (!keepcopy)
8681             goto ret_undef;
8682     }
8683 
8684     if (!rx->subbeg)
8685         goto ret_undef;
8686 
8687     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8688         /* no need to distinguish between them any more */
8689         n = RX_BUFF_IDX_FULLMATCH;
8690 
8691     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8692         && rx->offs[0].start != -1)
8693     {
8694         /* $`, ${^PREMATCH} */
8695 	i = rx->offs[0].start;
8696 	s = rx->subbeg;
8697     }
8698     else
8699     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8700         && rx->offs[0].end != -1)
8701     {
8702         /* $', ${^POSTMATCH} */
8703 	s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8704 	i = rx->sublen + rx->suboffset - rx->offs[0].end;
8705     }
8706     else
8707     if ( 0 <= n && n <= (I32)rx->nparens &&
8708         (s1 = rx->offs[n].start) != -1 &&
8709         (t1 = rx->offs[n].end) != -1)
8710     {
8711         /* $&, ${^MATCH},  $1 ... */
8712         i = t1 - s1;
8713         s = rx->subbeg + s1 - rx->suboffset;
8714     } else {
8715         goto ret_undef;
8716     }
8717 
8718     assert(s >= rx->subbeg);
8719     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8720     if (i >= 0) {
8721 #ifdef NO_TAINT_SUPPORT
8722         sv_setpvn(sv, s, i);
8723 #else
8724         const int oldtainted = TAINT_get;
8725         TAINT_NOT;
8726         sv_setpvn(sv, s, i);
8727         TAINT_set(oldtainted);
8728 #endif
8729         if (RXp_MATCH_UTF8(rx))
8730             SvUTF8_on(sv);
8731         else
8732             SvUTF8_off(sv);
8733         if (TAINTING_get) {
8734             if (RXp_MATCH_TAINTED(rx)) {
8735                 if (SvTYPE(sv) >= SVt_PVMG) {
8736                     MAGIC* const mg = SvMAGIC(sv);
8737                     MAGIC* mgt;
8738                     TAINT;
8739                     SvMAGIC_set(sv, mg->mg_moremagic);
8740                     SvTAINT(sv);
8741                     if ((mgt = SvMAGIC(sv))) {
8742                         mg->mg_moremagic = mgt;
8743                         SvMAGIC_set(sv, mg);
8744                     }
8745                 } else {
8746                     TAINT;
8747                     SvTAINT(sv);
8748                 }
8749             } else
8750                 SvTAINTED_off(sv);
8751         }
8752     } else {
8753       ret_undef:
8754         sv_set_undef(sv);
8755         return;
8756     }
8757 }
8758 
8759 void
Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx,const I32 paren,SV const * const value)8760 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8761 							 SV const * const value)
8762 {
8763     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8764 
8765     PERL_UNUSED_ARG(rx);
8766     PERL_UNUSED_ARG(paren);
8767     PERL_UNUSED_ARG(value);
8768 
8769     if (!PL_localizing)
8770         Perl_croak_no_modify();
8771 }
8772 
8773 I32
Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r,const SV * const sv,const I32 paren)8774 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8775                               const I32 paren)
8776 {
8777     struct regexp *const rx = ReANY(r);
8778     I32 i;
8779     I32 s1, t1;
8780 
8781     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8782 
8783     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8784         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8785         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8786     )
8787     {
8788         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8789         if (!keepcopy) {
8790             /* on something like
8791              *    $r = qr/.../;
8792              *    /$qr/p;
8793              * the KEEPCOPY is set on the PMOP rather than the regex */
8794             if (PL_curpm && r == PM_GETRE(PL_curpm))
8795                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8796         }
8797         if (!keepcopy)
8798             goto warn_undef;
8799     }
8800 
8801     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8802     switch (paren) {
8803       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8804       case RX_BUFF_IDX_PREMATCH:       /* $` */
8805         if (rx->offs[0].start != -1) {
8806 			i = rx->offs[0].start;
8807 			if (i > 0) {
8808 				s1 = 0;
8809 				t1 = i;
8810 				goto getlen;
8811 			}
8812 	    }
8813         return 0;
8814 
8815       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8816       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8817 	    if (rx->offs[0].end != -1) {
8818 			i = rx->sublen - rx->offs[0].end;
8819 			if (i > 0) {
8820 				s1 = rx->offs[0].end;
8821 				t1 = rx->sublen;
8822 				goto getlen;
8823 			}
8824 	    }
8825         return 0;
8826 
8827       default: /* $& / ${^MATCH}, $1, $2, ... */
8828 	    if (paren <= (I32)rx->nparens &&
8829             (s1 = rx->offs[paren].start) != -1 &&
8830             (t1 = rx->offs[paren].end) != -1)
8831 	    {
8832             i = t1 - s1;
8833             goto getlen;
8834         } else {
8835           warn_undef:
8836             if (ckWARN(WARN_UNINITIALIZED))
8837                 report_uninit((const SV *)sv);
8838             return 0;
8839         }
8840     }
8841   getlen:
8842     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8843         const char * const s = rx->subbeg - rx->suboffset + s1;
8844         const U8 *ep;
8845         STRLEN el;
8846 
8847         i = t1 - s1;
8848         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8849 			i = el;
8850     }
8851     return i;
8852 }
8853 
8854 SV*
Perl_reg_qr_package(pTHX_ REGEXP * const rx)8855 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8856 {
8857     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8858 	PERL_UNUSED_ARG(rx);
8859 	if (0)
8860 	    return NULL;
8861 	else
8862 	    return newSVpvs("Regexp");
8863 }
8864 
8865 /* Scans the name of a named buffer from the pattern.
8866  * If flags is REG_RSN_RETURN_NULL returns null.
8867  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8868  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8869  * to the parsed name as looked up in the RExC_paren_names hash.
8870  * If there is an error throws a vFAIL().. type exception.
8871  */
8872 
8873 #define REG_RSN_RETURN_NULL    0
8874 #define REG_RSN_RETURN_NAME    1
8875 #define REG_RSN_RETURN_DATA    2
8876 
8877 STATIC SV*
S_reg_scan_name(pTHX_ RExC_state_t * pRExC_state,U32 flags)8878 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8879 {
8880     char *name_start = RExC_parse;
8881     SV* sv_name;
8882 
8883     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8884 
8885     assert (RExC_parse <= RExC_end);
8886     if (RExC_parse == RExC_end) NOOP;
8887     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8888          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8889           * using do...while */
8890 	if (UTF)
8891 	    do {
8892 		RExC_parse += UTF8SKIP(RExC_parse);
8893 	    } while (   RExC_parse < RExC_end
8894                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8895 	else
8896 	    do {
8897 		RExC_parse++;
8898 	    } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8899     } else {
8900         RExC_parse++; /* so the <- from the vFAIL is after the offending
8901                          character */
8902         vFAIL("Group name must start with a non-digit word character");
8903     }
8904     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8905 			     SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8906     if ( flags == REG_RSN_RETURN_NAME)
8907         return sv_name;
8908     else if (flags==REG_RSN_RETURN_DATA) {
8909         HE *he_str = NULL;
8910         SV *sv_dat = NULL;
8911         if ( ! sv_name )      /* should not happen*/
8912             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8913         if (RExC_paren_names)
8914             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8915         if ( he_str )
8916             sv_dat = HeVAL(he_str);
8917         if ( ! sv_dat ) {   /* Didn't find group */
8918 
8919             /* It might be a forward reference; we can't fail until we
8920                 * know, by completing the parse to get all the groups, and
8921                 * then reparsing */
8922             if (ALL_PARENS_COUNTED)  {
8923                 vFAIL("Reference to nonexistent named group");
8924             }
8925             else {
8926                 REQUIRE_PARENS_PASS;
8927             }
8928         }
8929         return sv_dat;
8930     }
8931 
8932     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8933                      (unsigned long) flags);
8934 }
8935 
8936 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8937     if (RExC_lastparse!=RExC_parse) {                           \
8938         Perl_re_printf( aTHX_  "%s",                            \
8939             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8940                 RExC_end - RExC_parse, 16,                      \
8941                 "", "",                                         \
8942                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8943                 PERL_PV_PRETTY_ELLIPSES   |                     \
8944                 PERL_PV_PRETTY_LTGT       |                     \
8945                 PERL_PV_ESCAPE_RE         |                     \
8946                 PERL_PV_PRETTY_EXACTSIZE                        \
8947             )                                                   \
8948         );                                                      \
8949     } else                                                      \
8950         Perl_re_printf( aTHX_ "%16s","");                       \
8951                                                                 \
8952     if (RExC_lastnum!=RExC_emit)                                \
8953        Perl_re_printf( aTHX_ "|%4d", RExC_emit);                \
8954     else                                                        \
8955        Perl_re_printf( aTHX_ "|%4s","");                        \
8956     Perl_re_printf( aTHX_ "|%*s%-4s",                           \
8957         (int)((depth*2)), "",                                   \
8958         (funcname)                                              \
8959     );                                                          \
8960     RExC_lastnum=RExC_emit;                                     \
8961     RExC_lastparse=RExC_parse;                                  \
8962 })
8963 
8964 
8965 
8966 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8967     DEBUG_PARSE_MSG((funcname));                            \
8968     Perl_re_printf( aTHX_ "%4s","\n");                                  \
8969 })
8970 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
8971     DEBUG_PARSE_MSG((funcname));                            \
8972     Perl_re_printf( aTHX_ fmt "\n",args);                               \
8973 })
8974 
8975 /* This section of code defines the inversion list object and its methods.  The
8976  * interfaces are highly subject to change, so as much as possible is static to
8977  * this file.  An inversion list is here implemented as a malloc'd C UV array
8978  * as an SVt_INVLIST scalar.
8979  *
8980  * An inversion list for Unicode is an array of code points, sorted by ordinal
8981  * number.  Each element gives the code point that begins a range that extends
8982  * up-to but not including the code point given by the next element.  The final
8983  * element gives the first code point of a range that extends to the platform's
8984  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
8985  * ...) give ranges whose code points are all in the inversion list.  We say
8986  * that those ranges are in the set.  The odd-numbered elements give ranges
8987  * whose code points are not in the inversion list, and hence not in the set.
8988  * Thus, element [0] is the first code point in the list.  Element [1]
8989  * is the first code point beyond that not in the list; and element [2] is the
8990  * first code point beyond that that is in the list.  In other words, the first
8991  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8992  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
8993  * all code points in that range are not in the inversion list.  The third
8994  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8995  * list, and so forth.  Thus every element whose index is divisible by two
8996  * gives the beginning of a range that is in the list, and every element whose
8997  * index is not divisible by two gives the beginning of a range not in the
8998  * list.  If the final element's index is divisible by two, the inversion list
8999  * extends to the platform's infinity; otherwise the highest code point in the
9000  * inversion list is the contents of that element minus 1.
9001  *
9002  * A range that contains just a single code point N will look like
9003  *  invlist[i]   == N
9004  *  invlist[i+1] == N+1
9005  *
9006  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9007  * impossible to represent, so element [i+1] is omitted.  The single element
9008  * inversion list
9009  *  invlist[0] == UV_MAX
9010  * contains just UV_MAX, but is interpreted as matching to infinity.
9011  *
9012  * Taking the complement (inverting) an inversion list is quite simple, if the
9013  * first element is 0, remove it; otherwise add a 0 element at the beginning.
9014  * This implementation reserves an element at the beginning of each inversion
9015  * list to always contain 0; there is an additional flag in the header which
9016  * indicates if the list begins at the 0, or is offset to begin at the next
9017  * element.  This means that the inversion list can be inverted without any
9018  * copying; just flip the flag.
9019  *
9020  * More about inversion lists can be found in "Unicode Demystified"
9021  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9022  *
9023  * The inversion list data structure is currently implemented as an SV pointing
9024  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
9025  * array of UV whose memory management is automatically handled by the existing
9026  * facilities for SV's.
9027  *
9028  * Some of the methods should always be private to the implementation, and some
9029  * should eventually be made public */
9030 
9031 /* The header definitions are in F<invlist_inline.h> */
9032 
9033 #ifndef PERL_IN_XSUB_RE
9034 
9035 PERL_STATIC_INLINE UV*
S__invlist_array_init(SV * const invlist,const bool will_have_0)9036 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9037 {
9038     /* Returns a pointer to the first element in the inversion list's array.
9039      * This is called upon initialization of an inversion list.  Where the
9040      * array begins depends on whether the list has the code point U+0000 in it
9041      * or not.  The other parameter tells it whether the code that follows this
9042      * call is about to put a 0 in the inversion list or not.  The first
9043      * element is either the element reserved for 0, if TRUE, or the element
9044      * after it, if FALSE */
9045 
9046     bool* offset = get_invlist_offset_addr(invlist);
9047     UV* zero_addr = (UV *) SvPVX(invlist);
9048 
9049     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9050 
9051     /* Must be empty */
9052     assert(! _invlist_len(invlist));
9053 
9054     *zero_addr = 0;
9055 
9056     /* 1^1 = 0; 1^0 = 1 */
9057     *offset = 1 ^ will_have_0;
9058     return zero_addr + *offset;
9059 }
9060 
9061 PERL_STATIC_INLINE void
S_invlist_set_len(pTHX_ SV * const invlist,const UV len,const bool offset)9062 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
9063 {
9064     /* Sets the current number of elements stored in the inversion list.
9065      * Updates SvCUR correspondingly */
9066     PERL_UNUSED_CONTEXT;
9067     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
9068 
9069     assert(is_invlist(invlist));
9070 
9071     SvCUR_set(invlist,
9072               (len == 0)
9073                ? 0
9074                : TO_INTERNAL_SIZE(len + offset));
9075     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
9076 }
9077 
9078 STATIC void
S_invlist_replace_list_destroys_src(pTHX_ SV * dest,SV * src)9079 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9080 {
9081     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
9082      * steals the list from 'src', so 'src' is made to have a NULL list.  This
9083      * is similar to what SvSetMagicSV() would do, if it were implemented on
9084      * inversion lists, though this routine avoids a copy */
9085 
9086     const UV src_len          = _invlist_len(src);
9087     const bool src_offset     = *get_invlist_offset_addr(src);
9088     const STRLEN src_byte_len = SvLEN(src);
9089     char * array              = SvPVX(src);
9090 
9091     const int oldtainted = TAINT_get;
9092 
9093     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9094 
9095     assert(is_invlist(src));
9096     assert(is_invlist(dest));
9097     assert(! invlist_is_iterating(src));
9098     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9099 
9100     /* Make sure it ends in the right place with a NUL, as our inversion list
9101      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9102      * asserts it */
9103     array[src_byte_len - 1] = '\0';
9104 
9105     TAINT_NOT;      /* Otherwise it breaks */
9106     sv_usepvn_flags(dest,
9107                     (char *) array,
9108                     src_byte_len - 1,
9109 
9110                     /* This flag is documented to cause a copy to be avoided */
9111                     SV_HAS_TRAILING_NUL);
9112     TAINT_set(oldtainted);
9113     SvPV_set(src, 0);
9114     SvLEN_set(src, 0);
9115     SvCUR_set(src, 0);
9116 
9117     /* Finish up copying over the other fields in an inversion list */
9118     *get_invlist_offset_addr(dest) = src_offset;
9119     invlist_set_len(dest, src_len, src_offset);
9120     *get_invlist_previous_index_addr(dest) = 0;
9121     invlist_iterfinish(dest);
9122 }
9123 
9124 PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(SV * invlist)9125 S_get_invlist_previous_index_addr(SV* invlist)
9126 {
9127     /* Return the address of the IV that is reserved to hold the cached index
9128      * */
9129     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9130 
9131     assert(is_invlist(invlist));
9132 
9133     return &(((XINVLIST*) SvANY(invlist))->prev_index);
9134 }
9135 
9136 PERL_STATIC_INLINE IV
S_invlist_previous_index(SV * const invlist)9137 S_invlist_previous_index(SV* const invlist)
9138 {
9139     /* Returns cached index of previous search */
9140 
9141     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9142 
9143     return *get_invlist_previous_index_addr(invlist);
9144 }
9145 
9146 PERL_STATIC_INLINE void
S_invlist_set_previous_index(SV * const invlist,const IV index)9147 S_invlist_set_previous_index(SV* const invlist, const IV index)
9148 {
9149     /* Caches <index> for later retrieval */
9150 
9151     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9152 
9153     assert(index == 0 || index < (int) _invlist_len(invlist));
9154 
9155     *get_invlist_previous_index_addr(invlist) = index;
9156 }
9157 
9158 PERL_STATIC_INLINE void
S_invlist_trim(SV * invlist)9159 S_invlist_trim(SV* invlist)
9160 {
9161     /* Free the not currently-being-used space in an inversion list */
9162 
9163     /* But don't free up the space needed for the 0 UV that is always at the
9164      * beginning of the list, nor the trailing NUL */
9165     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9166 
9167     PERL_ARGS_ASSERT_INVLIST_TRIM;
9168 
9169     assert(is_invlist(invlist));
9170 
9171     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9172 }
9173 
9174 PERL_STATIC_INLINE void
S_invlist_clear(pTHX_ SV * invlist)9175 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
9176 {
9177     PERL_ARGS_ASSERT_INVLIST_CLEAR;
9178 
9179     assert(is_invlist(invlist));
9180 
9181     invlist_set_len(invlist, 0, 0);
9182     invlist_trim(invlist);
9183 }
9184 
9185 #endif /* ifndef PERL_IN_XSUB_RE */
9186 
9187 PERL_STATIC_INLINE bool
S_invlist_is_iterating(SV * const invlist)9188 S_invlist_is_iterating(SV* const invlist)
9189 {
9190     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9191 
9192     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9193 }
9194 
9195 #ifndef PERL_IN_XSUB_RE
9196 
9197 PERL_STATIC_INLINE UV
S_invlist_max(SV * const invlist)9198 S_invlist_max(SV* const invlist)
9199 {
9200     /* Returns the maximum number of elements storable in the inversion list's
9201      * array, without having to realloc() */
9202 
9203     PERL_ARGS_ASSERT_INVLIST_MAX;
9204 
9205     assert(is_invlist(invlist));
9206 
9207     /* Assumes worst case, in which the 0 element is not counted in the
9208      * inversion list, so subtracts 1 for that */
9209     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
9210            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9211            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9212 }
9213 
9214 STATIC void
S_initialize_invlist_guts(pTHX_ SV * invlist,const Size_t initial_size)9215 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9216 {
9217     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9218 
9219     /* First 1 is in case the zero element isn't in the list; second 1 is for
9220      * trailing NUL */
9221     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9222     invlist_set_len(invlist, 0, 0);
9223 
9224     /* Force iterinit() to be used to get iteration to work */
9225     invlist_iterfinish(invlist);
9226 
9227     *get_invlist_previous_index_addr(invlist) = 0;
9228 }
9229 
9230 SV*
Perl__new_invlist(pTHX_ IV initial_size)9231 Perl__new_invlist(pTHX_ IV initial_size)
9232 {
9233 
9234     /* Return a pointer to a newly constructed inversion list, with enough
9235      * space to store 'initial_size' elements.  If that number is negative, a
9236      * system default is used instead */
9237 
9238     SV* new_list;
9239 
9240     if (initial_size < 0) {
9241 	initial_size = 10;
9242     }
9243 
9244     new_list = newSV_type(SVt_INVLIST);
9245     initialize_invlist_guts(new_list, initial_size);
9246 
9247     return new_list;
9248 }
9249 
9250 SV*
Perl__new_invlist_C_array(pTHX_ const UV * const list)9251 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9252 {
9253     /* Return a pointer to a newly constructed inversion list, initialized to
9254      * point to <list>, which has to be in the exact correct inversion list
9255      * form, including internal fields.  Thus this is a dangerous routine that
9256      * should not be used in the wrong hands.  The passed in 'list' contains
9257      * several header fields at the beginning that are not part of the
9258      * inversion list body proper */
9259 
9260     const STRLEN length = (STRLEN) list[0];
9261     const UV version_id =          list[1];
9262     const bool offset   =    cBOOL(list[2]);
9263 #define HEADER_LENGTH 3
9264     /* If any of the above changes in any way, you must change HEADER_LENGTH
9265      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9266      *      perl -E 'say int(rand 2**31-1)'
9267      */
9268 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9269                                         data structure type, so that one being
9270                                         passed in can be validated to be an
9271                                         inversion list of the correct vintage.
9272                                        */
9273 
9274     SV* invlist = newSV_type(SVt_INVLIST);
9275 
9276     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9277 
9278     if (version_id != INVLIST_VERSION_ID) {
9279         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9280     }
9281 
9282     /* The generated array passed in includes header elements that aren't part
9283      * of the list proper, so start it just after them */
9284     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9285 
9286     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
9287 			       shouldn't touch it */
9288 
9289     *(get_invlist_offset_addr(invlist)) = offset;
9290 
9291     /* The 'length' passed to us is the physical number of elements in the
9292      * inversion list.  But if there is an offset the logical number is one
9293      * less than that */
9294     invlist_set_len(invlist, length  - offset, offset);
9295 
9296     invlist_set_previous_index(invlist, 0);
9297 
9298     /* Initialize the iteration pointer. */
9299     invlist_iterfinish(invlist);
9300 
9301     SvREADONLY_on(invlist);
9302 
9303     return invlist;
9304 }
9305 
9306 STATIC void
S_invlist_extend(pTHX_ SV * const invlist,const UV new_max)9307 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
9308 {
9309     /* Grow the maximum size of an inversion list */
9310 
9311     PERL_ARGS_ASSERT_INVLIST_EXTEND;
9312 
9313     assert(is_invlist(invlist));
9314 
9315     /* Add one to account for the zero element at the beginning which may not
9316      * be counted by the calling parameters */
9317     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
9318 }
9319 
9320 STATIC void
S__append_range_to_invlist(pTHX_ SV * const invlist,const UV start,const UV end)9321 S__append_range_to_invlist(pTHX_ SV* const invlist,
9322                                  const UV start, const UV end)
9323 {
9324    /* Subject to change or removal.  Append the range from 'start' to 'end' at
9325     * the end of the inversion list.  The range must be above any existing
9326     * ones. */
9327 
9328     UV* array;
9329     UV max = invlist_max(invlist);
9330     UV len = _invlist_len(invlist);
9331     bool offset;
9332 
9333     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9334 
9335     if (len == 0) { /* Empty lists must be initialized */
9336         offset = start != 0;
9337         array = _invlist_array_init(invlist, ! offset);
9338     }
9339     else {
9340 	/* Here, the existing list is non-empty. The current max entry in the
9341 	 * list is generally the first value not in the set, except when the
9342 	 * set extends to the end of permissible values, in which case it is
9343 	 * the first entry in that final set, and so this call is an attempt to
9344 	 * append out-of-order */
9345 
9346 	UV final_element = len - 1;
9347 	array = invlist_array(invlist);
9348 	if (   array[final_element] > start
9349 	    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9350 	{
9351 	    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9352 		     array[final_element], start,
9353 		     ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9354 	}
9355 
9356         /* Here, it is a legal append.  If the new range begins 1 above the end
9357          * of the range below it, it is extending the range below it, so the
9358          * new first value not in the set is one greater than the newly
9359          * extended range.  */
9360         offset = *get_invlist_offset_addr(invlist);
9361 	if (array[final_element] == start) {
9362 	    if (end != UV_MAX) {
9363 		array[final_element] = end + 1;
9364 	    }
9365 	    else {
9366 		/* But if the end is the maximum representable on the machine,
9367                  * assume that infinity was actually what was meant.  Just let
9368                  * the range that this would extend to have no end */
9369 		invlist_set_len(invlist, len - 1, offset);
9370 	    }
9371 	    return;
9372 	}
9373     }
9374 
9375     /* Here the new range doesn't extend any existing set.  Add it */
9376 
9377     len += 2;	/* Includes an element each for the start and end of range */
9378 
9379     /* If wll overflow the existing space, extend, which may cause the array to
9380      * be moved */
9381     if (max < len) {
9382 	invlist_extend(invlist, len);
9383 
9384         /* Have to set len here to avoid assert failure in invlist_array() */
9385         invlist_set_len(invlist, len, offset);
9386 
9387 	array = invlist_array(invlist);
9388     }
9389     else {
9390 	invlist_set_len(invlist, len, offset);
9391     }
9392 
9393     /* The next item on the list starts the range, the one after that is
9394      * one past the new range.  */
9395     array[len - 2] = start;
9396     if (end != UV_MAX) {
9397 	array[len - 1] = end + 1;
9398     }
9399     else {
9400 	/* But if the end is the maximum representable on the machine, just let
9401 	 * the range have no end */
9402 	invlist_set_len(invlist, len - 1, offset);
9403     }
9404 }
9405 
9406 SSize_t
Perl__invlist_search(SV * const invlist,const UV cp)9407 Perl__invlist_search(SV* const invlist, const UV cp)
9408 {
9409     /* Searches the inversion list for the entry that contains the input code
9410      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
9411      * return value is the index into the list's array of the range that
9412      * contains <cp>, that is, 'i' such that
9413      *	array[i] <= cp < array[i+1]
9414      */
9415 
9416     IV low = 0;
9417     IV mid;
9418     IV high = _invlist_len(invlist);
9419     const IV highest_element = high - 1;
9420     const UV* array;
9421 
9422     PERL_ARGS_ASSERT__INVLIST_SEARCH;
9423 
9424     /* If list is empty, return failure. */
9425     if (high == 0) {
9426 	return -1;
9427     }
9428 
9429     /* (We can't get the array unless we know the list is non-empty) */
9430     array = invlist_array(invlist);
9431 
9432     mid = invlist_previous_index(invlist);
9433     assert(mid >=0);
9434     if (mid > highest_element) {
9435         mid = highest_element;
9436     }
9437 
9438     /* <mid> contains the cache of the result of the previous call to this
9439      * function (0 the first time).  See if this call is for the same result,
9440      * or if it is for mid-1.  This is under the theory that calls to this
9441      * function will often be for related code points that are near each other.
9442      * And benchmarks show that caching gives better results.  We also test
9443      * here if the code point is within the bounds of the list.  These tests
9444      * replace others that would have had to be made anyway to make sure that
9445      * the array bounds were not exceeded, and these give us extra information
9446      * at the same time */
9447     if (cp >= array[mid]) {
9448         if (cp >= array[highest_element]) {
9449             return highest_element;
9450         }
9451 
9452         /* Here, array[mid] <= cp < array[highest_element].  This means that
9453          * the final element is not the answer, so can exclude it; it also
9454          * means that <mid> is not the final element, so can refer to 'mid + 1'
9455          * safely */
9456         if (cp < array[mid + 1]) {
9457             return mid;
9458         }
9459         high--;
9460         low = mid + 1;
9461     }
9462     else { /* cp < aray[mid] */
9463         if (cp < array[0]) { /* Fail if outside the array */
9464             return -1;
9465         }
9466         high = mid;
9467         if (cp >= array[mid - 1]) {
9468             goto found_entry;
9469         }
9470     }
9471 
9472     /* Binary search.  What we are looking for is <i> such that
9473      *	array[i] <= cp < array[i+1]
9474      * The loop below converges on the i+1.  Note that there may not be an
9475      * (i+1)th element in the array, and things work nonetheless */
9476     while (low < high) {
9477 	mid = (low + high) / 2;
9478         assert(mid <= highest_element);
9479 	if (array[mid] <= cp) { /* cp >= array[mid] */
9480 	    low = mid + 1;
9481 
9482 	    /* We could do this extra test to exit the loop early.
9483 	    if (cp < array[low]) {
9484 		return mid;
9485 	    }
9486 	    */
9487 	}
9488 	else { /* cp < array[mid] */
9489 	    high = mid;
9490 	}
9491     }
9492 
9493   found_entry:
9494     high--;
9495     invlist_set_previous_index(invlist, high);
9496     return high;
9497 }
9498 
9499 void
Perl__invlist_union_maybe_complement_2nd(pTHX_ SV * const a,SV * const b,const bool complement_b,SV ** output)9500 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9501                                          const bool complement_b, SV** output)
9502 {
9503     /* Take the union of two inversion lists and point '*output' to it.  On
9504      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9505      * even 'a' or 'b').  If to an inversion list, the contents of the original
9506      * list will be replaced by the union.  The first list, 'a', may be
9507      * NULL, in which case a copy of the second list is placed in '*output'.
9508      * If 'complement_b' is TRUE, the union is taken of the complement
9509      * (inversion) of 'b' instead of b itself.
9510      *
9511      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9512      * Richard Gillam, published by Addison-Wesley, and explained at some
9513      * length there.  The preface says to incorporate its examples into your
9514      * code at your own risk.
9515      *
9516      * The algorithm is like a merge sort. */
9517 
9518     const UV* array_a;    /* a's array */
9519     const UV* array_b;
9520     UV len_a;	    /* length of a's array */
9521     UV len_b;
9522 
9523     SV* u;			/* the resulting union */
9524     UV* array_u;
9525     UV len_u = 0;
9526 
9527     UV i_a = 0;		    /* current index into a's array */
9528     UV i_b = 0;
9529     UV i_u = 0;
9530 
9531     /* running count, as explained in the algorithm source book; items are
9532      * stopped accumulating and are output when the count changes to/from 0.
9533      * The count is incremented when we start a range that's in an input's set,
9534      * and decremented when we start a range that's not in a set.  So this
9535      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9536      * and hence nothing goes into the union; 1, just one of the inputs is in
9537      * its set (and its current range gets added to the union); and 2 when both
9538      * inputs are in their sets.  */
9539     UV count = 0;
9540 
9541     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9542     assert(a != b);
9543     assert(*output == NULL || is_invlist(*output));
9544 
9545     len_b = _invlist_len(b);
9546     if (len_b == 0) {
9547 
9548         /* Here, 'b' is empty, hence it's complement is all possible code
9549          * points.  So if the union includes the complement of 'b', it includes
9550          * everything, and we need not even look at 'a'.  It's easiest to
9551          * create a new inversion list that matches everything.  */
9552         if (complement_b) {
9553             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9554 
9555             if (*output == NULL) { /* If the output didn't exist, just point it
9556                                       at the new list */
9557                 *output = everything;
9558             }
9559             else { /* Otherwise, replace its contents with the new list */
9560                 invlist_replace_list_destroys_src(*output, everything);
9561                 SvREFCNT_dec_NN(everything);
9562             }
9563 
9564             return;
9565         }
9566 
9567         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9568          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9569          * output will be empty */
9570 
9571         if (a == NULL || _invlist_len(a) == 0) {
9572             if (*output == NULL) {
9573                 *output = _new_invlist(0);
9574             }
9575             else {
9576                 invlist_clear(*output);
9577             }
9578             return;
9579         }
9580 
9581         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9582          * union.  We can just return a copy of 'a' if '*output' doesn't point
9583          * to an existing list */
9584         if (*output == NULL) {
9585             *output = invlist_clone(a, NULL);
9586             return;
9587         }
9588 
9589         /* If the output is to overwrite 'a', we have a no-op, as it's
9590          * already in 'a' */
9591         if (*output == a) {
9592             return;
9593         }
9594 
9595         /* Here, '*output' is to be overwritten by 'a' */
9596         u = invlist_clone(a, NULL);
9597         invlist_replace_list_destroys_src(*output, u);
9598         SvREFCNT_dec_NN(u);
9599 
9600         return;
9601     }
9602 
9603     /* Here 'b' is not empty.  See about 'a' */
9604 
9605     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9606 
9607         /* Here, 'a' is empty (and b is not).  That means the union will come
9608          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9609          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9610          * the clone */
9611 
9612         SV ** dest = (*output == NULL) ? output : &u;
9613         *dest = invlist_clone(b, NULL);
9614         if (complement_b) {
9615             _invlist_invert(*dest);
9616         }
9617 
9618         if (dest == &u) {
9619             invlist_replace_list_destroys_src(*output, u);
9620             SvREFCNT_dec_NN(u);
9621         }
9622 
9623 	return;
9624     }
9625 
9626     /* Here both lists exist and are non-empty */
9627     array_a = invlist_array(a);
9628     array_b = invlist_array(b);
9629 
9630     /* If are to take the union of 'a' with the complement of b, set it
9631      * up so are looking at b's complement. */
9632     if (complement_b) {
9633 
9634 	/* To complement, we invert: if the first element is 0, remove it.  To
9635 	 * do this, we just pretend the array starts one later */
9636         if (array_b[0] == 0) {
9637             array_b++;
9638             len_b--;
9639         }
9640         else {
9641 
9642             /* But if the first element is not zero, we pretend the list starts
9643              * at the 0 that is always stored immediately before the array. */
9644             array_b--;
9645             len_b++;
9646         }
9647     }
9648 
9649     /* Size the union for the worst case: that the sets are completely
9650      * disjoint */
9651     u = _new_invlist(len_a + len_b);
9652 
9653     /* Will contain U+0000 if either component does */
9654     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9655                                       || (len_b > 0 && array_b[0] == 0));
9656 
9657     /* Go through each input list item by item, stopping when have exhausted
9658      * one of them */
9659     while (i_a < len_a && i_b < len_b) {
9660 	UV cp;	    /* The element to potentially add to the union's array */
9661 	bool cp_in_set;   /* is it in the the input list's set or not */
9662 
9663 	/* We need to take one or the other of the two inputs for the union.
9664 	 * Since we are merging two sorted lists, we take the smaller of the
9665          * next items.  In case of a tie, we take first the one that is in its
9666          * set.  If we first took the one not in its set, it would decrement
9667          * the count, possibly to 0 which would cause it to be output as ending
9668          * the range, and the next time through we would take the same number,
9669          * and output it again as beginning the next range.  By doing it the
9670          * opposite way, there is no possibility that the count will be
9671          * momentarily decremented to 0, and thus the two adjoining ranges will
9672          * be seamlessly merged.  (In a tie and both are in the set or both not
9673          * in the set, it doesn't matter which we take first.) */
9674 	if (       array_a[i_a] < array_b[i_b]
9675 	    || (   array_a[i_a] == array_b[i_b]
9676 		&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9677 	{
9678 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9679 	    cp = array_a[i_a++];
9680 	}
9681 	else {
9682 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9683 	    cp = array_b[i_b++];
9684 	}
9685 
9686 	/* Here, have chosen which of the two inputs to look at.  Only output
9687 	 * if the running count changes to/from 0, which marks the
9688 	 * beginning/end of a range that's in the set */
9689 	if (cp_in_set) {
9690 	    if (count == 0) {
9691 		array_u[i_u++] = cp;
9692 	    }
9693 	    count++;
9694 	}
9695 	else {
9696 	    count--;
9697 	    if (count == 0) {
9698 		array_u[i_u++] = cp;
9699 	    }
9700 	}
9701     }
9702 
9703 
9704     /* The loop above increments the index into exactly one of the input lists
9705      * each iteration, and ends when either index gets to its list end.  That
9706      * means the other index is lower than its end, and so something is
9707      * remaining in that one.  We decrement 'count', as explained below, if
9708      * that list is in its set.  (i_a and i_b each currently index the element
9709      * beyond the one we care about.) */
9710     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9711 	|| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9712     {
9713 	count--;
9714     }
9715 
9716     /* Above we decremented 'count' if the list that had unexamined elements in
9717      * it was in its set.  This has made it so that 'count' being non-zero
9718      * means there isn't anything left to output; and 'count' equal to 0 means
9719      * that what is left to output is precisely that which is left in the
9720      * non-exhausted input list.
9721      *
9722      * To see why, note first that the exhausted input obviously has nothing
9723      * left to add to the union.  If it was in its set at its end, that means
9724      * the set extends from here to the platform's infinity, and hence so does
9725      * the union and the non-exhausted set is irrelevant.  The exhausted set
9726      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9727      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9728      * 'count' remains at 1.  This is consistent with the decremented 'count'
9729      * != 0 meaning there's nothing left to add to the union.
9730      *
9731      * But if the exhausted input wasn't in its set, it contributed 0 to
9732      * 'count', and the rest of the union will be whatever the other input is.
9733      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9734      * otherwise it gets decremented to 0.  This is consistent with 'count'
9735      * == 0 meaning the remainder of the union is whatever is left in the
9736      * non-exhausted list. */
9737     if (count != 0) {
9738         len_u = i_u;
9739     }
9740     else {
9741         IV copy_count = len_a - i_a;
9742         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9743 	    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9744         }
9745         else { /* The non-exhausted input is b */
9746             copy_count = len_b - i_b;
9747 	    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9748         }
9749         len_u = i_u + copy_count;
9750     }
9751 
9752     /* Set the result to the final length, which can change the pointer to
9753      * array_u, so re-find it.  (Note that it is unlikely that this will
9754      * change, as we are shrinking the space, not enlarging it) */
9755     if (len_u != _invlist_len(u)) {
9756 	invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9757 	invlist_trim(u);
9758 	array_u = invlist_array(u);
9759     }
9760 
9761     if (*output == NULL) {  /* Simply return the new inversion list */
9762         *output = u;
9763     }
9764     else {
9765         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9766          * could instead free '*output', and then set it to 'u', but experience
9767          * has shown [perl #127392] that if the input is a mortal, we can get a
9768          * huge build-up of these during regex compilation before they get
9769          * freed. */
9770         invlist_replace_list_destroys_src(*output, u);
9771         SvREFCNT_dec_NN(u);
9772     }
9773 
9774     return;
9775 }
9776 
9777 void
Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV * const a,SV * const b,const bool complement_b,SV ** i)9778 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9779                                                const bool complement_b, SV** i)
9780 {
9781     /* Take the intersection of two inversion lists and point '*i' to it.  On
9782      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9783      * even 'a' or 'b').  If to an inversion list, the contents of the original
9784      * list will be replaced by the intersection.  The first list, 'a', may be
9785      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9786      * TRUE, the result will be the intersection of 'a' and the complement (or
9787      * inversion) of 'b' instead of 'b' directly.
9788      *
9789      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9790      * Richard Gillam, published by Addison-Wesley, and explained at some
9791      * length there.  The preface says to incorporate its examples into your
9792      * code at your own risk.  In fact, it had bugs
9793      *
9794      * The algorithm is like a merge sort, and is essentially the same as the
9795      * union above
9796      */
9797 
9798     const UV* array_a;		/* a's array */
9799     const UV* array_b;
9800     UV len_a;	/* length of a's array */
9801     UV len_b;
9802 
9803     SV* r;		     /* the resulting intersection */
9804     UV* array_r;
9805     UV len_r = 0;
9806 
9807     UV i_a = 0;		    /* current index into a's array */
9808     UV i_b = 0;
9809     UV i_r = 0;
9810 
9811     /* running count of how many of the two inputs are postitioned at ranges
9812      * that are in their sets.  As explained in the algorithm source book,
9813      * items are stopped accumulating and are output when the count changes
9814      * to/from 2.  The count is incremented when we start a range that's in an
9815      * input's set, and decremented when we start a range that's not in a set.
9816      * Only when it is 2 are we in the intersection. */
9817     UV count = 0;
9818 
9819     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9820     assert(a != b);
9821     assert(*i == NULL || is_invlist(*i));
9822 
9823     /* Special case if either one is empty */
9824     len_a = (a == NULL) ? 0 : _invlist_len(a);
9825     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9826         if (len_a != 0 && complement_b) {
9827 
9828             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9829              * must be empty.  Here, also we are using 'b's complement, which
9830              * hence must be every possible code point.  Thus the intersection
9831              * is simply 'a'. */
9832 
9833             if (*i == a) {  /* No-op */
9834                 return;
9835             }
9836 
9837             if (*i == NULL) {
9838                 *i = invlist_clone(a, NULL);
9839                 return;
9840             }
9841 
9842             r = invlist_clone(a, NULL);
9843             invlist_replace_list_destroys_src(*i, r);
9844             SvREFCNT_dec_NN(r);
9845             return;
9846         }
9847 
9848         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9849          * intersection must be empty */
9850         if (*i == NULL) {
9851             *i = _new_invlist(0);
9852             return;
9853         }
9854 
9855         invlist_clear(*i);
9856 	return;
9857     }
9858 
9859     /* Here both lists exist and are non-empty */
9860     array_a = invlist_array(a);
9861     array_b = invlist_array(b);
9862 
9863     /* If are to take the intersection of 'a' with the complement of b, set it
9864      * up so are looking at b's complement. */
9865     if (complement_b) {
9866 
9867 	/* To complement, we invert: if the first element is 0, remove it.  To
9868 	 * do this, we just pretend the array starts one later */
9869         if (array_b[0] == 0) {
9870             array_b++;
9871             len_b--;
9872         }
9873         else {
9874 
9875             /* But if the first element is not zero, we pretend the list starts
9876              * at the 0 that is always stored immediately before the array. */
9877             array_b--;
9878             len_b++;
9879         }
9880     }
9881 
9882     /* Size the intersection for the worst case: that the intersection ends up
9883      * fragmenting everything to be completely disjoint */
9884     r= _new_invlist(len_a + len_b);
9885 
9886     /* Will contain U+0000 iff both components do */
9887     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9888                                      && len_b > 0 && array_b[0] == 0);
9889 
9890     /* Go through each list item by item, stopping when have exhausted one of
9891      * them */
9892     while (i_a < len_a && i_b < len_b) {
9893 	UV cp;	    /* The element to potentially add to the intersection's
9894 		       array */
9895 	bool cp_in_set;	/* Is it in the input list's set or not */
9896 
9897 	/* We need to take one or the other of the two inputs for the
9898 	 * intersection.  Since we are merging two sorted lists, we take the
9899          * smaller of the next items.  In case of a tie, we take first the one
9900          * that is not in its set (a difference from the union algorithm).  If
9901          * we first took the one in its set, it would increment the count,
9902          * possibly to 2 which would cause it to be output as starting a range
9903          * in the intersection, and the next time through we would take that
9904          * same number, and output it again as ending the set.  By doing the
9905          * opposite of this, there is no possibility that the count will be
9906          * momentarily incremented to 2.  (In a tie and both are in the set or
9907          * both not in the set, it doesn't matter which we take first.) */
9908 	if (       array_a[i_a] < array_b[i_b]
9909 	    || (   array_a[i_a] == array_b[i_b]
9910 		&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9911 	{
9912 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9913 	    cp = array_a[i_a++];
9914 	}
9915 	else {
9916 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9917 	    cp= array_b[i_b++];
9918 	}
9919 
9920 	/* Here, have chosen which of the two inputs to look at.  Only output
9921 	 * if the running count changes to/from 2, which marks the
9922 	 * beginning/end of a range that's in the intersection */
9923 	if (cp_in_set) {
9924 	    count++;
9925 	    if (count == 2) {
9926 		array_r[i_r++] = cp;
9927 	    }
9928 	}
9929 	else {
9930 	    if (count == 2) {
9931 		array_r[i_r++] = cp;
9932 	    }
9933 	    count--;
9934 	}
9935 
9936     }
9937 
9938     /* The loop above increments the index into exactly one of the input lists
9939      * each iteration, and ends when either index gets to its list end.  That
9940      * means the other index is lower than its end, and so something is
9941      * remaining in that one.  We increment 'count', as explained below, if the
9942      * exhausted list was in its set.  (i_a and i_b each currently index the
9943      * element beyond the one we care about.) */
9944     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9945         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9946     {
9947 	count++;
9948     }
9949 
9950     /* Above we incremented 'count' if the exhausted list was in its set.  This
9951      * has made it so that 'count' being below 2 means there is nothing left to
9952      * output; otheriwse what's left to add to the intersection is precisely
9953      * that which is left in the non-exhausted input list.
9954      *
9955      * To see why, note first that the exhausted input obviously has nothing
9956      * left to affect the intersection.  If it was in its set at its end, that
9957      * means the set extends from here to the platform's infinity, and hence
9958      * anything in the non-exhausted's list will be in the intersection, and
9959      * anything not in it won't be.  Hence, the rest of the intersection is
9960      * precisely what's in the non-exhausted list  The exhausted set also
9961      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
9962      * it means 'count' is now at least 2.  This is consistent with the
9963      * incremented 'count' being >= 2 means to add the non-exhausted list to
9964      * the intersection.
9965      *
9966      * But if the exhausted input wasn't in its set, it contributed 0 to
9967      * 'count', and the intersection can't include anything further; the
9968      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
9969      * incremented.  This is consistent with 'count' being < 2 meaning nothing
9970      * further to add to the intersection. */
9971     if (count < 2) { /* Nothing left to put in the intersection. */
9972         len_r = i_r;
9973     }
9974     else { /* copy the non-exhausted list, unchanged. */
9975         IV copy_count = len_a - i_a;
9976         if (copy_count > 0) {   /* a is the one with stuff left */
9977 	    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9978         }
9979         else {  /* b is the one with stuff left */
9980             copy_count = len_b - i_b;
9981 	    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9982         }
9983         len_r = i_r + copy_count;
9984     }
9985 
9986     /* Set the result to the final length, which can change the pointer to
9987      * array_r, so re-find it.  (Note that it is unlikely that this will
9988      * change, as we are shrinking the space, not enlarging it) */
9989     if (len_r != _invlist_len(r)) {
9990 	invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9991 	invlist_trim(r);
9992 	array_r = invlist_array(r);
9993     }
9994 
9995     if (*i == NULL) { /* Simply return the calculated intersection */
9996         *i = r;
9997     }
9998     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
9999               instead free '*i', and then set it to 'r', but experience has
10000               shown [perl #127392] that if the input is a mortal, we can get a
10001               huge build-up of these during regex compilation before they get
10002               freed. */
10003         if (len_r) {
10004             invlist_replace_list_destroys_src(*i, r);
10005         }
10006         else {
10007             invlist_clear(*i);
10008         }
10009         SvREFCNT_dec_NN(r);
10010     }
10011 
10012     return;
10013 }
10014 
10015 SV*
Perl__add_range_to_invlist(pTHX_ SV * invlist,UV start,UV end)10016 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10017 {
10018     /* Add the range from 'start' to 'end' inclusive to the inversion list's
10019      * set.  A pointer to the inversion list is returned.  This may actually be
10020      * a new list, in which case the passed in one has been destroyed.  The
10021      * passed-in inversion list can be NULL, in which case a new one is created
10022      * with just the one range in it.  The new list is not necessarily
10023      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
10024      * result of this function.  The gain would not be large, and in many
10025      * cases, this is called multiple times on a single inversion list, so
10026      * anything freed may almost immediately be needed again.
10027      *
10028      * This used to mostly call the 'union' routine, but that is much more
10029      * heavyweight than really needed for a single range addition */
10030 
10031     UV* array;              /* The array implementing the inversion list */
10032     UV len;                 /* How many elements in 'array' */
10033     SSize_t i_s;            /* index into the invlist array where 'start'
10034                                should go */
10035     SSize_t i_e = 0;        /* And the index where 'end' should go */
10036     UV cur_highest;         /* The highest code point in the inversion list
10037                                upon entry to this function */
10038 
10039     /* This range becomes the whole inversion list if none already existed */
10040     if (invlist == NULL) {
10041 	invlist = _new_invlist(2);
10042         _append_range_to_invlist(invlist, start, end);
10043         return invlist;
10044     }
10045 
10046     /* Likewise, if the inversion list is currently empty */
10047     len = _invlist_len(invlist);
10048     if (len == 0) {
10049         _append_range_to_invlist(invlist, start, end);
10050         return invlist;
10051     }
10052 
10053     /* Starting here, we have to know the internals of the list */
10054     array = invlist_array(invlist);
10055 
10056     /* If the new range ends higher than the current highest ... */
10057     cur_highest = invlist_highest(invlist);
10058     if (end > cur_highest) {
10059 
10060         /* If the whole range is higher, we can just append it */
10061         if (start > cur_highest) {
10062             _append_range_to_invlist(invlist, start, end);
10063             return invlist;
10064         }
10065 
10066         /* Otherwise, add the portion that is higher ... */
10067         _append_range_to_invlist(invlist, cur_highest + 1, end);
10068 
10069         /* ... and continue on below to handle the rest.  As a result of the
10070          * above append, we know that the index of the end of the range is the
10071          * final even numbered one of the array.  Recall that the final element
10072          * always starts a range that extends to infinity.  If that range is in
10073          * the set (meaning the set goes from here to infinity), it will be an
10074          * even index, but if it isn't in the set, it's odd, and the final
10075          * range in the set is one less, which is even. */
10076         if (end == UV_MAX) {
10077             i_e = len;
10078         }
10079         else {
10080             i_e = len - 2;
10081         }
10082     }
10083 
10084     /* We have dealt with appending, now see about prepending.  If the new
10085      * range starts lower than the current lowest ... */
10086     if (start < array[0]) {
10087 
10088         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10089          * Let the union code handle it, rather than having to know the
10090          * trickiness in two code places.  */
10091         if (UNLIKELY(start == 0)) {
10092             SV* range_invlist;
10093 
10094             range_invlist = _new_invlist(2);
10095             _append_range_to_invlist(range_invlist, start, end);
10096 
10097             _invlist_union(invlist, range_invlist, &invlist);
10098 
10099             SvREFCNT_dec_NN(range_invlist);
10100 
10101             return invlist;
10102         }
10103 
10104         /* If the whole new range comes before the first entry, and doesn't
10105          * extend it, we have to insert it as an additional range */
10106         if (end < array[0] - 1) {
10107             i_s = i_e = -1;
10108             goto splice_in_new_range;
10109         }
10110 
10111         /* Here the new range adjoins the existing first range, extending it
10112          * downwards. */
10113         array[0] = start;
10114 
10115         /* And continue on below to handle the rest.  We know that the index of
10116          * the beginning of the range is the first one of the array */
10117         i_s = 0;
10118     }
10119     else { /* Not prepending any part of the new range to the existing list.
10120             * Find where in the list it should go.  This finds i_s, such that:
10121             *     invlist[i_s] <= start < array[i_s+1]
10122             */
10123         i_s = _invlist_search(invlist, start);
10124     }
10125 
10126     /* At this point, any extending before the beginning of the inversion list
10127      * and/or after the end has been done.  This has made it so that, in the
10128      * code below, each endpoint of the new range is either in a range that is
10129      * in the set, or is in a gap between two ranges that are.  This means we
10130      * don't have to worry about exceeding the array bounds.
10131      *
10132      * Find where in the list the new range ends (but we can skip this if we
10133      * have already determined what it is, or if it will be the same as i_s,
10134      * which we already have computed) */
10135     if (i_e == 0) {
10136         i_e = (start == end)
10137               ? i_s
10138               : _invlist_search(invlist, end);
10139     }
10140 
10141     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
10142      * is a range that goes to infinity there is no element at invlist[i_e+1],
10143      * so only the first relation holds. */
10144 
10145     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10146 
10147         /* Here, the ranges on either side of the beginning of the new range
10148          * are in the set, and this range starts in the gap between them.
10149          *
10150          * The new range extends the range above it downwards if the new range
10151          * ends at or above that range's start */
10152         const bool extends_the_range_above = (   end == UV_MAX
10153                                               || end + 1 >= array[i_s+1]);
10154 
10155         /* The new range extends the range below it upwards if it begins just
10156          * after where that range ends */
10157         if (start == array[i_s]) {
10158 
10159             /* If the new range fills the entire gap between the other ranges,
10160              * they will get merged together.  Other ranges may also get
10161              * merged, depending on how many of them the new range spans.  In
10162              * the general case, we do the merge later, just once, after we
10163              * figure out how many to merge.  But in the case where the new
10164              * range exactly spans just this one gap (possibly extending into
10165              * the one above), we do the merge here, and an early exit.  This
10166              * is done here to avoid having to special case later. */
10167             if (i_e - i_s <= 1) {
10168 
10169                 /* If i_e - i_s == 1, it means that the new range terminates
10170                  * within the range above, and hence 'extends_the_range_above'
10171                  * must be true.  (If the range above it extends to infinity,
10172                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10173                  * will be 0, so no harm done.) */
10174                 if (extends_the_range_above) {
10175                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10176                     invlist_set_len(invlist,
10177                                     len - 2,
10178                                     *(get_invlist_offset_addr(invlist)));
10179                     return invlist;
10180                 }
10181 
10182                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
10183                  * to the same range, and below we are about to decrement i_s
10184                  * */
10185                 i_e--;
10186             }
10187 
10188             /* Here, the new range is adjacent to the one below.  (It may also
10189              * span beyond the range above, but that will get resolved later.)
10190              * Extend the range below to include this one. */
10191             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10192             i_s--;
10193             start = array[i_s];
10194         }
10195         else if (extends_the_range_above) {
10196 
10197             /* Here the new range only extends the range above it, but not the
10198              * one below.  It merges with the one above.  Again, we keep i_e
10199              * and i_s in sync if they point to the same range */
10200             if (i_e == i_s) {
10201                 i_e++;
10202             }
10203             i_s++;
10204             array[i_s] = start;
10205         }
10206     }
10207 
10208     /* Here, we've dealt with the new range start extending any adjoining
10209      * existing ranges.
10210      *
10211      * If the new range extends to infinity, it is now the final one,
10212      * regardless of what was there before */
10213     if (UNLIKELY(end == UV_MAX)) {
10214         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10215         return invlist;
10216     }
10217 
10218     /* If i_e started as == i_s, it has also been dealt with,
10219      * and been updated to the new i_s, which will fail the following if */
10220     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10221 
10222         /* Here, the ranges on either side of the end of the new range are in
10223          * the set, and this range ends in the gap between them.
10224          *
10225          * If this range is adjacent to (hence extends) the range above it, it
10226          * becomes part of that range; likewise if it extends the range below,
10227          * it becomes part of that range */
10228         if (end + 1 == array[i_e+1]) {
10229             i_e++;
10230             array[i_e] = start;
10231         }
10232         else if (start <= array[i_e]) {
10233             array[i_e] = end + 1;
10234             i_e--;
10235         }
10236     }
10237 
10238     if (i_s == i_e) {
10239 
10240         /* If the range fits entirely in an existing range (as possibly already
10241          * extended above), it doesn't add anything new */
10242         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10243             return invlist;
10244         }
10245 
10246         /* Here, no part of the range is in the list.  Must add it.  It will
10247          * occupy 2 more slots */
10248       splice_in_new_range:
10249 
10250         invlist_extend(invlist, len + 2);
10251         array = invlist_array(invlist);
10252         /* Move the rest of the array down two slots. Don't include any
10253          * trailing NUL */
10254         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10255 
10256         /* Do the actual splice */
10257         array[i_e+1] = start;
10258         array[i_e+2] = end + 1;
10259         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10260         return invlist;
10261     }
10262 
10263     /* Here the new range crossed the boundaries of a pre-existing range.  The
10264      * code above has adjusted things so that both ends are in ranges that are
10265      * in the set.  This means everything in between must also be in the set.
10266      * Just squash things together */
10267     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10268     invlist_set_len(invlist,
10269                     len - i_e + i_s,
10270                     *(get_invlist_offset_addr(invlist)));
10271 
10272     return invlist;
10273 }
10274 
10275 SV*
Perl__setup_canned_invlist(pTHX_ const STRLEN size,const UV element0,UV ** other_elements_ptr)10276 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10277                                  UV** other_elements_ptr)
10278 {
10279     /* Create and return an inversion list whose contents are to be populated
10280      * by the caller.  The caller gives the number of elements (in 'size') and
10281      * the very first element ('element0').  This function will set
10282      * '*other_elements_ptr' to an array of UVs, where the remaining elements
10283      * are to be placed.
10284      *
10285      * Obviously there is some trust involved that the caller will properly
10286      * fill in the other elements of the array.
10287      *
10288      * (The first element needs to be passed in, as the underlying code does
10289      * things differently depending on whether it is zero or non-zero) */
10290 
10291     SV* invlist = _new_invlist(size);
10292     bool offset;
10293 
10294     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10295 
10296     invlist = add_cp_to_invlist(invlist, element0);
10297     offset = *get_invlist_offset_addr(invlist);
10298 
10299     invlist_set_len(invlist, size, offset);
10300     *other_elements_ptr = invlist_array(invlist) + 1;
10301     return invlist;
10302 }
10303 
10304 #endif
10305 
10306 PERL_STATIC_INLINE SV*
S_add_cp_to_invlist(pTHX_ SV * invlist,const UV cp)10307 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
10308     return _add_range_to_invlist(invlist, cp, cp);
10309 }
10310 
10311 #ifndef PERL_IN_XSUB_RE
10312 void
Perl__invlist_invert(pTHX_ SV * const invlist)10313 Perl__invlist_invert(pTHX_ SV* const invlist)
10314 {
10315     /* Complement the input inversion list.  This adds a 0 if the list didn't
10316      * have a zero; removes it otherwise.  As described above, the data
10317      * structure is set up so that this is very efficient */
10318 
10319     PERL_ARGS_ASSERT__INVLIST_INVERT;
10320 
10321     assert(! invlist_is_iterating(invlist));
10322 
10323     /* The inverse of matching nothing is matching everything */
10324     if (_invlist_len(invlist) == 0) {
10325 	_append_range_to_invlist(invlist, 0, UV_MAX);
10326 	return;
10327     }
10328 
10329     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10330 }
10331 
10332 SV*
Perl_invlist_clone(pTHX_ SV * const invlist,SV * new_invlist)10333 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10334 {
10335     /* Return a new inversion list that is a copy of the input one, which is
10336      * unchanged.  The new list will not be mortal even if the old one was. */
10337 
10338     const STRLEN nominal_length = _invlist_len(invlist);
10339     const STRLEN physical_length = SvCUR(invlist);
10340     const bool offset = *(get_invlist_offset_addr(invlist));
10341 
10342     PERL_ARGS_ASSERT_INVLIST_CLONE;
10343 
10344     if (new_invlist == NULL) {
10345         new_invlist = _new_invlist(nominal_length);
10346     }
10347     else {
10348         sv_upgrade(new_invlist, SVt_INVLIST);
10349         initialize_invlist_guts(new_invlist, nominal_length);
10350     }
10351 
10352     *(get_invlist_offset_addr(new_invlist)) = offset;
10353     invlist_set_len(new_invlist, nominal_length, offset);
10354     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10355 
10356     return new_invlist;
10357 }
10358 
10359 #endif
10360 
10361 PERL_STATIC_INLINE STRLEN*
S_get_invlist_iter_addr(SV * invlist)10362 S_get_invlist_iter_addr(SV* invlist)
10363 {
10364     /* Return the address of the UV that contains the current iteration
10365      * position */
10366 
10367     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
10368 
10369     assert(is_invlist(invlist));
10370 
10371     return &(((XINVLIST*) SvANY(invlist))->iterator);
10372 }
10373 
10374 PERL_STATIC_INLINE void
S_invlist_iterinit(SV * invlist)10375 S_invlist_iterinit(SV* invlist)	/* Initialize iterator for invlist */
10376 {
10377     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
10378 
10379     *get_invlist_iter_addr(invlist) = 0;
10380 }
10381 
10382 PERL_STATIC_INLINE void
S_invlist_iterfinish(SV * invlist)10383 S_invlist_iterfinish(SV* invlist)
10384 {
10385     /* Terminate iterator for invlist.  This is to catch development errors.
10386      * Any iteration that is interrupted before completed should call this
10387      * function.  Functions that add code points anywhere else but to the end
10388      * of an inversion list assert that they are not in the middle of an
10389      * iteration.  If they were, the addition would make the iteration
10390      * problematical: if the iteration hadn't reached the place where things
10391      * were being added, it would be ok */
10392 
10393     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
10394 
10395     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
10396 }
10397 
10398 STATIC bool
S_invlist_iternext(SV * invlist,UV * start,UV * end)10399 S_invlist_iternext(SV* invlist, UV* start, UV* end)
10400 {
10401     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
10402      * This call sets in <*start> and <*end>, the next range in <invlist>.
10403      * Returns <TRUE> if successful and the next call will return the next
10404      * range; <FALSE> if was already at the end of the list.  If the latter,
10405      * <*start> and <*end> are unchanged, and the next call to this function
10406      * will start over at the beginning of the list */
10407 
10408     STRLEN* pos = get_invlist_iter_addr(invlist);
10409     UV len = _invlist_len(invlist);
10410     UV *array;
10411 
10412     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
10413 
10414     if (*pos >= len) {
10415 	*pos = (STRLEN) UV_MAX;	/* Force iterinit() to be required next time */
10416 	return FALSE;
10417     }
10418 
10419     array = invlist_array(invlist);
10420 
10421     *start = array[(*pos)++];
10422 
10423     if (*pos >= len) {
10424 	*end = UV_MAX;
10425     }
10426     else {
10427 	*end = array[(*pos)++] - 1;
10428     }
10429 
10430     return TRUE;
10431 }
10432 
10433 PERL_STATIC_INLINE UV
S_invlist_highest(SV * const invlist)10434 S_invlist_highest(SV* const invlist)
10435 {
10436     /* Returns the highest code point that matches an inversion list.  This API
10437      * has an ambiguity, as it returns 0 under either the highest is actually
10438      * 0, or if the list is empty.  If this distinction matters to you, check
10439      * for emptiness before calling this function */
10440 
10441     UV len = _invlist_len(invlist);
10442     UV *array;
10443 
10444     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
10445 
10446     if (len == 0) {
10447 	return 0;
10448     }
10449 
10450     array = invlist_array(invlist);
10451 
10452     /* The last element in the array in the inversion list always starts a
10453      * range that goes to infinity.  That range may be for code points that are
10454      * matched in the inversion list, or it may be for ones that aren't
10455      * matched.  In the latter case, the highest code point in the set is one
10456      * less than the beginning of this range; otherwise it is the final element
10457      * of this range: infinity */
10458     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
10459            ? UV_MAX
10460            : array[len - 1] - 1;
10461 }
10462 
10463 STATIC SV *
S_invlist_contents(pTHX_ SV * const invlist,const bool traditional_style)10464 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10465 {
10466     /* Get the contents of an inversion list into a string SV so that they can
10467      * be printed out.  If 'traditional_style' is TRUE, it uses the format
10468      * traditionally done for debug tracing; otherwise it uses a format
10469      * suitable for just copying to the output, with blanks between ranges and
10470      * a dash between range components */
10471 
10472     UV start, end;
10473     SV* output;
10474     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10475     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10476 
10477     if (traditional_style) {
10478         output = newSVpvs("\n");
10479     }
10480     else {
10481         output = newSVpvs("");
10482     }
10483 
10484     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10485 
10486     assert(! invlist_is_iterating(invlist));
10487 
10488     invlist_iterinit(invlist);
10489     while (invlist_iternext(invlist, &start, &end)) {
10490 	if (end == UV_MAX) {
10491 	    Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10492                                           start, intra_range_delimiter,
10493                                                  inter_range_delimiter);
10494 	}
10495 	else if (end != start) {
10496 	    Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10497 		                          start,
10498                                                    intra_range_delimiter,
10499                                                   end, inter_range_delimiter);
10500 	}
10501 	else {
10502 	    Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10503                                           start, inter_range_delimiter);
10504 	}
10505     }
10506 
10507     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10508         SvCUR_set(output, SvCUR(output) - 1);
10509     }
10510 
10511     return output;
10512 }
10513 
10514 #ifndef PERL_IN_XSUB_RE
10515 void
Perl__invlist_dump(pTHX_ PerlIO * file,I32 level,const char * const indent,SV * const invlist)10516 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10517                          const char * const indent, SV* const invlist)
10518 {
10519     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10520      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10521      * the string 'indent'.  The output looks like this:
10522          [0] 0x000A .. 0x000D
10523          [2] 0x0085
10524          [4] 0x2028 .. 0x2029
10525          [6] 0x3104 .. INFTY
10526      * This means that the first range of code points matched by the list are
10527      * 0xA through 0xD; the second range contains only the single code point
10528      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10529      * are used to define each range (except if the final range extends to
10530      * infinity, only a single element is needed).  The array index of the
10531      * first element for the corresponding range is given in brackets. */
10532 
10533     UV start, end;
10534     STRLEN count = 0;
10535 
10536     PERL_ARGS_ASSERT__INVLIST_DUMP;
10537 
10538     if (invlist_is_iterating(invlist)) {
10539         Perl_dump_indent(aTHX_ level, file,
10540              "%sCan't dump inversion list because is in middle of iterating\n",
10541              indent);
10542         return;
10543     }
10544 
10545     invlist_iterinit(invlist);
10546     while (invlist_iternext(invlist, &start, &end)) {
10547 	if (end == UV_MAX) {
10548 	    Perl_dump_indent(aTHX_ level, file,
10549                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10550                                    indent, (UV)count, start);
10551 	}
10552 	else if (end != start) {
10553 	    Perl_dump_indent(aTHX_ level, file,
10554                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10555 		                indent, (UV)count, start,         end);
10556 	}
10557 	else {
10558 	    Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10559                                             indent, (UV)count, start);
10560 	}
10561         count += 2;
10562     }
10563 }
10564 
10565 #endif
10566 
10567 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10568 bool
Perl__invlistEQ(pTHX_ SV * const a,SV * const b,const bool complement_b)10569 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10570 {
10571     /* Return a boolean as to if the two passed in inversion lists are
10572      * identical.  The final argument, if TRUE, says to take the complement of
10573      * the second inversion list before doing the comparison */
10574 
10575     const UV len_a = _invlist_len(a);
10576     UV len_b = _invlist_len(b);
10577 
10578     const UV* array_a = NULL;
10579     const UV* array_b = NULL;
10580 
10581     PERL_ARGS_ASSERT__INVLISTEQ;
10582 
10583     /* This code avoids accessing the arrays unless it knows the length is
10584      * non-zero */
10585 
10586     if (len_a == 0) {
10587         if (len_b == 0) {
10588             return ! complement_b;
10589         }
10590     }
10591     else {
10592         array_a = invlist_array(a);
10593     }
10594 
10595     if (len_b != 0) {
10596         array_b = invlist_array(b);
10597     }
10598 
10599     /* If are to compare 'a' with the complement of b, set it
10600      * up so are looking at b's complement. */
10601     if (complement_b) {
10602 
10603         /* The complement of nothing is everything, so <a> would have to have
10604          * just one element, starting at zero (ending at infinity) */
10605         if (len_b == 0) {
10606             return (len_a == 1 && array_a[0] == 0);
10607         }
10608         if (array_b[0] == 0) {
10609 
10610             /* Otherwise, to complement, we invert.  Here, the first element is
10611              * 0, just remove it.  To do this, we just pretend the array starts
10612              * one later */
10613 
10614             array_b++;
10615             len_b--;
10616         }
10617         else {
10618 
10619             /* But if the first element is not zero, we pretend the list starts
10620              * at the 0 that is always stored immediately before the array. */
10621             array_b--;
10622             len_b++;
10623         }
10624     }
10625 
10626     return    len_a == len_b
10627            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10628 
10629 }
10630 #endif
10631 
10632 /*
10633  * As best we can, determine the characters that can match the start of
10634  * the given EXACTF-ish node.  This is for use in creating ssc nodes, so there
10635  * can be false positive matches
10636  *
10637  * Returns the invlist as a new SV*; it is the caller's responsibility to
10638  * call SvREFCNT_dec() when done with it.
10639  */
10640 STATIC SV*
S__make_exactf_invlist(pTHX_ RExC_state_t * pRExC_state,regnode * node)10641 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10642 {
10643     dVAR;
10644     const U8 * s = (U8*)STRING(node);
10645     SSize_t bytelen = STR_LEN(node);
10646     UV uc;
10647     /* Start out big enough for 2 separate code points */
10648     SV* invlist = _new_invlist(4);
10649 
10650     PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
10651 
10652     if (! UTF) {
10653         uc = *s;
10654 
10655         /* We punt and assume can match anything if the node begins
10656          * with a multi-character fold.  Things are complicated.  For
10657          * example, /ffi/i could match any of:
10658          *  "\N{LATIN SMALL LIGATURE FFI}"
10659          *  "\N{LATIN SMALL LIGATURE FF}I"
10660          *  "F\N{LATIN SMALL LIGATURE FI}"
10661          *  plus several other things; and making sure we have all the
10662          *  possibilities is hard. */
10663         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10664             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10665         }
10666         else {
10667             /* Any Latin1 range character can potentially match any
10668              * other depending on the locale, and in Turkic locales, U+130 and
10669              * U+131 */
10670             if (OP(node) == EXACTFL) {
10671                 _invlist_union(invlist, PL_Latin1, &invlist);
10672                 invlist = add_cp_to_invlist(invlist,
10673                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10674                 invlist = add_cp_to_invlist(invlist,
10675                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10676             }
10677             else {
10678                 /* But otherwise, it matches at least itself.  We can
10679                  * quickly tell if it has a distinct fold, and if so,
10680                  * it matches that as well */
10681                 invlist = add_cp_to_invlist(invlist, uc);
10682                 if (IS_IN_SOME_FOLD_L1(uc))
10683                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10684             }
10685 
10686             /* Some characters match above-Latin1 ones under /i.  This
10687              * is true of EXACTFL ones when the locale is UTF-8 */
10688             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10689                 && (! isASCII(uc) || (OP(node) != EXACTFAA
10690                                     && OP(node) != EXACTFAA_NO_TRIE)))
10691             {
10692                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10693             }
10694         }
10695     }
10696     else {  /* Pattern is UTF-8 */
10697         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10698         const U8* e = s + bytelen;
10699         IV fc;
10700 
10701         fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10702 
10703         /* The only code points that aren't folded in a UTF EXACTFish
10704          * node are are the problematic ones in EXACTFL nodes */
10705         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10706             /* We need to check for the possibility that this EXACTFL
10707              * node begins with a multi-char fold.  Therefore we fold
10708              * the first few characters of it so that we can make that
10709              * check */
10710             U8 *d = folded;
10711             int i;
10712 
10713             fc = -1;
10714             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10715                 if (isASCII(*s)) {
10716                     *(d++) = (U8) toFOLD(*s);
10717                     if (fc < 0) {       /* Save the first fold */
10718                         fc = *(d-1);
10719                     }
10720                     s++;
10721                 }
10722                 else {
10723                     STRLEN len;
10724                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
10725                     if (fc < 0) {       /* Save the first fold */
10726                         fc = fold;
10727                     }
10728                     d += len;
10729                     s += UTF8SKIP(s);
10730                 }
10731             }
10732 
10733             /* And set up so the code below that looks in this folded
10734              * buffer instead of the node's string */
10735             e = d;
10736             s = folded;
10737         }
10738 
10739         /* When we reach here 's' points to the fold of the first
10740          * character(s) of the node; and 'e' points to far enough along
10741          * the folded string to be just past any possible multi-char
10742          * fold.
10743          *
10744          * Unlike the non-UTF-8 case, the macro for determining if a
10745          * string is a multi-char fold requires all the characters to
10746          * already be folded.  This is because of all the complications
10747          * if not.  Note that they are folded anyway, except in EXACTFL
10748          * nodes.  Like the non-UTF case above, we punt if the node
10749          * begins with a multi-char fold  */
10750 
10751         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10752             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10753         }
10754         else {  /* Single char fold */
10755             unsigned int k;
10756             unsigned int first_fold;
10757             const unsigned int * remaining_folds;
10758             Size_t folds_count;
10759 
10760             /* It matches itself */
10761             invlist = add_cp_to_invlist(invlist, fc);
10762 
10763             /* ... plus all the things that fold to it, which are found in
10764              * PL_utf8_foldclosures */
10765             folds_count = _inverse_folds(fc, &first_fold,
10766                                                 &remaining_folds);
10767             for (k = 0; k < folds_count; k++) {
10768                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10769 
10770                 /* /aa doesn't allow folds between ASCII and non- */
10771                 if (   (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10772                     && isASCII(c) != isASCII(fc))
10773                 {
10774                     continue;
10775                 }
10776 
10777                 invlist = add_cp_to_invlist(invlist, c);
10778             }
10779 
10780             if (OP(node) == EXACTFL) {
10781 
10782                 /* If either [iI] are present in an EXACTFL node the above code
10783                  * should have added its normal case pair, but under a Turkish
10784                  * locale they could match instead the case pairs from it.  Add
10785                  * those as potential matches as well */
10786                 if (isALPHA_FOLD_EQ(fc, 'I')) {
10787                     invlist = add_cp_to_invlist(invlist,
10788                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10789                     invlist = add_cp_to_invlist(invlist,
10790                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10791                 }
10792                 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10793                     invlist = add_cp_to_invlist(invlist, 'I');
10794                 }
10795                 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10796                     invlist = add_cp_to_invlist(invlist, 'i');
10797                 }
10798             }
10799         }
10800     }
10801 
10802     return invlist;
10803 }
10804 
10805 #undef HEADER_LENGTH
10806 #undef TO_INTERNAL_SIZE
10807 #undef FROM_INTERNAL_SIZE
10808 #undef INVLIST_VERSION_ID
10809 
10810 /* End of inversion list object */
10811 
10812 STATIC void
S_parse_lparen_question_flags(pTHX_ RExC_state_t * pRExC_state)10813 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10814 {
10815     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10816      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10817      * should point to the first flag; it is updated on output to point to the
10818      * final ')' or ':'.  There needs to be at least one flag, or this will
10819      * abort */
10820 
10821     /* for (?g), (?gc), and (?o) warnings; warning
10822        about (?c) will warn about (?g) -- japhy    */
10823 
10824 #define WASTED_O  0x01
10825 #define WASTED_G  0x02
10826 #define WASTED_C  0x04
10827 #define WASTED_GC (WASTED_G|WASTED_C)
10828     I32 wastedflags = 0x00;
10829     U32 posflags = 0, negflags = 0;
10830     U32 *flagsp = &posflags;
10831     char has_charset_modifier = '\0';
10832     regex_charset cs;
10833     bool has_use_defaults = FALSE;
10834     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10835     int x_mod_count = 0;
10836 
10837     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10838 
10839     /* '^' as an initial flag sets certain defaults */
10840     if (UCHARAT(RExC_parse) == '^') {
10841         RExC_parse++;
10842         has_use_defaults = TRUE;
10843         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10844         cs = (RExC_uni_semantics)
10845              ? REGEX_UNICODE_CHARSET
10846              : REGEX_DEPENDS_CHARSET;
10847         set_regex_charset(&RExC_flags, cs);
10848     }
10849     else {
10850         cs = get_regex_charset(RExC_flags);
10851         if (   cs == REGEX_DEPENDS_CHARSET
10852             && RExC_uni_semantics)
10853         {
10854             cs = REGEX_UNICODE_CHARSET;
10855         }
10856     }
10857 
10858     while (RExC_parse < RExC_end) {
10859         /* && strchr("iogcmsx", *RExC_parse) */
10860         /* (?g), (?gc) and (?o) are useless here
10861            and must be globally applied -- japhy */
10862         switch (*RExC_parse) {
10863 
10864             /* Code for the imsxn flags */
10865             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10866 
10867             case LOCALE_PAT_MOD:
10868                 if (has_charset_modifier) {
10869                     goto excess_modifier;
10870                 }
10871                 else if (flagsp == &negflags) {
10872                     goto neg_modifier;
10873                 }
10874                 cs = REGEX_LOCALE_CHARSET;
10875                 has_charset_modifier = LOCALE_PAT_MOD;
10876                 break;
10877             case UNICODE_PAT_MOD:
10878                 if (has_charset_modifier) {
10879                     goto excess_modifier;
10880                 }
10881                 else if (flagsp == &negflags) {
10882                     goto neg_modifier;
10883                 }
10884                 cs = REGEX_UNICODE_CHARSET;
10885                 has_charset_modifier = UNICODE_PAT_MOD;
10886                 break;
10887             case ASCII_RESTRICT_PAT_MOD:
10888                 if (flagsp == &negflags) {
10889                     goto neg_modifier;
10890                 }
10891                 if (has_charset_modifier) {
10892                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10893                         goto excess_modifier;
10894                     }
10895                     /* Doubled modifier implies more restricted */
10896                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10897                 }
10898                 else {
10899                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10900                 }
10901                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10902                 break;
10903             case DEPENDS_PAT_MOD:
10904                 if (has_use_defaults) {
10905                     goto fail_modifiers;
10906                 }
10907                 else if (flagsp == &negflags) {
10908                     goto neg_modifier;
10909                 }
10910                 else if (has_charset_modifier) {
10911                     goto excess_modifier;
10912                 }
10913 
10914                 /* The dual charset means unicode semantics if the
10915                  * pattern (or target, not known until runtime) are
10916                  * utf8, or something in the pattern indicates unicode
10917                  * semantics */
10918                 cs = (RExC_uni_semantics)
10919                      ? REGEX_UNICODE_CHARSET
10920                      : REGEX_DEPENDS_CHARSET;
10921                 has_charset_modifier = DEPENDS_PAT_MOD;
10922                 break;
10923               excess_modifier:
10924                 RExC_parse++;
10925                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10926                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10927                 }
10928                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10929                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10930                                         *(RExC_parse - 1));
10931                 }
10932                 else {
10933                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10934                 }
10935                 NOT_REACHED; /*NOTREACHED*/
10936               neg_modifier:
10937                 RExC_parse++;
10938                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10939                                     *(RExC_parse - 1));
10940                 NOT_REACHED; /*NOTREACHED*/
10941             case ONCE_PAT_MOD: /* 'o' */
10942             case GLOBAL_PAT_MOD: /* 'g' */
10943                 if (ckWARN(WARN_REGEXP)) {
10944                     const I32 wflagbit = *RExC_parse == 'o'
10945                                          ? WASTED_O
10946                                          : WASTED_G;
10947                     if (! (wastedflags & wflagbit) ) {
10948                         wastedflags |= wflagbit;
10949 			/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10950                         vWARN5(
10951                             RExC_parse + 1,
10952                             "Useless (%s%c) - %suse /%c modifier",
10953                             flagsp == &negflags ? "?-" : "?",
10954                             *RExC_parse,
10955                             flagsp == &negflags ? "don't " : "",
10956                             *RExC_parse
10957                         );
10958                     }
10959                 }
10960                 break;
10961 
10962             case CONTINUE_PAT_MOD: /* 'c' */
10963                 if (ckWARN(WARN_REGEXP)) {
10964                     if (! (wastedflags & WASTED_C) ) {
10965                         wastedflags |= WASTED_GC;
10966 			/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10967                         vWARN3(
10968                             RExC_parse + 1,
10969                             "Useless (%sc) - %suse /gc modifier",
10970                             flagsp == &negflags ? "?-" : "?",
10971                             flagsp == &negflags ? "don't " : ""
10972                         );
10973                     }
10974                 }
10975                 break;
10976             case KEEPCOPY_PAT_MOD: /* 'p' */
10977                 if (flagsp == &negflags) {
10978                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10979                 } else {
10980                     *flagsp |= RXf_PMf_KEEPCOPY;
10981                 }
10982                 break;
10983             case '-':
10984                 /* A flag is a default iff it is following a minus, so
10985                  * if there is a minus, it means will be trying to
10986                  * re-specify a default which is an error */
10987                 if (has_use_defaults || flagsp == &negflags) {
10988                     goto fail_modifiers;
10989                 }
10990                 flagsp = &negflags;
10991                 wastedflags = 0;  /* reset so (?g-c) warns twice */
10992                 x_mod_count = 0;
10993                 break;
10994             case ':':
10995             case ')':
10996 
10997                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10998                     negflags |= RXf_PMf_EXTENDED_MORE;
10999                 }
11000                 RExC_flags |= posflags;
11001 
11002                 if (negflags & RXf_PMf_EXTENDED) {
11003                     negflags |= RXf_PMf_EXTENDED_MORE;
11004                 }
11005                 RExC_flags &= ~negflags;
11006                 set_regex_charset(&RExC_flags, cs);
11007 
11008                 return;
11009             default:
11010               fail_modifiers:
11011                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11012 		/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11013                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11014                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11015                 NOT_REACHED; /*NOTREACHED*/
11016         }
11017 
11018         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11019     }
11020 
11021     vFAIL("Sequence (?... not terminated");
11022 }
11023 
11024 /*
11025  - reg - regular expression, i.e. main body or parenthesized thing
11026  *
11027  * Caller must absorb opening parenthesis.
11028  *
11029  * Combining parenthesis handling with the base level of regular expression
11030  * is a trifle forced, but the need to tie the tails of the branches to what
11031  * follows makes it hard to avoid.
11032  */
11033 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11034 #ifdef DEBUGGING
11035 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11036 #else
11037 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11038 #endif
11039 
11040 PERL_STATIC_INLINE regnode_offset
S_handle_named_backref(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,char * parse_start,char ch)11041 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11042                              I32 *flagp,
11043                              char * parse_start,
11044                              char ch
11045                       )
11046 {
11047     regnode_offset ret;
11048     char* name_start = RExC_parse;
11049     U32 num = 0;
11050     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11051     GET_RE_DEBUG_FLAGS_DECL;
11052 
11053     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11054 
11055     if (RExC_parse == name_start || *RExC_parse != ch) {
11056         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11057         vFAIL2("Sequence %.3s... not terminated", parse_start);
11058     }
11059 
11060     if (sv_dat) {
11061         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11062         RExC_rxi->data->data[num]=(void*)sv_dat;
11063         SvREFCNT_inc_simple_void_NN(sv_dat);
11064     }
11065     RExC_sawback = 1;
11066     ret = reganode(pRExC_state,
11067                    ((! FOLD)
11068                      ? NREF
11069                      : (ASCII_FOLD_RESTRICTED)
11070                        ? NREFFA
11071                        : (AT_LEAST_UNI_SEMANTICS)
11072                          ? NREFFU
11073                          : (LOC)
11074                            ? NREFFL
11075                            : NREFF),
11076                     num);
11077     *flagp |= HASWIDTH;
11078 
11079     Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11080     Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11081 
11082     nextchar(pRExC_state);
11083     return ret;
11084 }
11085 
11086 /* On success, returns the offset at which any next node should be placed into
11087  * the regex engine program being compiled.
11088  *
11089  * Returns 0 otherwise, with *flagp set to indicate why:
11090  *  TRYAGAIN        at the end of (?) that only sets flags.
11091  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
11092  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11093  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
11094  *  happen.  */
11095 STATIC regnode_offset
S_reg(pTHX_ RExC_state_t * pRExC_state,I32 paren,I32 * flagp,U32 depth)11096 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11097     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11098      * 2 is like 1, but indicates that nextchar() has been called to advance
11099      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
11100      * this flag alerts us to the need to check for that */
11101 {
11102     regnode_offset ret = 0;    /* Will be the head of the group. */
11103     regnode_offset br;
11104     regnode_offset lastbr;
11105     regnode_offset ender = 0;
11106     I32 parno = 0;
11107     I32 flags;
11108     U32 oregflags = RExC_flags;
11109     bool have_branch = 0;
11110     bool is_open = 0;
11111     I32 freeze_paren = 0;
11112     I32 after_freeze = 0;
11113     I32 num; /* numeric backreferences */
11114     SV * max_open;  /* Max number of unclosed parens */
11115 
11116     char * parse_start = RExC_parse; /* MJD */
11117     char * const oregcomp_parse = RExC_parse;
11118 
11119     GET_RE_DEBUG_FLAGS_DECL;
11120 
11121     PERL_ARGS_ASSERT_REG;
11122     DEBUG_PARSE("reg ");
11123 
11124 
11125     max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11126     assert(max_open);
11127     if (!SvIOK(max_open)) {
11128         sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11129     }
11130     if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11131                                               open paren */
11132         vFAIL("Too many nested open parens");
11133     }
11134 
11135     *flagp = 0;				/* Tentatively. */
11136 
11137     /* Having this true makes it feasible to have a lot fewer tests for the
11138      * parse pointer being in scope.  For example, we can write
11139      *      while(isFOO(*RExC_parse)) RExC_parse++;
11140      * instead of
11141      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11142      */
11143     assert(*RExC_end == '\0');
11144 
11145     /* Make an OPEN node, if parenthesized. */
11146     if (paren) {
11147 
11148         /* Under /x, space and comments can be gobbled up between the '(' and
11149          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
11150          * intervening space, as the sequence is a token, and a token should be
11151          * indivisible */
11152         bool has_intervening_patws = (paren == 2)
11153                                   && *(RExC_parse - 1) != '(';
11154 
11155         if (RExC_parse >= RExC_end) {
11156 	    vFAIL("Unmatched (");
11157         }
11158 
11159         if (paren == 'r') {     /* Atomic script run */
11160             paren = '>';
11161             goto parse_rest;
11162         }
11163         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11164 	    char *start_verb = RExC_parse + 1;
11165 	    STRLEN verb_len;
11166 	    char *start_arg = NULL;
11167 	    unsigned char op = 0;
11168             int arg_required = 0;
11169             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11170             bool has_upper = FALSE;
11171 
11172             if (has_intervening_patws) {
11173                 RExC_parse++;   /* past the '*' */
11174 
11175                 /* For strict backwards compatibility, don't change the message
11176                  * now that we also have lowercase operands */
11177                 if (isUPPER(*RExC_parse)) {
11178                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11179                 }
11180                 else {
11181                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11182                 }
11183             }
11184 	    while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11185 	        if ( *RExC_parse == ':' ) {
11186 	            start_arg = RExC_parse + 1;
11187 	            break;
11188 	        }
11189                 else if (! UTF) {
11190                     if (isUPPER(*RExC_parse)) {
11191                         has_upper = TRUE;
11192                     }
11193                     RExC_parse++;
11194                 }
11195                 else {
11196                     RExC_parse += UTF8SKIP(RExC_parse);
11197                 }
11198 	    }
11199 	    verb_len = RExC_parse - start_verb;
11200 	    if ( start_arg ) {
11201                 if (RExC_parse >= RExC_end) {
11202                     goto unterminated_verb_pattern;
11203                 }
11204 
11205 	        RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11206 	        while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11207                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11208                 }
11209 	        if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11210                   unterminated_verb_pattern:
11211                     if (has_upper) {
11212                         vFAIL("Unterminated verb pattern argument");
11213                     }
11214                     else {
11215                         vFAIL("Unterminated '(*...' argument");
11216                     }
11217                 }
11218 	    } else {
11219 	        if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11220                     if (has_upper) {
11221                         vFAIL("Unterminated verb pattern");
11222                     }
11223                     else {
11224                         vFAIL("Unterminated '(*...' construct");
11225                     }
11226                 }
11227 	    }
11228 
11229             /* Here, we know that RExC_parse < RExC_end */
11230 
11231 	    switch ( *start_verb ) {
11232             case 'A':  /* (*ACCEPT) */
11233                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11234 		    op = ACCEPT;
11235 		    internal_argval = RExC_nestroot;
11236 		}
11237 		break;
11238             case 'C':  /* (*COMMIT) */
11239                 if ( memEQs(start_verb, verb_len,"COMMIT") )
11240                     op = COMMIT;
11241                 break;
11242             case 'F':  /* (*FAIL) */
11243                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11244 		    op = OPFAIL;
11245 		}
11246 		break;
11247             case ':':  /* (*:NAME) */
11248 	    case 'M':  /* (*MARK:NAME) */
11249 	        if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11250                     op = MARKPOINT;
11251                     arg_required = 1;
11252                 }
11253                 break;
11254             case 'P':  /* (*PRUNE) */
11255                 if ( memEQs(start_verb, verb_len,"PRUNE") )
11256                     op = PRUNE;
11257                 break;
11258             case 'S':   /* (*SKIP) */
11259                 if ( memEQs(start_verb, verb_len,"SKIP") )
11260                     op = SKIP;
11261                 break;
11262             case 'T':  /* (*THEN) */
11263                 /* [19:06] <TimToady> :: is then */
11264                 if ( memEQs(start_verb, verb_len,"THEN") ) {
11265                     op = CUTGROUP;
11266                     RExC_seen |= REG_CUTGROUP_SEEN;
11267                 }
11268                 break;
11269             case 'a':
11270                 if (   memEQs(start_verb, verb_len, "asr")
11271                     || memEQs(start_verb, verb_len, "atomic_script_run"))
11272                 {
11273                     paren = 'r';        /* Mnemonic: recursed run */
11274                     goto script_run;
11275                 }
11276                 else if (memEQs(start_verb, verb_len, "atomic")) {
11277                     paren = 't';    /* AtOMIC */
11278                     goto alpha_assertions;
11279                 }
11280                 break;
11281             case 'p':
11282                 if (   memEQs(start_verb, verb_len, "plb")
11283                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
11284                 {
11285                     paren = 'b';
11286                     goto lookbehind_alpha_assertions;
11287                 }
11288                 else if (   memEQs(start_verb, verb_len, "pla")
11289                          || memEQs(start_verb, verb_len, "positive_lookahead"))
11290                 {
11291                     paren = 'a';
11292                     goto alpha_assertions;
11293                 }
11294                 break;
11295             case 'n':
11296                 if (   memEQs(start_verb, verb_len, "nlb")
11297                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
11298                 {
11299                     paren = 'B';
11300                     goto lookbehind_alpha_assertions;
11301                 }
11302                 else if (   memEQs(start_verb, verb_len, "nla")
11303                          || memEQs(start_verb, verb_len, "negative_lookahead"))
11304                 {
11305                     paren = 'A';
11306                     goto alpha_assertions;
11307                 }
11308                 break;
11309             case 's':
11310                 if (   memEQs(start_verb, verb_len, "sr")
11311                     || memEQs(start_verb, verb_len, "script_run"))
11312                 {
11313                     regnode_offset atomic;
11314 
11315                     paren = 's';
11316 
11317                    script_run:
11318 
11319                     /* This indicates Unicode rules. */
11320                     REQUIRE_UNI_RULES(flagp, 0);
11321 
11322                     if (! start_arg) {
11323                         goto no_colon;
11324                     }
11325 
11326                     RExC_parse = start_arg;
11327 
11328                     if (RExC_in_script_run) {
11329 
11330                         /*  Nested script runs are treated as no-ops, because
11331                          *  if the nested one fails, the outer one must as
11332                          *  well.  It could fail sooner, and avoid (??{} with
11333                          *  side effects, but that is explicitly documented as
11334                          *  undefined behavior. */
11335 
11336                         ret = 0;
11337 
11338                         if (paren == 's') {
11339                             paren = ':';
11340                             goto parse_rest;
11341                         }
11342 
11343                         /* But, the atomic part of a nested atomic script run
11344                          * isn't a no-op, but can be treated just like a '(?>'
11345                          * */
11346                         paren = '>';
11347                         goto parse_rest;
11348                     }
11349 
11350                     /* By doing this here, we avoid extra warnings for nested
11351                      * script runs */
11352                     ckWARNexperimental(RExC_parse,
11353                         WARN_EXPERIMENTAL__SCRIPT_RUN,
11354                         "The script_run feature is experimental");
11355 
11356                     if (paren == 's') {
11357                         /* Here, we're starting a new regular script run */
11358                         ret = reg_node(pRExC_state, SROPEN);
11359                         RExC_in_script_run = 1;
11360                         is_open = 1;
11361                         goto parse_rest;
11362                     }
11363 
11364                     /* Here, we are starting an atomic script run.  This is
11365                      * handled by recursing to deal with the atomic portion
11366                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
11367 
11368                     ret = reg_node(pRExC_state, SROPEN);
11369 
11370                     RExC_in_script_run = 1;
11371 
11372                     atomic = reg(pRExC_state, 'r', &flags, depth);
11373                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
11374                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11375                         return 0;
11376                     }
11377 
11378                     if (! REGTAIL(pRExC_state, ret, atomic)) {
11379                         REQUIRE_BRANCHJ(flagp, 0);
11380                     }
11381 
11382                     if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11383                                                                 SRCLOSE)))
11384                     {
11385                         REQUIRE_BRANCHJ(flagp, 0);
11386                     }
11387 
11388                     RExC_in_script_run = 0;
11389                     return ret;
11390                 }
11391 
11392                 break;
11393 
11394             lookbehind_alpha_assertions:
11395                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11396                 RExC_in_lookbehind++;
11397                 /*FALLTHROUGH*/
11398 
11399             alpha_assertions:
11400                 ckWARNexperimental(RExC_parse,
11401                         WARN_EXPERIMENTAL__ALPHA_ASSERTIONS,
11402                         "The alpha_assertions feature is experimental");
11403 
11404                 RExC_seen_zerolen++;
11405 
11406                 if (! start_arg) {
11407                     goto no_colon;
11408                 }
11409 
11410                 /* An empty negative lookahead assertion simply is failure */
11411                 if (paren == 'A' && RExC_parse == start_arg) {
11412                     ret=reganode(pRExC_state, OPFAIL, 0);
11413                     nextchar(pRExC_state);
11414                     return ret;
11415 	        }
11416 
11417                 RExC_parse = start_arg;
11418                 goto parse_rest;
11419 
11420               no_colon:
11421                 vFAIL2utf8f(
11422                 "'(*%" UTF8f "' requires a terminating ':'",
11423                 UTF8fARG(UTF, verb_len, start_verb));
11424 		NOT_REACHED; /*NOTREACHED*/
11425 
11426 	    } /* End of switch */
11427 	    if ( ! op ) {
11428 	        RExC_parse += UTF
11429                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11430                               : 1;
11431                 if (has_upper || verb_len == 0) {
11432                     vFAIL2utf8f(
11433                     "Unknown verb pattern '%" UTF8f "'",
11434                     UTF8fARG(UTF, verb_len, start_verb));
11435                 }
11436                 else {
11437                     vFAIL2utf8f(
11438                     "Unknown '(*...)' construct '%" UTF8f "'",
11439                     UTF8fARG(UTF, verb_len, start_verb));
11440                 }
11441 	    }
11442             if ( RExC_parse == start_arg ) {
11443                 start_arg = NULL;
11444             }
11445             if ( arg_required && !start_arg ) {
11446                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11447                     verb_len, start_verb);
11448             }
11449             if (internal_argval == -1) {
11450                 ret = reganode(pRExC_state, op, 0);
11451             } else {
11452                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11453             }
11454             RExC_seen |= REG_VERBARG_SEEN;
11455             if (start_arg) {
11456                 SV *sv = newSVpvn( start_arg,
11457                                     RExC_parse - start_arg);
11458                 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11459                                         STR_WITH_LEN("S"));
11460                 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11461                 FLAGS(REGNODE_p(ret)) = 1;
11462             } else {
11463                 FLAGS(REGNODE_p(ret)) = 0;
11464             }
11465             if ( internal_argval != -1 )
11466                 ARG2L_SET(REGNODE_p(ret), internal_argval);
11467 	    nextchar(pRExC_state);
11468 	    return ret;
11469         }
11470         else if (*RExC_parse == '?') { /* (?...) */
11471 	    bool is_logical = 0;
11472 	    const char * const seqstart = RExC_parse;
11473             const char * endptr;
11474             if (has_intervening_patws) {
11475                 RExC_parse++;
11476                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11477             }
11478 
11479 	    RExC_parse++;           /* past the '?' */
11480             paren = *RExC_parse;    /* might be a trailing NUL, if not
11481                                        well-formed */
11482             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11483             if (RExC_parse > RExC_end) {
11484                 paren = '\0';
11485             }
11486 	    ret = 0;			/* For look-ahead/behind. */
11487 	    switch (paren) {
11488 
11489 	    case 'P':	/* (?P...) variants for those used to PCRE/Python */
11490 	        paren = *RExC_parse;
11491 		if ( paren == '<') {    /* (?P<...>) named capture */
11492                     RExC_parse++;
11493                     if (RExC_parse >= RExC_end) {
11494                         vFAIL("Sequence (?P<... not terminated");
11495                     }
11496 		    goto named_capture;
11497                 }
11498                 else if (paren == '>') {   /* (?P>name) named recursion */
11499                     RExC_parse++;
11500                     if (RExC_parse >= RExC_end) {
11501                         vFAIL("Sequence (?P>... not terminated");
11502                     }
11503                     goto named_recursion;
11504                 }
11505                 else if (paren == '=') {   /* (?P=...)  named backref */
11506                     RExC_parse++;
11507                     return handle_named_backref(pRExC_state, flagp,
11508                                                 parse_start, ')');
11509                 }
11510                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11511                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11512 		vFAIL3("Sequence (%.*s...) not recognized",
11513                                 RExC_parse-seqstart, seqstart);
11514 		NOT_REACHED; /*NOTREACHED*/
11515             case '<':           /* (?<...) */
11516 		if (*RExC_parse == '!')
11517 		    paren = ',';
11518 		else if (*RExC_parse != '=')
11519               named_capture:
11520 		{               /* (?<...>) */
11521 		    char *name_start;
11522 		    SV *svname;
11523 		    paren= '>';
11524                 /* FALLTHROUGH */
11525             case '\'':          /* (?'...') */
11526                     name_start = RExC_parse;
11527                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11528 		    if (   RExC_parse == name_start
11529                         || RExC_parse >= RExC_end
11530                         || *RExC_parse != paren)
11531                     {
11532 		        vFAIL2("Sequence (?%c... not terminated",
11533 		            paren=='>' ? '<' : paren);
11534                     }
11535 		    {
11536 			HE *he_str;
11537 			SV *sv_dat = NULL;
11538                         if (!svname) /* shouldn't happen */
11539                             Perl_croak(aTHX_
11540                                 "panic: reg_scan_name returned NULL");
11541                         if (!RExC_paren_names) {
11542                             RExC_paren_names= newHV();
11543                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
11544 #ifdef DEBUGGING
11545                             RExC_paren_name_list= newAV();
11546                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11547 #endif
11548                         }
11549                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11550                         if ( he_str )
11551                             sv_dat = HeVAL(he_str);
11552                         if ( ! sv_dat ) {
11553                             /* croak baby croak */
11554                             Perl_croak(aTHX_
11555                                 "panic: paren_name hash element allocation failed");
11556                         } else if ( SvPOK(sv_dat) ) {
11557                             /* (?|...) can mean we have dupes so scan to check
11558                                its already been stored. Maybe a flag indicating
11559                                we are inside such a construct would be useful,
11560                                but the arrays are likely to be quite small, so
11561                                for now we punt -- dmq */
11562                             IV count = SvIV(sv_dat);
11563                             I32 *pv = (I32*)SvPVX(sv_dat);
11564                             IV i;
11565                             for ( i = 0 ; i < count ; i++ ) {
11566                                 if ( pv[i] == RExC_npar ) {
11567                                     count = 0;
11568                                     break;
11569                                 }
11570                             }
11571                             if ( count ) {
11572                                 pv = (I32*)SvGROW(sv_dat,
11573                                                 SvCUR(sv_dat) + sizeof(I32)+1);
11574                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11575                                 pv[count] = RExC_npar;
11576                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11577                             }
11578                         } else {
11579                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
11580                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
11581                                                                 sizeof(I32));
11582                             SvIOK_on(sv_dat);
11583                             SvIV_set(sv_dat, 1);
11584                         }
11585 #ifdef DEBUGGING
11586                         /* Yes this does cause a memory leak in debugging Perls
11587                          * */
11588                         if (!av_store(RExC_paren_name_list,
11589                                       RExC_npar, SvREFCNT_inc_NN(svname)))
11590                             SvREFCNT_dec_NN(svname);
11591 #endif
11592 
11593                         /*sv_dump(sv_dat);*/
11594                     }
11595                     nextchar(pRExC_state);
11596 		    paren = 1;
11597 		    goto capturing_parens;
11598 		}
11599 
11600                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11601 		RExC_in_lookbehind++;
11602 		RExC_parse++;
11603                 if (RExC_parse >= RExC_end) {
11604                     vFAIL("Sequence (?... not terminated");
11605                 }
11606 
11607                 /* FALLTHROUGH */
11608 	    case '=':           /* (?=...) */
11609 		RExC_seen_zerolen++;
11610                 break;
11611 	    case '!':           /* (?!...) */
11612 		RExC_seen_zerolen++;
11613 		/* check if we're really just a "FAIL" assertion */
11614                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11615                                         FALSE /* Don't force to /x */ );
11616 	        if (*RExC_parse == ')') {
11617                     ret=reganode(pRExC_state, OPFAIL, 0);
11618 	            nextchar(pRExC_state);
11619 	            return ret;
11620 	        }
11621 	        break;
11622 	    case '|':           /* (?|...) */
11623 	        /* branch reset, behave like a (?:...) except that
11624 	           buffers in alternations share the same numbers */
11625 	        paren = ':';
11626 	        after_freeze = freeze_paren = RExC_npar;
11627 
11628                 /* XXX This construct currently requires an extra pass.
11629                  * Investigation would be required to see if that could be
11630                  * changed */
11631                 REQUIRE_PARENS_PASS;
11632 	        break;
11633 	    case ':':           /* (?:...) */
11634 	    case '>':           /* (?>...) */
11635 		break;
11636 	    case '$':           /* (?$...) */
11637 	    case '@':           /* (?@...) */
11638 		vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11639 		break;
11640 	    case '0' :           /* (?0) */
11641 	    case 'R' :           /* (?R) */
11642                 if (RExC_parse == RExC_end || *RExC_parse != ')')
11643 		    FAIL("Sequence (?R) not terminated");
11644                 num = 0;
11645                 RExC_seen |= REG_RECURSE_SEEN;
11646 
11647                 /* XXX These constructs currently require an extra pass.
11648                  * It probably could be changed */
11649                 REQUIRE_PARENS_PASS;
11650 
11651 		*flagp |= POSTPONED;
11652                 goto gen_recurse_regop;
11653 		/*notreached*/
11654             /* named and numeric backreferences */
11655             case '&':            /* (?&NAME) */
11656                 parse_start = RExC_parse - 1;
11657               named_recursion:
11658                 {
11659                     SV *sv_dat = reg_scan_name(pRExC_state,
11660                                                REG_RSN_RETURN_DATA);
11661                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11662                 }
11663                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11664                     vFAIL("Sequence (?&... not terminated");
11665                 goto gen_recurse_regop;
11666                 /* NOTREACHED */
11667             case '+':
11668                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11669                     RExC_parse++;
11670                     vFAIL("Illegal pattern");
11671                 }
11672                 goto parse_recursion;
11673                 /* NOTREACHED*/
11674             case '-': /* (?-1) */
11675                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11676                     RExC_parse--; /* rewind to let it be handled later */
11677                     goto parse_flags;
11678                 }
11679                 /* FALLTHROUGH */
11680             case '1': case '2': case '3': case '4': /* (?1) */
11681 	    case '5': case '6': case '7': case '8': case '9':
11682 	        RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
11683               parse_recursion:
11684                 {
11685                     bool is_neg = FALSE;
11686                     UV unum;
11687                     parse_start = RExC_parse - 1; /* MJD */
11688                     if (*RExC_parse == '-') {
11689                         RExC_parse++;
11690                         is_neg = TRUE;
11691                     }
11692                     endptr = RExC_end;
11693                     if (grok_atoUV(RExC_parse, &unum, &endptr)
11694                         && unum <= I32_MAX
11695                     ) {
11696                         num = (I32)unum;
11697                         RExC_parse = (char*)endptr;
11698                     } else
11699                         num = I32_MAX;
11700                     if (is_neg) {
11701                         /* Some limit for num? */
11702                         num = -num;
11703                     }
11704                 }
11705 	        if (*RExC_parse!=')')
11706 	            vFAIL("Expecting close bracket");
11707 
11708               gen_recurse_regop:
11709                 if ( paren == '-' ) {
11710                     /*
11711                     Diagram of capture buffer numbering.
11712                     Top line is the normal capture buffer numbers
11713                     Bottom line is the negative indexing as from
11714                     the X (the (?-2))
11715 
11716                     +   1 2    3 4 5 X          6 7
11717                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11718                     -   5 4    3 2 1 X          x x
11719 
11720                     */
11721                     num = RExC_npar + num;
11722                     if (num < 1)  {
11723 
11724                         /* It might be a forward reference; we can't fail until
11725                          * we know, by completing the parse to get all the
11726                          * groups, and then reparsing */
11727                         if (ALL_PARENS_COUNTED)  {
11728                             RExC_parse++;
11729                             vFAIL("Reference to nonexistent group");
11730                         }
11731                         else {
11732                             REQUIRE_PARENS_PASS;
11733                         }
11734                     }
11735                 } else if ( paren == '+' ) {
11736                     num = RExC_npar + num - 1;
11737                 }
11738                 /* We keep track how many GOSUB items we have produced.
11739                    To start off the ARG2L() of the GOSUB holds its "id",
11740                    which is used later in conjunction with RExC_recurse
11741                    to calculate the offset we need to jump for the GOSUB,
11742                    which it will store in the final representation.
11743                    We have to defer the actual calculation until much later
11744                    as the regop may move.
11745                  */
11746 
11747                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11748                 if (num >= RExC_npar) {
11749 
11750                     /* It might be a forward reference; we can't fail until we
11751                      * know, by completing the parse to get all the groups, and
11752                      * then reparsing */
11753                     if (ALL_PARENS_COUNTED)  {
11754                         if (num >= RExC_total_parens) {
11755                             RExC_parse++;
11756                             vFAIL("Reference to nonexistent group");
11757                         }
11758                     }
11759                     else {
11760                         REQUIRE_PARENS_PASS;
11761                     }
11762                 }
11763                 RExC_recurse_count++;
11764                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11765                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11766                             22, "|    |", (int)(depth * 2 + 1), "",
11767                             (UV)ARG(REGNODE_p(ret)),
11768                             (IV)ARG2L(REGNODE_p(ret))));
11769                 RExC_seen |= REG_RECURSE_SEEN;
11770 
11771                 Set_Node_Length(REGNODE_p(ret),
11772                                 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11773 		Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11774 
11775                 *flagp |= POSTPONED;
11776                 assert(*RExC_parse == ')');
11777                 nextchar(pRExC_state);
11778                 return ret;
11779 
11780             /* NOTREACHED */
11781 
11782 	    case '?':           /* (??...) */
11783 		is_logical = 1;
11784 		if (*RExC_parse != '{') {
11785                     RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11786                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11787                     vFAIL2utf8f(
11788                         "Sequence (%" UTF8f "...) not recognized",
11789                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11790 		    NOT_REACHED; /*NOTREACHED*/
11791 		}
11792 		*flagp |= POSTPONED;
11793 		paren = '{';
11794                 RExC_parse++;
11795 		/* FALLTHROUGH */
11796 	    case '{':           /* (?{...}) */
11797 	    {
11798 		U32 n = 0;
11799 		struct reg_code_block *cb;
11800                 OP * o;
11801 
11802 		RExC_seen_zerolen++;
11803 
11804 		if (   !pRExC_state->code_blocks
11805 		    || pRExC_state->code_index
11806                                         >= pRExC_state->code_blocks->count
11807 		    || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11808 			!= (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11809 			    - RExC_start)
11810 		) {
11811 		    if (RExC_pm_flags & PMf_USE_RE_EVAL)
11812 			FAIL("panic: Sequence (?{...}): no code block found\n");
11813 		    FAIL("Eval-group not allowed at runtime, use re 'eval'");
11814 		}
11815 		/* this is a pre-compiled code block (?{...}) */
11816 		cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11817 		RExC_parse = RExC_start + cb->end;
11818 		o = cb->block;
11819                 if (cb->src_regex) {
11820                     n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11821                     RExC_rxi->data->data[n] =
11822                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
11823                     RExC_rxi->data->data[n+1] = (void*)o;
11824                 }
11825                 else {
11826                     n = add_data(pRExC_state,
11827                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11828                     RExC_rxi->data->data[n] = (void*)o;
11829                 }
11830 		pRExC_state->code_index++;
11831 		nextchar(pRExC_state);
11832 
11833 		if (is_logical) {
11834                     regnode_offset eval;
11835 		    ret = reg_node(pRExC_state, LOGICAL);
11836 
11837                     eval = reg2Lanode(pRExC_state, EVAL,
11838                                        n,
11839 
11840                                        /* for later propagation into (??{})
11841                                         * return value */
11842                                        RExC_flags & RXf_PMf_COMPILETIME
11843                                       );
11844                     FLAGS(REGNODE_p(ret)) = 2;
11845                     if (! REGTAIL(pRExC_state, ret, eval)) {
11846                         REQUIRE_BRANCHJ(flagp, 0);
11847                     }
11848                     /* deal with the length of this later - MJD */
11849 		    return ret;
11850 		}
11851 		ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11852 		Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11853 		Set_Node_Offset(REGNODE_p(ret), parse_start);
11854 		return ret;
11855 	    }
11856 	    case '(':           /* (?(?{...})...) and (?(?=...)...) */
11857 	    {
11858 	        int is_define= 0;
11859                 const int DEFINE_len = sizeof("DEFINE") - 1;
11860 		if (    RExC_parse < RExC_end - 1
11861                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
11862                             && (   RExC_parse[1] == '='
11863                                 || RExC_parse[1] == '!'
11864                                 || RExC_parse[1] == '<'
11865                                 || RExC_parse[1] == '{'))
11866 		        || (       RExC_parse[0] == '*'        /* (?(*...)) */
11867                             && (   memBEGINs(RExC_parse + 1,
11868                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11869                                          "pla:")
11870                                 || memBEGINs(RExC_parse + 1,
11871                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11872                                          "plb:")
11873                                 || memBEGINs(RExC_parse + 1,
11874                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11875                                          "nla:")
11876                                 || memBEGINs(RExC_parse + 1,
11877                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11878                                          "nlb:")
11879                                 || memBEGINs(RExC_parse + 1,
11880                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11881                                          "positive_lookahead:")
11882                                 || memBEGINs(RExC_parse + 1,
11883                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11884                                          "positive_lookbehind:")
11885                                 || memBEGINs(RExC_parse + 1,
11886                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11887                                          "negative_lookahead:")
11888                                 || memBEGINs(RExC_parse + 1,
11889                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11890                                          "negative_lookbehind:"))))
11891                 ) { /* Lookahead or eval. */
11892                     I32 flag;
11893                     regnode_offset tail;
11894 
11895                     ret = reg_node(pRExC_state, LOGICAL);
11896                     FLAGS(REGNODE_p(ret)) = 1;
11897 
11898                     tail = reg(pRExC_state, 1, &flag, depth+1);
11899                     RETURN_FAIL_ON_RESTART(flag, flagp);
11900                     if (! REGTAIL(pRExC_state, ret, tail)) {
11901                         REQUIRE_BRANCHJ(flagp, 0);
11902                     }
11903                     goto insert_if;
11904                 }
11905 		else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11906 		         || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11907 	        {
11908 	            char ch = RExC_parse[0] == '<' ? '>' : '\'';
11909 	            char *name_start= RExC_parse++;
11910 	            U32 num = 0;
11911 	            SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11912 	            if (   RExC_parse == name_start
11913                         || RExC_parse >= RExC_end
11914                         || *RExC_parse != ch)
11915                     {
11916                         vFAIL2("Sequence (?(%c... not terminated",
11917                             (ch == '>' ? '<' : ch));
11918                     }
11919                     RExC_parse++;
11920                     if (sv_dat) {
11921                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11922                         RExC_rxi->data->data[num]=(void*)sv_dat;
11923                         SvREFCNT_inc_simple_void_NN(sv_dat);
11924                     }
11925                     ret = reganode(pRExC_state, NGROUPP, num);
11926                     goto insert_if_check_paren;
11927 		}
11928 		else if (memBEGINs(RExC_parse,
11929                                    (STRLEN) (RExC_end - RExC_parse),
11930                                    "DEFINE"))
11931                 {
11932 		    ret = reganode(pRExC_state, DEFINEP, 0);
11933 		    RExC_parse += DEFINE_len;
11934 		    is_define = 1;
11935 		    goto insert_if_check_paren;
11936 		}
11937 		else if (RExC_parse[0] == 'R') {
11938 		    RExC_parse++;
11939                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
11940                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11941                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11942                      */
11943 		    parno = 0;
11944                     if (RExC_parse[0] == '0') {
11945                         parno = 1;
11946                         RExC_parse++;
11947                     }
11948                     else if (inRANGE(RExC_parse[0], '1', '9')) {
11949                         UV uv;
11950                         endptr = RExC_end;
11951                         if (grok_atoUV(RExC_parse, &uv, &endptr)
11952                             && uv <= I32_MAX
11953                         ) {
11954                             parno = (I32)uv + 1;
11955                             RExC_parse = (char*)endptr;
11956                         }
11957                         /* else "Switch condition not recognized" below */
11958 		    } else if (RExC_parse[0] == '&') {
11959 		        SV *sv_dat;
11960 		        RExC_parse++;
11961 		        sv_dat = reg_scan_name(pRExC_state,
11962                                                REG_RSN_RETURN_DATA);
11963                         if (sv_dat)
11964                             parno = 1 + *((I32 *)SvPVX(sv_dat));
11965 		    }
11966 		    ret = reganode(pRExC_state, INSUBP, parno);
11967 		    goto insert_if_check_paren;
11968 		}
11969                 else if (inRANGE(RExC_parse[0], '1', '9')) {
11970                     /* (?(1)...) */
11971 		    char c;
11972                     UV uv;
11973                     endptr = RExC_end;
11974                     if (grok_atoUV(RExC_parse, &uv, &endptr)
11975                         && uv <= I32_MAX
11976                     ) {
11977                         parno = (I32)uv;
11978                         RExC_parse = (char*)endptr;
11979                     }
11980                     else {
11981                         vFAIL("panic: grok_atoUV returned FALSE");
11982                     }
11983                     ret = reganode(pRExC_state, GROUPP, parno);
11984 
11985                  insert_if_check_paren:
11986 		    if (UCHARAT(RExC_parse) != ')') {
11987                         RExC_parse += UTF
11988                                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11989                                       : 1;
11990 			vFAIL("Switch condition not recognized");
11991 		    }
11992 		    nextchar(pRExC_state);
11993 		  insert_if:
11994                     if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
11995                                                              IFTHEN, 0)))
11996                     {
11997                         REQUIRE_BRANCHJ(flagp, 0);
11998                     }
11999                     br = regbranch(pRExC_state, &flags, 1, depth+1);
12000 		    if (br == 0) {
12001                         RETURN_FAIL_ON_RESTART(flags,flagp);
12002                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12003                               (UV) flags);
12004                     } else
12005                     if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12006                                                              LONGJMP, 0)))
12007                     {
12008                         REQUIRE_BRANCHJ(flagp, 0);
12009                     }
12010 		    c = UCHARAT(RExC_parse);
12011                     nextchar(pRExC_state);
12012 		    if (flags&HASWIDTH)
12013 			*flagp |= HASWIDTH;
12014 		    if (c == '|') {
12015 		        if (is_define)
12016 		            vFAIL("(?(DEFINE)....) does not allow branches");
12017 
12018                         /* Fake one for optimizer.  */
12019                         lastbr = reganode(pRExC_state, IFTHEN, 0);
12020 
12021                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12022                             RETURN_FAIL_ON_RESTART(flags, flagp);
12023                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12024                                   (UV) flags);
12025                         }
12026                         if (! REGTAIL(pRExC_state, ret, lastbr)) {
12027                             REQUIRE_BRANCHJ(flagp, 0);
12028                         }
12029 		 	if (flags&HASWIDTH)
12030 			    *flagp |= HASWIDTH;
12031                         c = UCHARAT(RExC_parse);
12032                         nextchar(pRExC_state);
12033 		    }
12034 		    else
12035 			lastbr = 0;
12036                     if (c != ')') {
12037                         if (RExC_parse >= RExC_end)
12038                             vFAIL("Switch (?(condition)... not terminated");
12039                         else
12040                             vFAIL("Switch (?(condition)... contains too many branches");
12041                     }
12042 		    ender = reg_node(pRExC_state, TAIL);
12043                     if (! REGTAIL(pRExC_state, br, ender)) {
12044                         REQUIRE_BRANCHJ(flagp, 0);
12045                     }
12046 		    if (lastbr) {
12047                         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12048                             REQUIRE_BRANCHJ(flagp, 0);
12049                         }
12050                         if (! REGTAIL(pRExC_state,
12051                                       REGNODE_OFFSET(
12052                                                  NEXTOPER(
12053                                                  NEXTOPER(REGNODE_p(lastbr)))),
12054                                       ender))
12055                         {
12056                             REQUIRE_BRANCHJ(flagp, 0);
12057                         }
12058 		    }
12059 		    else
12060                         if (! REGTAIL(pRExC_state, ret, ender)) {
12061                             REQUIRE_BRANCHJ(flagp, 0);
12062                         }
12063 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
12064                     RExC_size++; /* XXX WHY do we need this?!!
12065                                     For large programs it seems to be required
12066                                     but I can't figure out why. -- dmq*/
12067 #endif
12068 		    return ret;
12069 		}
12070                 RExC_parse += UTF
12071                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12072                               : 1;
12073                 vFAIL("Unknown switch condition (?(...))");
12074 	    }
12075 	    case '[':           /* (?[ ... ]) */
12076                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
12077                                          oregcomp_parse);
12078             case 0: /* A NUL */
12079 		RExC_parse--; /* for vFAIL to print correctly */
12080                 vFAIL("Sequence (? incomplete");
12081                 break;
12082 
12083             case ')':
12084                 if (RExC_strict) {  /* [perl #132851] */
12085                     ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12086                 }
12087                 /* FALLTHROUGH */
12088 	    default: /* e.g., (?i) */
12089 	        RExC_parse = (char *) seqstart + 1;
12090               parse_flags:
12091 		parse_lparen_question_flags(pRExC_state);
12092                 if (UCHARAT(RExC_parse) != ':') {
12093                     if (RExC_parse < RExC_end)
12094                         nextchar(pRExC_state);
12095                     *flagp = TRYAGAIN;
12096                     return 0;
12097                 }
12098                 paren = ':';
12099                 nextchar(pRExC_state);
12100                 ret = 0;
12101                 goto parse_rest;
12102             } /* end switch */
12103 	}
12104 	else {
12105             if (*RExC_parse == '{') {
12106                 ckWARNregdep(RExC_parse + 1,
12107                             "Unescaped left brace in regex is "
12108                             "deprecated here (and will be fatal "
12109                             "in Perl 5.32), passed through");
12110             }
12111             /* Not bothering to indent here, as the above 'else' is temporary
12112              * */
12113         if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
12114 	  capturing_parens:
12115 	    parno = RExC_npar;
12116 	    RExC_npar++;
12117             if (! ALL_PARENS_COUNTED) {
12118                 /* If we are in our first pass through (and maybe only pass),
12119                  * we  need to allocate memory for the capturing parentheses
12120                  * data structures.
12121                  */
12122 
12123                 if (!RExC_parens_buf_size) {
12124                     /* first guess at number of parens we might encounter */
12125                     RExC_parens_buf_size = 10;
12126 
12127                     /* setup RExC_open_parens, which holds the address of each
12128                      * OPEN tag, and to make things simpler for the 0 index the
12129                      * start of the program - this is used later for offsets */
12130                     Newxz(RExC_open_parens, RExC_parens_buf_size,
12131                             regnode_offset);
12132                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
12133 
12134                     /* setup RExC_close_parens, which holds the address of each
12135                      * CLOSE tag, and to make things simpler for the 0 index
12136                      * the end of the program - this is used later for offsets
12137                      * */
12138                     Newxz(RExC_close_parens, RExC_parens_buf_size,
12139                             regnode_offset);
12140                     /* we dont know where end op starts yet, so we dont need to
12141                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
12142                      * above */
12143                 }
12144                 else if (RExC_npar > RExC_parens_buf_size) {
12145                     I32 old_size = RExC_parens_buf_size;
12146 
12147                     RExC_parens_buf_size *= 2;
12148 
12149                     Renew(RExC_open_parens, RExC_parens_buf_size,
12150                             regnode_offset);
12151                     Zero(RExC_open_parens + old_size,
12152                             RExC_parens_buf_size - old_size, regnode_offset);
12153 
12154                     Renew(RExC_close_parens, RExC_parens_buf_size,
12155                             regnode_offset);
12156                     Zero(RExC_close_parens + old_size,
12157                             RExC_parens_buf_size - old_size, regnode_offset);
12158                 }
12159             }
12160 
12161 	    ret = reganode(pRExC_state, OPEN, parno);
12162             if (!RExC_nestroot)
12163                 RExC_nestroot = parno;
12164             if (RExC_open_parens && !RExC_open_parens[parno])
12165             {
12166                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12167                     "%*s%*s Setting open paren #%" IVdf " to %d\n",
12168                     22, "|    |", (int)(depth * 2 + 1), "",
12169                     (IV)parno, ret));
12170                 RExC_open_parens[parno]= ret;
12171             }
12172 
12173             Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12174             Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12175 	    is_open = 1;
12176 	} else {
12177             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12178             paren = ':';
12179 	    ret = 0;
12180 	}
12181         }
12182     }
12183     else                        /* ! paren */
12184 	ret = 0;
12185 
12186    parse_rest:
12187     /* Pick up the branches, linking them together. */
12188     parse_start = RExC_parse;   /* MJD */
12189     br = regbranch(pRExC_state, &flags, 1, depth+1);
12190 
12191     /*     branch_len = (paren != 0); */
12192 
12193     if (br == 0) {
12194         RETURN_FAIL_ON_RESTART(flags, flagp);
12195         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12196     }
12197     if (*RExC_parse == '|') {
12198 	if (RExC_use_BRANCHJ) {
12199 	    reginsert(pRExC_state, BRANCHJ, br, depth+1);
12200 	}
12201 	else {                  /* MJD */
12202 	    reginsert(pRExC_state, BRANCH, br, depth+1);
12203             Set_Node_Length(REGNODE_p(br), paren != 0);
12204             Set_Node_Offset_To_R(br, parse_start-RExC_start);
12205         }
12206 	have_branch = 1;
12207     }
12208     else if (paren == ':') {
12209 	*flagp |= flags&SIMPLE;
12210     }
12211     if (is_open) {				/* Starts with OPEN. */
12212         if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
12213             REQUIRE_BRANCHJ(flagp, 0);
12214         }
12215     }
12216     else if (paren != '?')		/* Not Conditional */
12217 	ret = br;
12218     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12219     lastbr = br;
12220     while (*RExC_parse == '|') {
12221 	if (RExC_use_BRANCHJ) {
12222             bool shut_gcc_up;
12223 
12224 	    ender = reganode(pRExC_state, LONGJMP, 0);
12225 
12226             /* Append to the previous. */
12227             shut_gcc_up = REGTAIL(pRExC_state,
12228                          REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12229                          ender);
12230             PERL_UNUSED_VAR(shut_gcc_up);
12231 	}
12232 	nextchar(pRExC_state);
12233 	if (freeze_paren) {
12234 	    if (RExC_npar > after_freeze)
12235 	        after_freeze = RExC_npar;
12236             RExC_npar = freeze_paren;
12237         }
12238         br = regbranch(pRExC_state, &flags, 0, depth+1);
12239 
12240 	if (br == 0) {
12241             RETURN_FAIL_ON_RESTART(flags, flagp);
12242             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12243         }
12244         if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
12245             REQUIRE_BRANCHJ(flagp, 0);
12246         }
12247 	lastbr = br;
12248 	*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12249     }
12250 
12251     if (have_branch || paren != ':') {
12252         regnode * br;
12253 
12254 	/* Make a closing node, and hook it on the end. */
12255 	switch (paren) {
12256 	case ':':
12257 	    ender = reg_node(pRExC_state, TAIL);
12258 	    break;
12259 	case 1: case 2:
12260 	    ender = reganode(pRExC_state, CLOSE, parno);
12261             if ( RExC_close_parens ) {
12262                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12263                         "%*s%*s Setting close paren #%" IVdf " to %d\n",
12264                         22, "|    |", (int)(depth * 2 + 1), "",
12265                         (IV)parno, ender));
12266                 RExC_close_parens[parno]= ender;
12267 	        if (RExC_nestroot == parno)
12268 	            RExC_nestroot = 0;
12269 	    }
12270             Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12271             Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12272 	    break;
12273 	case 's':
12274 	    ender = reg_node(pRExC_state, SRCLOSE);
12275             RExC_in_script_run = 0;
12276 	    break;
12277 	case '<':
12278         case 'a':
12279         case 'A':
12280         case 'b':
12281         case 'B':
12282 	case ',':
12283 	case '=':
12284 	case '!':
12285 	    *flagp &= ~HASWIDTH;
12286 	    /* FALLTHROUGH */
12287         case 't':   /* aTomic */
12288 	case '>':
12289 	    ender = reg_node(pRExC_state, SUCCEED);
12290 	    break;
12291 	case 0:
12292 	    ender = reg_node(pRExC_state, END);
12293             assert(!RExC_end_op); /* there can only be one! */
12294             RExC_end_op = REGNODE_p(ender);
12295             if (RExC_close_parens) {
12296                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12297                     "%*s%*s Setting close paren #0 (END) to %d\n",
12298                     22, "|    |", (int)(depth * 2 + 1), "",
12299                     ender));
12300 
12301                 RExC_close_parens[0]= ender;
12302             }
12303 	    break;
12304 	}
12305         DEBUG_PARSE_r({
12306             DEBUG_PARSE_MSG("lsbr");
12307             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12308             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12309             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12310                           SvPV_nolen_const(RExC_mysv1),
12311                           (IV)lastbr,
12312                           SvPV_nolen_const(RExC_mysv2),
12313                           (IV)ender,
12314                           (IV)(ender - lastbr)
12315             );
12316         });
12317         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12318             REQUIRE_BRANCHJ(flagp, 0);
12319         }
12320 
12321 	if (have_branch) {
12322             char is_nothing= 1;
12323 	    if (depth==1)
12324                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12325 
12326 	    /* Hook the tails of the branches to the closing node. */
12327 	    for (br = REGNODE_p(ret); br; br = regnext(br)) {
12328 		const U8 op = PL_regkind[OP(br)];
12329 		if (op == BRANCH) {
12330                     if (! REGTAIL_STUDY(pRExC_state,
12331                                         REGNODE_OFFSET(NEXTOPER(br)),
12332                                         ender))
12333                     {
12334                         REQUIRE_BRANCHJ(flagp, 0);
12335                     }
12336                     if ( OP(NEXTOPER(br)) != NOTHING
12337                          || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12338                         is_nothing= 0;
12339 		}
12340 		else if (op == BRANCHJ) {
12341                     bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12342                                         REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12343                                         ender);
12344                     PERL_UNUSED_VAR(shut_gcc_up);
12345                     /* for now we always disable this optimisation * /
12346                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12347                          || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12348                     */
12349                         is_nothing= 0;
12350 		}
12351 	    }
12352             if (is_nothing) {
12353                 regnode * ret_as_regnode = REGNODE_p(ret);
12354                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12355                                ? regnext(ret_as_regnode)
12356                                : ret_as_regnode;
12357                 DEBUG_PARSE_r({
12358                     DEBUG_PARSE_MSG("NADA");
12359                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12360                                      NULL, pRExC_state);
12361                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12362                                      NULL, pRExC_state);
12363                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12364                                   SvPV_nolen_const(RExC_mysv1),
12365                                   (IV)REG_NODE_NUM(ret_as_regnode),
12366                                   SvPV_nolen_const(RExC_mysv2),
12367                                   (IV)ender,
12368                                   (IV)(ender - ret)
12369                     );
12370                 });
12371                 OP(br)= NOTHING;
12372                 if (OP(REGNODE_p(ender)) == TAIL) {
12373                     NEXT_OFF(br)= 0;
12374                     RExC_emit= REGNODE_OFFSET(br) + 1;
12375                 } else {
12376                     regnode *opt;
12377                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12378                         OP(opt)= OPTIMIZED;
12379                     NEXT_OFF(br)= REGNODE_p(ender) - br;
12380                 }
12381             }
12382 	}
12383     }
12384 
12385     {
12386         const char *p;
12387          /* Even/odd or x=don't care: 010101x10x */
12388         static const char parens[] = "=!aA<,>Bbt";
12389          /* flag below is set to 0 up through 'A'; 1 for larger */
12390 
12391 	if (paren && (p = strchr(parens, paren))) {
12392 	    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12393 	    int flag = (p - parens) > 3;
12394 
12395 	    if (paren == '>' || paren == 't') {
12396 		node = SUSPEND, flag = 0;
12397             }
12398 
12399 	    reginsert(pRExC_state, node, ret, depth+1);
12400             Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12401 	    Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12402 	    FLAGS(REGNODE_p(ret)) = flag;
12403             if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12404             {
12405                 REQUIRE_BRANCHJ(flagp, 0);
12406             }
12407 	}
12408     }
12409 
12410     /* Check for proper termination. */
12411     if (paren) {
12412         /* restore original flags, but keep (?p) and, if we've encountered
12413          * something in the parse that changes /d rules into /u, keep the /u */
12414 	RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12415         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12416             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12417         }
12418 	if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12419 	    RExC_parse = oregcomp_parse;
12420 	    vFAIL("Unmatched (");
12421 	}
12422 	nextchar(pRExC_state);
12423     }
12424     else if (!paren && RExC_parse < RExC_end) {
12425 	if (*RExC_parse == ')') {
12426 	    RExC_parse++;
12427 	    vFAIL("Unmatched )");
12428 	}
12429 	else
12430 	    FAIL("Junk on end of regexp");	/* "Can't happen". */
12431 	NOT_REACHED; /* NOTREACHED */
12432     }
12433 
12434     if (RExC_in_lookbehind) {
12435 	RExC_in_lookbehind--;
12436     }
12437     if (after_freeze > RExC_npar)
12438         RExC_npar = after_freeze;
12439     return(ret);
12440 }
12441 
12442 /*
12443  - regbranch - one alternative of an | operator
12444  *
12445  * Implements the concatenation operator.
12446  *
12447  * On success, returns the offset at which any next node should be placed into
12448  * the regex engine program being compiled.
12449  *
12450  * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12451  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12452  * UTF-8
12453  */
12454 STATIC regnode_offset
S_regbranch(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,I32 first,U32 depth)12455 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12456 {
12457     regnode_offset ret;
12458     regnode_offset chain = 0;
12459     regnode_offset latest;
12460     I32 flags = 0, c = 0;
12461     GET_RE_DEBUG_FLAGS_DECL;
12462 
12463     PERL_ARGS_ASSERT_REGBRANCH;
12464 
12465     DEBUG_PARSE("brnc");
12466 
12467     if (first)
12468 	ret = 0;
12469     else {
12470 	if (RExC_use_BRANCHJ)
12471 	    ret = reganode(pRExC_state, BRANCHJ, 0);
12472 	else {
12473 	    ret = reg_node(pRExC_state, BRANCH);
12474             Set_Node_Length(REGNODE_p(ret), 1);
12475         }
12476     }
12477 
12478     *flagp = WORST;			/* Tentatively. */
12479 
12480     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12481                             FALSE /* Don't force to /x */ );
12482     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12483 	flags &= ~TRYAGAIN;
12484         latest = regpiece(pRExC_state, &flags, depth+1);
12485 	if (latest == 0) {
12486 	    if (flags & TRYAGAIN)
12487 		continue;
12488             RETURN_FAIL_ON_RESTART(flags, flagp);
12489             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12490 	}
12491 	else if (ret == 0)
12492             ret = latest;
12493 	*flagp |= flags&(HASWIDTH|POSTPONED);
12494 	if (chain == 0) 	/* First piece. */
12495 	    *flagp |= flags&SPSTART;
12496 	else {
12497 	    /* FIXME adding one for every branch after the first is probably
12498 	     * excessive now we have TRIE support. (hv) */
12499 	    MARK_NAUGHTY(1);
12500             if (! REGTAIL(pRExC_state, chain, latest)) {
12501                 /* XXX We could just redo this branch, but figuring out what
12502                  * bookkeeping needs to be reset is a pain, and it's likely
12503                  * that other branches that goto END will also be too large */
12504                 REQUIRE_BRANCHJ(flagp, 0);
12505             }
12506 	}
12507 	chain = latest;
12508 	c++;
12509     }
12510     if (chain == 0) {	/* Loop ran zero times. */
12511 	chain = reg_node(pRExC_state, NOTHING);
12512 	if (ret == 0)
12513 	    ret = chain;
12514     }
12515     if (c == 1) {
12516 	*flagp |= flags&SIMPLE;
12517     }
12518 
12519     return ret;
12520 }
12521 
12522 /*
12523  - regpiece - something followed by possible quantifier * + ? {n,m}
12524  *
12525  * Note that the branching code sequences used for ? and the general cases
12526  * of * and + are somewhat optimized:  they use the same NOTHING node as
12527  * both the endmarker for their branch list and the body of the last branch.
12528  * It might seem that this node could be dispensed with entirely, but the
12529  * endmarker role is not redundant.
12530  *
12531  * On success, returns the offset at which any next node should be placed into
12532  * the regex engine program being compiled.
12533  *
12534  * Returns 0 otherwise, with *flagp set to indicate why:
12535  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
12536  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
12537  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12538  */
12539 STATIC regnode_offset
S_regpiece(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)12540 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12541 {
12542     regnode_offset ret;
12543     char op;
12544     char *next;
12545     I32 flags;
12546     const char * const origparse = RExC_parse;
12547     I32 min;
12548     I32 max = REG_INFTY;
12549 #ifdef RE_TRACK_PATTERN_OFFSETS
12550     char *parse_start;
12551 #endif
12552     const char *maxpos = NULL;
12553     UV uv;
12554 
12555     /* Save the original in case we change the emitted regop to a FAIL. */
12556     const regnode_offset orig_emit = RExC_emit;
12557 
12558     GET_RE_DEBUG_FLAGS_DECL;
12559 
12560     PERL_ARGS_ASSERT_REGPIECE;
12561 
12562     DEBUG_PARSE("piec");
12563 
12564     ret = regatom(pRExC_state, &flags, depth+1);
12565     if (ret == 0) {
12566         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12567         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12568     }
12569 
12570     op = *RExC_parse;
12571 
12572     if (op == '{' && regcurly(RExC_parse)) {
12573 	maxpos = NULL;
12574 #ifdef RE_TRACK_PATTERN_OFFSETS
12575         parse_start = RExC_parse; /* MJD */
12576 #endif
12577 	next = RExC_parse + 1;
12578 	while (isDIGIT(*next) || *next == ',') {
12579 	    if (*next == ',') {
12580 		if (maxpos)
12581 		    break;
12582 		else
12583 		    maxpos = next;
12584 	    }
12585 	    next++;
12586 	}
12587 	if (*next == '}') {		/* got one */
12588             const char* endptr;
12589 	    if (!maxpos)
12590 		maxpos = next;
12591 	    RExC_parse++;
12592             if (isDIGIT(*RExC_parse)) {
12593                 endptr = RExC_end;
12594                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12595                     vFAIL("Invalid quantifier in {,}");
12596                 if (uv >= REG_INFTY)
12597                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12598                 min = (I32)uv;
12599             } else {
12600                 min = 0;
12601             }
12602 	    if (*maxpos == ',')
12603 		maxpos++;
12604 	    else
12605 		maxpos = RExC_parse;
12606             if (isDIGIT(*maxpos)) {
12607                 endptr = RExC_end;
12608                 if (!grok_atoUV(maxpos, &uv, &endptr))
12609                     vFAIL("Invalid quantifier in {,}");
12610                 if (uv >= REG_INFTY)
12611                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12612                 max = (I32)uv;
12613             } else {
12614 		max = REG_INFTY;		/* meaning "infinity" */
12615             }
12616 	    RExC_parse = next;
12617 	    nextchar(pRExC_state);
12618             if (max < min) {    /* If can't match, warn and optimize to fail
12619                                    unconditionally */
12620                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12621                 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12622                 NEXT_OFF(REGNODE_p(orig_emit)) =
12623                                     regarglen[OPFAIL] + NODE_STEP_REGNODE;
12624                 return ret;
12625             }
12626             else if (min == max && *RExC_parse == '?')
12627             {
12628                 ckWARN2reg(RExC_parse + 1,
12629                            "Useless use of greediness modifier '%c'",
12630                            *RExC_parse);
12631             }
12632 
12633 	  do_curly:
12634 	    if ((flags&SIMPLE)) {
12635                 if (min == 0 && max == REG_INFTY) {
12636                     reginsert(pRExC_state, STAR, ret, depth+1);
12637                     MARK_NAUGHTY(4);
12638                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12639                     goto nest_check;
12640                 }
12641                 if (min == 1 && max == REG_INFTY) {
12642                     reginsert(pRExC_state, PLUS, ret, depth+1);
12643                     MARK_NAUGHTY(3);
12644                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12645                     goto nest_check;
12646                 }
12647                 MARK_NAUGHTY_EXP(2, 2);
12648 		reginsert(pRExC_state, CURLY, ret, depth+1);
12649                 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12650                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12651 	    }
12652 	    else {
12653 		const regnode_offset w = reg_node(pRExC_state, WHILEM);
12654 
12655 		FLAGS(REGNODE_p(w)) = 0;
12656                 if (!  REGTAIL(pRExC_state, ret, w)) {
12657                     REQUIRE_BRANCHJ(flagp, 0);
12658                 }
12659 		if (RExC_use_BRANCHJ) {
12660 		    reginsert(pRExC_state, LONGJMP, ret, depth+1);
12661 		    reginsert(pRExC_state, NOTHING, ret, depth+1);
12662 		    NEXT_OFF(REGNODE_p(ret)) = 3;	/* Go over LONGJMP. */
12663 		}
12664 		reginsert(pRExC_state, CURLYX, ret, depth+1);
12665                                 /* MJD hk */
12666                 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12667                 Set_Node_Length(REGNODE_p(ret),
12668                                 op == '{' ? (RExC_parse - parse_start) : 1);
12669 
12670 		if (RExC_use_BRANCHJ)
12671                     NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
12672                                                        LONGJMP. */
12673                 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12674                                                           NOTHING)))
12675                 {
12676                     REQUIRE_BRANCHJ(flagp, 0);
12677                 }
12678                 RExC_whilem_seen++;
12679                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
12680 	    }
12681 	    FLAGS(REGNODE_p(ret)) = 0;
12682 
12683 	    if (min > 0)
12684 		*flagp = WORST;
12685 	    if (max > 0)
12686 		*flagp |= HASWIDTH;
12687             ARG1_SET(REGNODE_p(ret), (U16)min);
12688             ARG2_SET(REGNODE_p(ret), (U16)max);
12689             if (max == REG_INFTY)
12690                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12691 
12692 	    goto nest_check;
12693 	}
12694     }
12695 
12696     if (!ISMULT1(op)) {
12697 	*flagp = flags;
12698 	return(ret);
12699     }
12700 
12701 #if 0				/* Now runtime fix should be reliable. */
12702 
12703     /* if this is reinstated, don't forget to put this back into perldiag:
12704 
12705 	    =item Regexp *+ operand could be empty at {#} in regex m/%s/
12706 
12707 	   (F) The part of the regexp subject to either the * or + quantifier
12708            could match an empty string. The {#} shows in the regular
12709            expression about where the problem was discovered.
12710 
12711     */
12712 
12713     if (!(flags&HASWIDTH) && op != '?')
12714       vFAIL("Regexp *+ operand could be empty");
12715 #endif
12716 
12717 #ifdef RE_TRACK_PATTERN_OFFSETS
12718     parse_start = RExC_parse;
12719 #endif
12720     nextchar(pRExC_state);
12721 
12722     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12723 
12724     if (op == '*') {
12725 	min = 0;
12726 	goto do_curly;
12727     }
12728     else if (op == '+') {
12729 	min = 1;
12730 	goto do_curly;
12731     }
12732     else if (op == '?') {
12733 	min = 0; max = 1;
12734 	goto do_curly;
12735     }
12736   nest_check:
12737     if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12738 	ckWARN2reg(RExC_parse,
12739 		   "%" UTF8f " matches null string many times",
12740 		   UTF8fARG(UTF, (RExC_parse >= origparse
12741                                  ? RExC_parse - origparse
12742                                  : 0),
12743 		   origparse));
12744     }
12745 
12746     if (*RExC_parse == '?') {
12747 	nextchar(pRExC_state);
12748 	reginsert(pRExC_state, MINMOD, ret, depth+1);
12749         if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12750             REQUIRE_BRANCHJ(flagp, 0);
12751         }
12752     }
12753     else if (*RExC_parse == '+') {
12754         regnode_offset ender;
12755         nextchar(pRExC_state);
12756         ender = reg_node(pRExC_state, SUCCEED);
12757         if (! REGTAIL(pRExC_state, ret, ender)) {
12758             REQUIRE_BRANCHJ(flagp, 0);
12759         }
12760         reginsert(pRExC_state, SUSPEND, ret, depth+1);
12761         ender = reg_node(pRExC_state, TAIL);
12762         if (! REGTAIL(pRExC_state, ret, ender)) {
12763             REQUIRE_BRANCHJ(flagp, 0);
12764         }
12765     }
12766 
12767     if (ISMULT2(RExC_parse)) {
12768 	RExC_parse++;
12769 	vFAIL("Nested quantifiers");
12770     }
12771 
12772     return(ret);
12773 }
12774 
12775 STATIC bool
S_grok_bslash_N(pTHX_ RExC_state_t * pRExC_state,regnode_offset * node_p,UV * code_point_p,int * cp_count,I32 * flagp,const bool strict,const U32 depth)12776 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12777                 regnode_offset * node_p,
12778                 UV * code_point_p,
12779                 int * cp_count,
12780                 I32 * flagp,
12781                 const bool strict,
12782                 const U32 depth
12783     )
12784 {
12785  /* This routine teases apart the various meanings of \N and returns
12786   * accordingly.  The input parameters constrain which meaning(s) is/are valid
12787   * in the current context.
12788   *
12789   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12790   *
12791   * If <code_point_p> is not NULL, the context is expecting the result to be a
12792   * single code point.  If this \N instance turns out to a single code point,
12793   * the function returns TRUE and sets *code_point_p to that code point.
12794   *
12795   * If <node_p> is not NULL, the context is expecting the result to be one of
12796   * the things representable by a regnode.  If this \N instance turns out to be
12797   * one such, the function generates the regnode, returns TRUE and sets *node_p
12798   * to point to the offset of that regnode into the regex engine program being
12799   * compiled.
12800   *
12801   * If this instance of \N isn't legal in any context, this function will
12802   * generate a fatal error and not return.
12803   *
12804   * On input, RExC_parse should point to the first char following the \N at the
12805   * time of the call.  On successful return, RExC_parse will have been updated
12806   * to point to just after the sequence identified by this routine.  Also
12807   * *flagp has been updated as needed.
12808   *
12809   * When there is some problem with the current context and this \N instance,
12810   * the function returns FALSE, without advancing RExC_parse, nor setting
12811   * *node_p, nor *code_point_p, nor *flagp.
12812   *
12813   * If <cp_count> is not NULL, the caller wants to know the length (in code
12814   * points) that this \N sequence matches.  This is set, and the input is
12815   * parsed for errors, even if the function returns FALSE, as detailed below.
12816   *
12817   * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12818   *
12819   * Probably the most common case is for the \N to specify a single code point.
12820   * *cp_count will be set to 1, and *code_point_p will be set to that code
12821   * point.
12822   *
12823   * Another possibility is for the input to be an empty \N{}.  This is no
12824   * longer accepted, and will generate a fatal error.
12825   *
12826   * Another possibility is for a custom charnames handler to be in effect which
12827   * translates the input name to an empty string.  *cp_count will be set to 0.
12828   * *node_p will be set to a generated NOTHING node.
12829   *
12830   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12831   * set to 0. *node_p will be set to a generated REG_ANY node.
12832   *
12833   * The fifth possibility is that \N resolves to a sequence of more than one
12834   * code points.  *cp_count will be set to the number of code points in the
12835   * sequence. *node_p will be set to a generated node returned by this
12836   * function calling S_reg().
12837   *
12838   * The final possibility is that it is premature to be calling this function;
12839   * the parse needs to be restarted.  This can happen when this changes from
12840   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
12841   * latter occurs only when the fifth possibility would otherwise be in
12842   * effect, and is because one of those code points requires the pattern to be
12843   * recompiled as UTF-8.  The function returns FALSE, and sets the
12844   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
12845   * happens, the caller needs to desist from continuing parsing, and return
12846   * this information to its caller.  This is not set for when there is only one
12847   * code point, as this can be called as part of an ANYOF node, and they can
12848   * store above-Latin1 code points without the pattern having to be in UTF-8.
12849   *
12850   * For non-single-quoted regexes, the tokenizer has resolved character and
12851   * sequence names inside \N{...} into their Unicode values, normalizing the
12852   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12853   * hex-represented code points in the sequence.  This is done there because
12854   * the names can vary based on what charnames pragma is in scope at the time,
12855   * so we need a way to take a snapshot of what they resolve to at the time of
12856   * the original parse. [perl #56444].
12857   *
12858   * That parsing is skipped for single-quoted regexes, so here we may get
12859   * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
12860   * like '\N{U+41}', that code point is Unicode, and has to be translated into
12861   * the native character set for non-ASCII platforms.  The other possibilities
12862   * are already native, so no translation is done. */
12863 
12864     char * endbrace;    /* points to '}' following the name */
12865     char* p = RExC_parse; /* Temporary */
12866 
12867     SV * substitute_parse = NULL;
12868     char *orig_end;
12869     char *save_start;
12870     I32 flags;
12871 
12872     GET_RE_DEBUG_FLAGS_DECL;
12873 
12874     PERL_ARGS_ASSERT_GROK_BSLASH_N;
12875 
12876     GET_RE_DEBUG_FLAGS;
12877 
12878     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
12879     assert(! (node_p && cp_count));               /* At most 1 should be set */
12880 
12881     if (cp_count) {     /* Initialize return for the most common case */
12882         *cp_count = 1;
12883     }
12884 
12885     /* The [^\n] meaning of \N ignores spaces and comments under the /x
12886      * modifier.  The other meanings do not, so use a temporary until we find
12887      * out which we are being called with */
12888     skip_to_be_ignored_text(pRExC_state, &p,
12889                             FALSE /* Don't force to /x */ );
12890 
12891     /* Disambiguate between \N meaning a named character versus \N meaning
12892      * [^\n].  The latter is assumed when the {...} following the \N is a legal
12893      * quantifier, or if there is no '{' at all */
12894     if (*p != '{' || regcurly(p)) {
12895         RExC_parse = p;
12896         if (cp_count) {
12897             *cp_count = -1;
12898         }
12899 
12900         if (! node_p) {
12901             return FALSE;
12902         }
12903 
12904         *node_p = reg_node(pRExC_state, REG_ANY);
12905         *flagp |= HASWIDTH|SIMPLE;
12906         MARK_NAUGHTY(1);
12907         Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12908         return TRUE;
12909     }
12910 
12911     /* The test above made sure that the next real character is a '{', but
12912      * under the /x modifier, it could be separated by space (or a comment and
12913      * \n) and this is not allowed (for consistency with \x{...} and the
12914      * tokenizer handling of \N{NAME}). */
12915     if (*RExC_parse != '{') {
12916         vFAIL("Missing braces on \\N{}");
12917     }
12918 
12919     RExC_parse++;       /* Skip past the '{' */
12920 
12921     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12922     if (! endbrace) { /* no trailing brace */
12923         vFAIL2("Missing right brace on \\%c{}", 'N');
12924     }
12925 
12926     /* Here, we have decided it should be a named character or sequence.  These
12927      * imply Unicode semantics */
12928     REQUIRE_UNI_RULES(flagp, FALSE);
12929 
12930     /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
12931      * nothing at all (not allowed under strict) */
12932     if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
12933         RExC_parse = endbrace;
12934         if (strict) {
12935             RExC_parse++;   /* Position after the "}" */
12936             vFAIL("Zero length \\N{}");
12937         }
12938 
12939         if (cp_count) {
12940             *cp_count = 0;
12941         }
12942         nextchar(pRExC_state);
12943         if (! node_p) {
12944             return FALSE;
12945         }
12946 
12947         *node_p = reg_node(pRExC_state, NOTHING);
12948         return TRUE;
12949     }
12950 
12951     if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
12952 
12953         /* Here, the name isn't of the form  U+....  This can happen if the
12954          * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
12955          * is the time to find out what the name means */
12956 
12957         const STRLEN name_len = endbrace - RExC_parse;
12958         SV *  value_sv;     /* What does this name evaluate to */
12959         SV ** value_svp;
12960         const U8 * value;   /* string of name's value */
12961         STRLEN value_len;   /* and its length */
12962 
12963         /*  RExC_unlexed_names is a hash of names that weren't evaluated by
12964          *  toke.c, and their values. Make sure is initialized */
12965         if (! RExC_unlexed_names) {
12966             RExC_unlexed_names = newHV();
12967         }
12968 
12969         /* If we have already seen this name in this pattern, use that.  This
12970          * allows us to only call the charnames handler once per name per
12971          * pattern.  A broken or malicious handler could return something
12972          * different each time, which could cause the results to vary depending
12973          * on if something gets added or subtracted from the pattern that
12974          * causes the number of passes to change, for example */
12975         if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
12976                                                       name_len, 0)))
12977         {
12978             value_sv = *value_svp;
12979         }
12980         else { /* Otherwise we have to go out and get the name */
12981             const char * error_msg = NULL;
12982             value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
12983                                                       UTF,
12984                                                       &error_msg);
12985             if (error_msg) {
12986                 RExC_parse = endbrace;
12987                 vFAIL(error_msg);
12988             }
12989 
12990             /* If no error message, should have gotten a valid return */
12991             assert (value_sv);
12992 
12993             /* Save the name's meaning for later use */
12994             if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
12995                            value_sv, 0))
12996             {
12997                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
12998             }
12999         }
13000 
13001         /* Here, we have the value the name evaluates to in 'value_sv' */
13002         value = (U8 *) SvPV(value_sv, value_len);
13003 
13004         /* See if the result is one code point vs 0 or multiple */
13005         if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
13006                                                ? UTF8SKIP(value)
13007                                                : 1))
13008         {
13009             /* Here, exactly one code point.  If that isn't what is wanted,
13010              * fail */
13011             if (! code_point_p) {
13012                 RExC_parse = p;
13013                 return FALSE;
13014             }
13015 
13016             /* Convert from string to numeric code point */
13017             *code_point_p = (SvUTF8(value_sv))
13018                             ? valid_utf8_to_uvchr(value, NULL)
13019                             : *value;
13020 
13021             /* Have parsed this entire single code point \N{...}.  *cp_count
13022              * has already been set to 1, so don't do it again. */
13023             RExC_parse = endbrace;
13024             nextchar(pRExC_state);
13025             return TRUE;
13026         } /* End of is a single code point */
13027 
13028         /* Count the code points, if caller desires.  The API says to do this
13029          * even if we will later return FALSE */
13030         if (cp_count) {
13031             *cp_count = 0;
13032 
13033             *cp_count = (SvUTF8(value_sv))
13034                         ? utf8_length(value, value + value_len)
13035                         : value_len;
13036         }
13037 
13038         /* Fail if caller doesn't want to handle a multi-code-point sequence.
13039          * But don't back the pointer up if the caller wants to know how many
13040          * code points there are (they need to handle it themselves in this
13041          * case).  */
13042         if (! node_p) {
13043             if (! cp_count) {
13044                 RExC_parse = p;
13045             }
13046             return FALSE;
13047         }
13048 
13049         /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13050          * reg recursively to parse it.  That way, it retains its atomicness,
13051          * while not having to worry about any special handling that some code
13052          * points may have. */
13053 
13054         substitute_parse = newSVpvs("?:");
13055         sv_catsv(substitute_parse, value_sv);
13056         sv_catpv(substitute_parse, ")");
13057 
13058 #ifdef EBCDIC
13059         /* The value should already be native, so no need to convert on EBCDIC
13060          * platforms.*/
13061         assert(! RExC_recode_x_to_native);
13062 #endif
13063 
13064     }
13065     else {   /* \N{U+...} */
13066         Size_t count = 0;   /* code point count kept internally */
13067 
13068         /* We can get to here when the input is \N{U+...} or when toke.c has
13069          * converted a name to the \N{U+...} form.  This include changing a
13070          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13071 
13072         RExC_parse += 2;    /* Skip past the 'U+' */
13073 
13074         /* Code points are separated by dots.  The '}' terminates the whole
13075          * thing. */
13076 
13077         do {    /* Loop until the ending brace */
13078             UV cp = 0;
13079             char * start_digit;     /* The first of the current code point */
13080             if (! isXDIGIT(*RExC_parse)) {
13081                 RExC_parse++;
13082                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13083             }
13084 
13085             start_digit = RExC_parse;
13086             count++;
13087 
13088             /* Loop through the hex digits of the current code point */
13089             do {
13090                 /* Adding this digit will shift the result 4 bits.  If that
13091                  * result would be above the legal max, it's overflow */
13092                 if (cp > MAX_LEGAL_CP >> 4) {
13093 
13094                     /* Find the end of the code point */
13095                     do {
13096                         RExC_parse ++;
13097                     } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
13098 
13099                     /* Be sure to synchronize this message with the similar one
13100                      * in utf8.c */
13101                     vFAIL4("Use of code point 0x%.*s is not allowed; the"
13102                         " permissible max is 0x%" UVxf,
13103                         (int) (RExC_parse - start_digit), start_digit,
13104                         MAX_LEGAL_CP);
13105                 }
13106 
13107                 /* Accumulate this (valid) digit into the running total */
13108                 cp  = (cp << 4) + READ_XDIGIT(RExC_parse);
13109 
13110                 /* READ_XDIGIT advanced the input pointer.  Ignore a single
13111                  * underscore separator */
13112                 if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
13113                     RExC_parse++;
13114                 }
13115             } while (isXDIGIT(*RExC_parse));
13116 
13117             /* Here, have accumulated the next code point */
13118             if (RExC_parse >= endbrace) {   /* If done ... */
13119                 if (count != 1) {
13120                     goto do_concat;
13121                 }
13122 
13123                 /* Here, is a single code point; fail if doesn't want that */
13124                 if (! code_point_p) {
13125                     RExC_parse = p;
13126                     return FALSE;
13127                 }
13128 
13129                 /* A single code point is easy to handle; just return it */
13130                 *code_point_p = UNI_TO_NATIVE(cp);
13131                 RExC_parse = endbrace;
13132                 nextchar(pRExC_state);
13133                 return TRUE;
13134             }
13135 
13136             /* Here, the only legal thing would be a multiple character
13137              * sequence (of the form "\N{U+c1.c2. ... }".   So the next
13138              * character must be a dot (and the one after that can't be the
13139              * endbrace, or we'd have something like \N{U+100.} ) */
13140             if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13141                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
13142                                 ? UTF8SKIP(RExC_parse)
13143                                 : 1;
13144                 if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
13145                     RExC_parse = endbrace;
13146                 }
13147                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13148             }
13149 
13150             /* Here, looks like its really a multiple character sequence.  Fail
13151              * if that's not what the caller wants.  But continue with counting
13152              * and error checking if they still want a count */
13153             if (! node_p && ! cp_count) {
13154                 return FALSE;
13155             }
13156 
13157             /* What is done here is to convert this to a sub-pattern of the
13158              * form \x{char1}\x{char2}...  and then call reg recursively to
13159              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
13160              * atomicness, while not having to worry about special handling
13161              * that some code points may have.  We don't create a subpattern,
13162              * but go through the motions of code point counting and error
13163              * checking, if the caller doesn't want a node returned. */
13164 
13165             if (node_p && count == 1) {
13166                 substitute_parse = newSVpvs("?:");
13167             }
13168 
13169           do_concat:
13170 
13171             if (node_p) {
13172                 /* Convert to notation the rest of the code understands */
13173                 sv_catpvs(substitute_parse, "\\x{");
13174                 sv_catpvn(substitute_parse, start_digit,
13175                                             RExC_parse - start_digit);
13176                 sv_catpvs(substitute_parse, "}");
13177             }
13178 
13179             /* Move to after the dot (or ending brace the final time through.)
13180              * */
13181             RExC_parse++;
13182             count++;
13183 
13184         } while (RExC_parse < endbrace);
13185 
13186         if (! node_p) { /* Doesn't want the node */
13187             assert (cp_count);
13188 
13189             *cp_count = count;
13190             return FALSE;
13191         }
13192 
13193         sv_catpvs(substitute_parse, ")");
13194 
13195 #ifdef EBCDIC
13196         /* The values are Unicode, and therefore have to be converted to native
13197          * on a non-Unicode (meaning non-ASCII) platform. */
13198         RExC_recode_x_to_native = 1;
13199 #endif
13200 
13201     }
13202 
13203     /* Here, we have the string the name evaluates to, ready to be parsed,
13204      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13205      * constructs.  This can be called from within a substitute parse already.
13206      * The error reporting mechanism doesn't work for 2 levels of this, but the
13207      * code above has validated this new construct, so there should be no
13208      * errors generated by the below.  And this isn' an exact copy, so the
13209      * mechanism to seamlessly deal with this won't work, so turn off warnings
13210      * during it */
13211     save_start = RExC_start;
13212     orig_end = RExC_end;
13213 
13214     RExC_parse = RExC_start = SvPVX(substitute_parse);
13215     RExC_end = RExC_parse + SvCUR(substitute_parse);
13216     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13217 
13218     *node_p = reg(pRExC_state, 1, &flags, depth+1);
13219 
13220     /* Restore the saved values */
13221     RESTORE_WARNINGS;
13222     RExC_start = save_start;
13223     RExC_parse = endbrace;
13224     RExC_end = orig_end;
13225 #ifdef EBCDIC
13226     RExC_recode_x_to_native = 0;
13227 #endif
13228 
13229     SvREFCNT_dec_NN(substitute_parse);
13230 
13231     if (! *node_p) {
13232         RETURN_FAIL_ON_RESTART(flags, flagp);
13233         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13234             (UV) flags);
13235     }
13236     *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13237 
13238     nextchar(pRExC_state);
13239 
13240     return TRUE;
13241 }
13242 
13243 
13244 PERL_STATIC_INLINE U8
S_compute_EXACTish(RExC_state_t * pRExC_state)13245 S_compute_EXACTish(RExC_state_t *pRExC_state)
13246 {
13247     U8 op;
13248 
13249     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13250 
13251     if (! FOLD) {
13252         return (LOC)
13253                 ? EXACTL
13254                 : EXACT;
13255     }
13256 
13257     op = get_regex_charset(RExC_flags);
13258     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13259         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13260                  been, so there is no hole */
13261     }
13262 
13263     return op + EXACTF;
13264 }
13265 
13266 STATIC bool
S_new_regcurly(const char * s,const char * e)13267 S_new_regcurly(const char *s, const char *e)
13268 {
13269     /* This is a temporary function designed to match the most lenient form of
13270      * a {m,n} quantifier we ever envision, with either number omitted, and
13271      * spaces anywhere between/before/after them.
13272      *
13273      * If this function fails, then the string it matches is very unlikely to
13274      * ever be considered a valid quantifier, so we can allow the '{' that
13275      * begins it to be considered as a literal */
13276 
13277     bool has_min = FALSE;
13278     bool has_max = FALSE;
13279 
13280     PERL_ARGS_ASSERT_NEW_REGCURLY;
13281 
13282     if (s >= e || *s++ != '{')
13283 	return FALSE;
13284 
13285     while (s < e && isSPACE(*s)) {
13286         s++;
13287     }
13288     while (s < e && isDIGIT(*s)) {
13289         has_min = TRUE;
13290         s++;
13291     }
13292     while (s < e && isSPACE(*s)) {
13293         s++;
13294     }
13295 
13296     if (*s == ',') {
13297 	s++;
13298         while (s < e && isSPACE(*s)) {
13299             s++;
13300         }
13301         while (s < e && isDIGIT(*s)) {
13302             has_max = TRUE;
13303             s++;
13304         }
13305         while (s < e && isSPACE(*s)) {
13306             s++;
13307         }
13308     }
13309 
13310     return s < e && *s == '}' && (has_min || has_max);
13311 }
13312 
13313 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13314  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13315 
13316 static I32
S_backref_value(char * p,char * e)13317 S_backref_value(char *p, char *e)
13318 {
13319     const char* endptr = e;
13320     UV val;
13321     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13322         return (I32)val;
13323     return I32_MAX;
13324 }
13325 
13326 
13327 /*
13328  - regatom - the lowest level
13329 
13330    Try to identify anything special at the start of the current parse position.
13331    If there is, then handle it as required. This may involve generating a
13332    single regop, such as for an assertion; or it may involve recursing, such as
13333    to handle a () structure.
13334 
13335    If the string doesn't start with something special then we gobble up
13336    as much literal text as we can.  If we encounter a quantifier, we have to
13337    back off the final literal character, as that quantifier applies to just it
13338    and not to the whole string of literals.
13339 
13340    Once we have been able to handle whatever type of thing started the
13341    sequence, we return the offset into the regex engine program being compiled
13342    at which any  next regnode should be placed.
13343 
13344    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13345    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13346    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13347    Otherwise does not return 0.
13348 
13349    Note: we have to be careful with escapes, as they can be both literal
13350    and special, and in the case of \10 and friends, context determines which.
13351 
13352    A summary of the code structure is:
13353 
13354    switch (first_byte) {
13355 	cases for each special:
13356 	    handle this special;
13357 	    break;
13358 	case '\\':
13359 	    switch (2nd byte) {
13360 		cases for each unambiguous special:
13361 		    handle this special;
13362 		    break;
13363 		cases for each ambigous special/literal:
13364 		    disambiguate;
13365 		    if (special)  handle here
13366 		    else goto defchar;
13367 		default: // unambiguously literal:
13368 		    goto defchar;
13369 	    }
13370 	default:  // is a literal char
13371 	    // FALL THROUGH
13372 	defchar:
13373 	    create EXACTish node for literal;
13374 	    while (more input and node isn't full) {
13375 		switch (input_byte) {
13376 		   cases for each special;
13377                        make sure parse pointer is set so that the next call to
13378                            regatom will see this special first
13379                        goto loopdone; // EXACTish node terminated by prev. char
13380 		   default:
13381 		       append char to EXACTISH node;
13382 		}
13383 	        get next input byte;
13384 	    }
13385         loopdone:
13386    }
13387    return the generated node;
13388 
13389    Specifically there are two separate switches for handling
13390    escape sequences, with the one for handling literal escapes requiring
13391    a dummy entry for all of the special escapes that are actually handled
13392    by the other.
13393 
13394 */
13395 
13396 STATIC regnode_offset
S_regatom(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth)13397 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13398 {
13399     dVAR;
13400     regnode_offset ret = 0;
13401     I32 flags = 0;
13402     char *parse_start;
13403     U8 op;
13404     int invert = 0;
13405     U8 arg;
13406 
13407     GET_RE_DEBUG_FLAGS_DECL;
13408 
13409     *flagp = WORST;		/* Tentatively. */
13410 
13411     DEBUG_PARSE("atom");
13412 
13413     PERL_ARGS_ASSERT_REGATOM;
13414 
13415   tryagain:
13416     parse_start = RExC_parse;
13417     assert(RExC_parse < RExC_end);
13418     switch ((U8)*RExC_parse) {
13419     case '^':
13420 	RExC_seen_zerolen++;
13421 	nextchar(pRExC_state);
13422 	if (RExC_flags & RXf_PMf_MULTILINE)
13423 	    ret = reg_node(pRExC_state, MBOL);
13424 	else
13425 	    ret = reg_node(pRExC_state, SBOL);
13426         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13427 	break;
13428     case '$':
13429 	nextchar(pRExC_state);
13430 	if (*RExC_parse)
13431 	    RExC_seen_zerolen++;
13432 	if (RExC_flags & RXf_PMf_MULTILINE)
13433 	    ret = reg_node(pRExC_state, MEOL);
13434 	else
13435 	    ret = reg_node(pRExC_state, SEOL);
13436         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13437 	break;
13438     case '.':
13439 	nextchar(pRExC_state);
13440 	if (RExC_flags & RXf_PMf_SINGLELINE)
13441 	    ret = reg_node(pRExC_state, SANY);
13442 	else
13443 	    ret = reg_node(pRExC_state, REG_ANY);
13444 	*flagp |= HASWIDTH|SIMPLE;
13445 	MARK_NAUGHTY(1);
13446         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13447 	break;
13448     case '[':
13449     {
13450 	char * const oregcomp_parse = ++RExC_parse;
13451         ret = regclass(pRExC_state, flagp, depth+1,
13452                        FALSE, /* means parse the whole char class */
13453                        TRUE, /* allow multi-char folds */
13454                        FALSE, /* don't silence non-portable warnings. */
13455                        (bool) RExC_strict,
13456                        TRUE, /* Allow an optimized regnode result */
13457                        NULL);
13458         if (ret == 0) {
13459             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13460             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13461                   (UV) *flagp);
13462         }
13463 	if (*RExC_parse != ']') {
13464 	    RExC_parse = oregcomp_parse;
13465 	    vFAIL("Unmatched [");
13466 	}
13467 	nextchar(pRExC_state);
13468         Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13469 	break;
13470     }
13471     case '(':
13472 	nextchar(pRExC_state);
13473         ret = reg(pRExC_state, 2, &flags, depth+1);
13474 	if (ret == 0) {
13475 		if (flags & TRYAGAIN) {
13476 		    if (RExC_parse >= RExC_end) {
13477 			 /* Make parent create an empty node if needed. */
13478 			*flagp |= TRYAGAIN;
13479 			return(0);
13480 		    }
13481 		    goto tryagain;
13482 		}
13483                 RETURN_FAIL_ON_RESTART(flags, flagp);
13484                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13485                                                                  (UV) flags);
13486 	}
13487 	*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13488 	break;
13489     case '|':
13490     case ')':
13491 	if (flags & TRYAGAIN) {
13492 	    *flagp |= TRYAGAIN;
13493 	    return 0;
13494 	}
13495 	vFAIL("Internal urp");
13496 				/* Supposed to be caught earlier. */
13497 	break;
13498     case '?':
13499     case '+':
13500     case '*':
13501 	RExC_parse++;
13502 	vFAIL("Quantifier follows nothing");
13503 	break;
13504     case '\\':
13505 	/* Special Escapes
13506 
13507 	   This switch handles escape sequences that resolve to some kind
13508 	   of special regop and not to literal text. Escape sequences that
13509 	   resolve to literal text are handled below in the switch marked
13510 	   "Literal Escapes".
13511 
13512 	   Every entry in this switch *must* have a corresponding entry
13513 	   in the literal escape switch. However, the opposite is not
13514 	   required, as the default for this switch is to jump to the
13515 	   literal text handling code.
13516 	*/
13517 	RExC_parse++;
13518 	switch ((U8)*RExC_parse) {
13519 	/* Special Escapes */
13520 	case 'A':
13521 	    RExC_seen_zerolen++;
13522 	    ret = reg_node(pRExC_state, SBOL);
13523             /* SBOL is shared with /^/ so we set the flags so we can tell
13524              * /\A/ from /^/ in split. */
13525             FLAGS(REGNODE_p(ret)) = 1;
13526 	    *flagp |= SIMPLE;
13527 	    goto finish_meta_pat;
13528 	case 'G':
13529 	    ret = reg_node(pRExC_state, GPOS);
13530             RExC_seen |= REG_GPOS_SEEN;
13531 	    *flagp |= SIMPLE;
13532 	    goto finish_meta_pat;
13533 	case 'K':
13534 	    RExC_seen_zerolen++;
13535 	    ret = reg_node(pRExC_state, KEEPS);
13536 	    *flagp |= SIMPLE;
13537 	    /* XXX:dmq : disabling in-place substitution seems to
13538 	     * be necessary here to avoid cases of memory corruption, as
13539 	     * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13540 	     */
13541             RExC_seen |= REG_LOOKBEHIND_SEEN;
13542 	    goto finish_meta_pat;
13543 	case 'Z':
13544 	    ret = reg_node(pRExC_state, SEOL);
13545 	    *flagp |= SIMPLE;
13546 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
13547 	    goto finish_meta_pat;
13548 	case 'z':
13549 	    ret = reg_node(pRExC_state, EOS);
13550 	    *flagp |= SIMPLE;
13551 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
13552 	    goto finish_meta_pat;
13553 	case 'C':
13554 	    vFAIL("\\C no longer supported");
13555 	case 'X':
13556 	    ret = reg_node(pRExC_state, CLUMP);
13557 	    *flagp |= HASWIDTH;
13558 	    goto finish_meta_pat;
13559 
13560 	case 'W':
13561             invert = 1;
13562             /* FALLTHROUGH */
13563 	case 'w':
13564             arg = ANYOF_WORDCHAR;
13565             goto join_posix;
13566 
13567 	case 'B':
13568             invert = 1;
13569             /* FALLTHROUGH */
13570 	case 'b':
13571           {
13572             U8 flags = 0;
13573 	    regex_charset charset = get_regex_charset(RExC_flags);
13574 
13575 	    RExC_seen_zerolen++;
13576             RExC_seen |= REG_LOOKBEHIND_SEEN;
13577 	    op = BOUND + charset;
13578 
13579 	    if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13580                 flags = TRADITIONAL_BOUND;
13581                 if (op > BOUNDA) {  /* /aa is same as /a */
13582                     op = BOUNDA;
13583                 }
13584             }
13585             else {
13586                 STRLEN length;
13587                 char name = *RExC_parse;
13588                 char * endbrace = NULL;
13589                 RExC_parse += 2;
13590                 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13591 
13592                 if (! endbrace) {
13593                     vFAIL2("Missing right brace on \\%c{}", name);
13594                 }
13595                 /* XXX Need to decide whether to take spaces or not.  Should be
13596                  * consistent with \p{}, but that currently is SPACE, which
13597                  * means vertical too, which seems wrong
13598                  * while (isBLANK(*RExC_parse)) {
13599                     RExC_parse++;
13600                 }*/
13601                 if (endbrace == RExC_parse) {
13602                     RExC_parse++;  /* After the '}' */
13603                     vFAIL2("Empty \\%c{}", name);
13604                 }
13605                 length = endbrace - RExC_parse;
13606                 /*while (isBLANK(*(RExC_parse + length - 1))) {
13607                     length--;
13608                 }*/
13609                 switch (*RExC_parse) {
13610                     case 'g':
13611                         if (    length != 1
13612                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
13613                         {
13614                             goto bad_bound_type;
13615                         }
13616                         flags = GCB_BOUND;
13617                         break;
13618                     case 'l':
13619                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13620                             goto bad_bound_type;
13621                         }
13622                         flags = LB_BOUND;
13623                         break;
13624                     case 's':
13625                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13626                             goto bad_bound_type;
13627                         }
13628                         flags = SB_BOUND;
13629                         break;
13630                     case 'w':
13631                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13632                             goto bad_bound_type;
13633                         }
13634                         flags = WB_BOUND;
13635                         break;
13636                     default:
13637                       bad_bound_type:
13638                         RExC_parse = endbrace;
13639 			vFAIL2utf8f(
13640                             "'%" UTF8f "' is an unknown bound type",
13641 			    UTF8fARG(UTF, length, endbrace - length));
13642                         NOT_REACHED; /*NOTREACHED*/
13643                 }
13644                 RExC_parse = endbrace;
13645                 REQUIRE_UNI_RULES(flagp, 0);
13646 
13647                 if (op == BOUND) {
13648                     op = BOUNDU;
13649                 }
13650                 else if (op >= BOUNDA) {  /* /aa is same as /a */
13651                     op = BOUNDU;
13652                     length += 4;
13653 
13654                     /* Don't have to worry about UTF-8, in this message because
13655                      * to get here the contents of the \b must be ASCII */
13656                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
13657                               "Using /u for '%.*s' instead of /%s",
13658                               (unsigned) length,
13659                               endbrace - length + 1,
13660                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13661                               ? ASCII_RESTRICT_PAT_MODS
13662                               : ASCII_MORE_RESTRICT_PAT_MODS);
13663                 }
13664 	    }
13665 
13666             if (op == BOUND) {
13667                 RExC_seen_d_op = TRUE;
13668             }
13669             else if (op == BOUNDL) {
13670                 RExC_contains_locale = 1;
13671             }
13672 
13673             if (invert) {
13674                 op += NBOUND - BOUND;
13675             }
13676 
13677 	    ret = reg_node(pRExC_state, op);
13678             FLAGS(REGNODE_p(ret)) = flags;
13679 
13680 	    *flagp |= SIMPLE;
13681 
13682 	    goto finish_meta_pat;
13683           }
13684 
13685 	case 'D':
13686             invert = 1;
13687             /* FALLTHROUGH */
13688 	case 'd':
13689             arg = ANYOF_DIGIT;
13690             if (! DEPENDS_SEMANTICS) {
13691                 goto join_posix;
13692             }
13693 
13694             /* \d doesn't have any matches in the upper Latin1 range, hence /d
13695              * is equivalent to /u.  Changing to /u saves some branches at
13696              * runtime */
13697             op = POSIXU;
13698             goto join_posix_op_known;
13699 
13700 	case 'R':
13701 	    ret = reg_node(pRExC_state, LNBREAK);
13702 	    *flagp |= HASWIDTH|SIMPLE;
13703 	    goto finish_meta_pat;
13704 
13705 	case 'H':
13706             invert = 1;
13707             /* FALLTHROUGH */
13708 	case 'h':
13709 	    arg = ANYOF_BLANK;
13710             op = POSIXU;
13711             goto join_posix_op_known;
13712 
13713 	case 'V':
13714             invert = 1;
13715             /* FALLTHROUGH */
13716 	case 'v':
13717 	    arg = ANYOF_VERTWS;
13718             op = POSIXU;
13719             goto join_posix_op_known;
13720 
13721 	case 'S':
13722             invert = 1;
13723             /* FALLTHROUGH */
13724 	case 's':
13725             arg = ANYOF_SPACE;
13726 
13727           join_posix:
13728 
13729 	    op = POSIXD + get_regex_charset(RExC_flags);
13730             if (op > POSIXA) {  /* /aa is same as /a */
13731                 op = POSIXA;
13732             }
13733             else if (op == POSIXL) {
13734                 RExC_contains_locale = 1;
13735             }
13736             else if (op == POSIXD) {
13737                 RExC_seen_d_op = TRUE;
13738             }
13739 
13740           join_posix_op_known:
13741 
13742             if (invert) {
13743                 op += NPOSIXD - POSIXD;
13744             }
13745 
13746 	    ret = reg_node(pRExC_state, op);
13747             FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
13748 
13749 	    *flagp |= HASWIDTH|SIMPLE;
13750             /* FALLTHROUGH */
13751 
13752           finish_meta_pat:
13753             if (   UCHARAT(RExC_parse + 1) == '{'
13754                 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13755             {
13756                 RExC_parse += 2;
13757                 vFAIL("Unescaped left brace in regex is illegal here");
13758             }
13759 	    nextchar(pRExC_state);
13760             Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
13761 	    break;
13762 	case 'p':
13763 	case 'P':
13764             RExC_parse--;
13765 
13766             ret = regclass(pRExC_state, flagp, depth+1,
13767                            TRUE, /* means just parse this element */
13768                            FALSE, /* don't allow multi-char folds */
13769                            FALSE, /* don't silence non-portable warnings.  It
13770                                      would be a bug if these returned
13771                                      non-portables */
13772                            (bool) RExC_strict,
13773                            TRUE, /* Allow an optimized regnode result */
13774                            NULL);
13775             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13776             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13777              * multi-char folds are allowed.  */
13778             if (!ret)
13779                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13780                       (UV) *flagp);
13781 
13782             RExC_parse--;
13783 
13784             Set_Node_Offset(REGNODE_p(ret), parse_start);
13785             Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
13786             nextchar(pRExC_state);
13787 	    break;
13788         case 'N':
13789             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13790              * \N{...} evaluates to a sequence of more than one code points).
13791              * The function call below returns a regnode, which is our result.
13792              * The parameters cause it to fail if the \N{} evaluates to a
13793              * single code point; we handle those like any other literal.  The
13794              * reason that the multicharacter case is handled here and not as
13795              * part of the EXACtish code is because of quantifiers.  In
13796              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13797              * this way makes that Just Happen. dmq.
13798              * join_exact() will join this up with adjacent EXACTish nodes
13799              * later on, if appropriate. */
13800             ++RExC_parse;
13801             if (grok_bslash_N(pRExC_state,
13802                               &ret,     /* Want a regnode returned */
13803                               NULL,     /* Fail if evaluates to a single code
13804                                            point */
13805                               NULL,     /* Don't need a count of how many code
13806                                            points */
13807                               flagp,
13808                               RExC_strict,
13809                               depth)
13810             ) {
13811                 break;
13812             }
13813 
13814             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13815 
13816             /* Here, evaluates to a single code point.  Go get that */
13817             RExC_parse = parse_start;
13818             goto defchar;
13819 
13820 	case 'k':    /* Handle \k<NAME> and \k'NAME' */
13821       parse_named_seq:
13822         {
13823             char ch;
13824             if (   RExC_parse >= RExC_end - 1
13825                 || ((   ch = RExC_parse[1]) != '<'
13826                                       && ch != '\''
13827                                       && ch != '{'))
13828             {
13829 	        RExC_parse++;
13830 		/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13831 	        vFAIL2("Sequence %.2s... not terminated", parse_start);
13832 	    } else {
13833 		RExC_parse += 2;
13834                 ret = handle_named_backref(pRExC_state,
13835                                            flagp,
13836                                            parse_start,
13837                                            (ch == '<')
13838                                            ? '>'
13839                                            : (ch == '{')
13840                                              ? '}'
13841                                              : '\'');
13842             }
13843             break;
13844 	}
13845 	case 'g':
13846 	case '1': case '2': case '3': case '4':
13847 	case '5': case '6': case '7': case '8': case '9':
13848 	    {
13849 		I32 num;
13850 		bool hasbrace = 0;
13851 
13852 		if (*RExC_parse == 'g') {
13853                     bool isrel = 0;
13854 
13855 		    RExC_parse++;
13856 		    if (*RExC_parse == '{') {
13857 		        RExC_parse++;
13858 		        hasbrace = 1;
13859 		    }
13860 		    if (*RExC_parse == '-') {
13861 		        RExC_parse++;
13862 		        isrel = 1;
13863 		    }
13864 		    if (hasbrace && !isDIGIT(*RExC_parse)) {
13865 		        if (isrel) RExC_parse--;
13866                         RExC_parse -= 2;
13867 		        goto parse_named_seq;
13868                     }
13869 
13870                     if (RExC_parse >= RExC_end) {
13871                         goto unterminated_g;
13872                     }
13873                     num = S_backref_value(RExC_parse, RExC_end);
13874                     if (num == 0)
13875                         vFAIL("Reference to invalid group 0");
13876                     else if (num == I32_MAX) {
13877                          if (isDIGIT(*RExC_parse))
13878 			    vFAIL("Reference to nonexistent group");
13879                         else
13880                           unterminated_g:
13881                             vFAIL("Unterminated \\g... pattern");
13882                     }
13883 
13884                     if (isrel) {
13885                         num = RExC_npar - num;
13886                         if (num < 1)
13887                             vFAIL("Reference to nonexistent or unclosed group");
13888                     }
13889                 }
13890                 else {
13891                     num = S_backref_value(RExC_parse, RExC_end);
13892                     /* bare \NNN might be backref or octal - if it is larger
13893                      * than or equal RExC_npar then it is assumed to be an
13894                      * octal escape. Note RExC_npar is +1 from the actual
13895                      * number of parens. */
13896                     /* Note we do NOT check if num == I32_MAX here, as that is
13897                      * handled by the RExC_npar check */
13898 
13899                     if (
13900                         /* any numeric escape < 10 is always a backref */
13901                         num > 9
13902                         /* any numeric escape < RExC_npar is a backref */
13903                         && num >= RExC_npar
13904                         /* cannot be an octal escape if it starts with 8 */
13905                         && *RExC_parse != '8'
13906                         /* cannot be an octal escape it it starts with 9 */
13907                         && *RExC_parse != '9'
13908                     ) {
13909                         /* Probably not meant to be a backref, instead likely
13910                          * to be an octal character escape, e.g. \35 or \777.
13911                          * The above logic should make it obvious why using
13912                          * octal escapes in patterns is problematic. - Yves */
13913                         RExC_parse = parse_start;
13914                         goto defchar;
13915                     }
13916                 }
13917 
13918                 /* At this point RExC_parse points at a numeric escape like
13919                  * \12 or \88 or something similar, which we should NOT treat
13920                  * as an octal escape. It may or may not be a valid backref
13921                  * escape. For instance \88888888 is unlikely to be a valid
13922                  * backref. */
13923                 while (isDIGIT(*RExC_parse))
13924                     RExC_parse++;
13925                 if (hasbrace) {
13926                     if (*RExC_parse != '}')
13927                         vFAIL("Unterminated \\g{...} pattern");
13928                     RExC_parse++;
13929                 }
13930                 if (num >= (I32)RExC_npar) {
13931 
13932                     /* It might be a forward reference; we can't fail until we
13933                      * know, by completing the parse to get all the groups, and
13934                      * then reparsing */
13935                     if (ALL_PARENS_COUNTED)  {
13936                         if (num >= RExC_total_parens)  {
13937                             vFAIL("Reference to nonexistent group");
13938                         }
13939                     }
13940                     else {
13941                         REQUIRE_PARENS_PASS;
13942                     }
13943                 }
13944                 RExC_sawback = 1;
13945                 ret = reganode(pRExC_state,
13946                                ((! FOLD)
13947                                  ? REF
13948                                  : (ASCII_FOLD_RESTRICTED)
13949                                    ? REFFA
13950                                    : (AT_LEAST_UNI_SEMANTICS)
13951                                      ? REFFU
13952                                      : (LOC)
13953                                        ? REFFL
13954                                        : REFF),
13955                                 num);
13956                 if (OP(REGNODE_p(ret)) == REFF) {
13957                     RExC_seen_d_op = TRUE;
13958                 }
13959                 *flagp |= HASWIDTH;
13960 
13961                 /* override incorrect value set in reganode MJD */
13962                 Set_Node_Offset(REGNODE_p(ret), parse_start);
13963                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13964                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13965                                         FALSE /* Don't force to /x */ );
13966 	    }
13967 	    break;
13968 	case '\0':
13969 	    if (RExC_parse >= RExC_end)
13970 		FAIL("Trailing \\");
13971 	    /* FALLTHROUGH */
13972 	default:
13973 	    /* Do not generate "unrecognized" warnings here, we fall
13974 	       back into the quick-grab loop below */
13975             RExC_parse = parse_start;
13976 	    goto defchar;
13977 	} /* end of switch on a \foo sequence */
13978 	break;
13979 
13980     case '#':
13981 
13982         /* '#' comments should have been spaced over before this function was
13983          * called */
13984         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13985 	/*
13986         if (RExC_flags & RXf_PMf_EXTENDED) {
13987 	    RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13988 	    if (RExC_parse < RExC_end)
13989 		goto tryagain;
13990 	}
13991         */
13992 
13993 	/* FALLTHROUGH */
13994 
13995     default:
13996 	  defchar: {
13997 
13998             /* Here, we have determined that the next thing is probably a
13999              * literal character.  RExC_parse points to the first byte of its
14000              * definition.  (It still may be an escape sequence that evaluates
14001              * to a single character) */
14002 
14003 	    STRLEN len = 0;
14004 	    UV ender = 0;
14005 	    char *p;
14006 	    char *s;
14007 
14008 /* This allows us to fill a node with just enough spare so that if the final
14009  * character folds, its expansion is guaranteed to fit */
14010 #define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
14011 
14012 	    char *s0;
14013 	    U8 upper_parse = MAX_NODE_STRING_SIZE;
14014 
14015             /* We start out as an EXACT node, even if under /i, until we find a
14016              * character which is in a fold.  The algorithm now segregates into
14017              * separate nodes, characters that fold from those that don't under
14018              * /i.  (This hopefully will create nodes that are fixed strings
14019              * even under /i, giving the optimizer something to grab on to.)
14020              * So, if a node has something in it and the next character is in
14021              * the opposite category, that node is closed up, and the function
14022              * returns.  Then regatom is called again, and a new node is
14023              * created for the new category. */
14024             U8 node_type = EXACT;
14025 
14026             /* Assume the node will be fully used; the excess is given back at
14027              * the end.  We can't make any other length assumptions, as a byte
14028              * input sequence could shrink down. */
14029             Ptrdiff_t initial_size = STR_SZ(256);
14030 
14031             bool next_is_quantifier;
14032             char * oldp = NULL;
14033 
14034             /* We can convert EXACTF nodes to EXACTFU if they contain only
14035              * characters that match identically regardless of the target
14036              * string's UTF8ness.  The reason to do this is that EXACTF is not
14037              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14038              * runtime.
14039              *
14040              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14041              * contain only above-Latin1 characters (hence must be in UTF8),
14042              * which don't participate in folds with Latin1-range characters,
14043              * as the latter's folds aren't known until runtime. */
14044             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14045 
14046             /* Single-character EXACTish nodes are almost always SIMPLE.  This
14047              * allows us to override this as encountered */
14048             U8 maybe_SIMPLE = SIMPLE;
14049 
14050             /* Does this node contain something that can't match unless the
14051              * target string is (also) in UTF-8 */
14052             bool requires_utf8_target = FALSE;
14053 
14054             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14055             bool has_ss = FALSE;
14056 
14057             /* So is the MICRO SIGN */
14058             bool has_micro_sign = FALSE;
14059 
14060             /* Allocate an EXACT node.  The node_type may change below to
14061              * another EXACTish node, but since the size of the node doesn't
14062              * change, it works */
14063             ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
14064             FILL_NODE(ret, node_type);
14065             RExC_emit++;
14066 
14067 	    s = STRING(REGNODE_p(ret));
14068 
14069             s0 = s;
14070 
14071 	  reparse:
14072 
14073             /* This breaks under rare circumstances.  If folding, we do not
14074              * want to split a node at a character that is a non-final in a
14075              * multi-char fold, as an input string could just happen to want to
14076              * match across the node boundary.  The code at the end of the loop
14077              * looks for this, and backs off until it finds not such a
14078              * character, but it is possible (though extremely, extremely
14079              * unlikely) for all characters in the node to be non-final fold
14080              * ones, in which case we just leave the node fully filled, and
14081              * hope that it doesn't match the string in just the wrong place */
14082 
14083             assert( ! UTF     /* Is at the beginning of a character */
14084                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14085                    || UTF8_IS_START(UCHARAT(RExC_parse)));
14086 
14087             /* Here, we have a literal character.  Find the maximal string of
14088              * them in the input that we can fit into a single EXACTish node.
14089              * We quit at the first non-literal or when the node gets full, or
14090              * under /i the categorization of folding/non-folding character
14091              * changes */
14092 	    for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
14093 
14094                 /* In most cases each iteration adds one byte to the output.
14095                  * The exceptions override this */
14096                 Size_t added_len = 1;
14097 
14098 		oldp = p;
14099 
14100                 /* White space has already been ignored */
14101                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
14102                        || ! is_PATWS_safe((p), RExC_end, UTF));
14103 
14104 		switch ((U8)*p) {
14105 		case '^':
14106 		case '$':
14107 		case '.':
14108 		case '[':
14109 		case '(':
14110 		case ')':
14111 		case '|':
14112 		    goto loopdone;
14113 		case '\\':
14114 		    /* Literal Escapes Switch
14115 
14116 		       This switch is meant to handle escape sequences that
14117 		       resolve to a literal character.
14118 
14119 		       Every escape sequence that represents something
14120 		       else, like an assertion or a char class, is handled
14121 		       in the switch marked 'Special Escapes' above in this
14122 		       routine, but also has an entry here as anything that
14123 		       isn't explicitly mentioned here will be treated as
14124 		       an unescaped equivalent literal.
14125 		    */
14126 
14127 		    switch ((U8)*++p) {
14128 
14129 		    /* These are all the special escapes. */
14130 		    case 'A':             /* Start assertion */
14131 		    case 'b': case 'B':   /* Word-boundary assertion*/
14132 		    case 'C':             /* Single char !DANGEROUS! */
14133 		    case 'd': case 'D':   /* digit class */
14134 		    case 'g': case 'G':   /* generic-backref, pos assertion */
14135 		    case 'h': case 'H':   /* HORIZWS */
14136 		    case 'k': case 'K':   /* named backref, keep marker */
14137 		    case 'p': case 'P':   /* Unicode property */
14138 		              case 'R':   /* LNBREAK */
14139 		    case 's': case 'S':   /* space class */
14140 		    case 'v': case 'V':   /* VERTWS */
14141 		    case 'w': case 'W':   /* word class */
14142                     case 'X':             /* eXtended Unicode "combining
14143                                              character sequence" */
14144 		    case 'z': case 'Z':   /* End of line/string assertion */
14145 			--p;
14146 			goto loopdone;
14147 
14148 	            /* Anything after here is an escape that resolves to a
14149 	               literal. (Except digits, which may or may not)
14150 	             */
14151 		    case 'n':
14152 			ender = '\n';
14153 			p++;
14154 			break;
14155 		    case 'N': /* Handle a single-code point named character. */
14156                         RExC_parse = p + 1;
14157                         if (! grok_bslash_N(pRExC_state,
14158                                             NULL,   /* Fail if evaluates to
14159                                                        anything other than a
14160                                                        single code point */
14161                                             &ender, /* The returned single code
14162                                                        point */
14163                                             NULL,   /* Don't need a count of
14164                                                        how many code points */
14165                                             flagp,
14166                                             RExC_strict,
14167                                             depth)
14168                         ) {
14169                             if (*flagp & NEED_UTF8)
14170                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
14171                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14172 
14173                             /* Here, it wasn't a single code point.  Go close
14174                              * up this EXACTish node.  The switch() prior to
14175                              * this switch handles the other cases */
14176                             RExC_parse = p = oldp;
14177                             goto loopdone;
14178                         }
14179                         p = RExC_parse;
14180                         RExC_parse = parse_start;
14181 
14182                         /* The \N{} means the pattern, if previously /d,
14183                          * becomes /u.  That means it can't be an EXACTF node,
14184                          * but an EXACTFU */
14185                         if (node_type == EXACTF) {
14186                             node_type = EXACTFU;
14187 
14188                             /* If the node already contains something that
14189                              * differs between EXACTF and EXACTFU, reparse it
14190                              * as EXACTFU */
14191                             if (! maybe_exactfu) {
14192                                 len = 0;
14193                                 s = s0;
14194                                 goto reparse;
14195                             }
14196                         }
14197 
14198                         break;
14199 		    case 'r':
14200 			ender = '\r';
14201 			p++;
14202 			break;
14203 		    case 't':
14204 			ender = '\t';
14205 			p++;
14206 			break;
14207 		    case 'f':
14208 			ender = '\f';
14209 			p++;
14210 			break;
14211 		    case 'e':
14212 			ender = ESC_NATIVE;
14213 			p++;
14214 			break;
14215 		    case 'a':
14216 			ender = '\a';
14217 			p++;
14218 			break;
14219 		    case 'o':
14220 			{
14221 			    UV result;
14222 			    const char* error_msg;
14223 
14224 			    bool valid = grok_bslash_o(&p,
14225                                                        RExC_end,
14226 						       &result,
14227 						       &error_msg,
14228 						       TO_OUTPUT_WARNINGS(p),
14229                                                        (bool) RExC_strict,
14230                                                        TRUE, /* Output warnings
14231                                                                 for non-
14232                                                                 portables */
14233                                                        UTF);
14234 			    if (! valid) {
14235 				RExC_parse = p;	/* going to die anyway; point
14236 						   to exact spot of failure */
14237 				vFAIL(error_msg);
14238 			    }
14239                             UPDATE_WARNINGS_LOC(p - 1);
14240                             ender = result;
14241 			    break;
14242 			}
14243 		    case 'x':
14244 			{
14245                             UV result = UV_MAX; /* initialize to erroneous
14246                                                    value */
14247 			    const char* error_msg;
14248 
14249 			    bool valid = grok_bslash_x(&p,
14250                                                        RExC_end,
14251 						       &result,
14252 						       &error_msg,
14253                                                        TO_OUTPUT_WARNINGS(p),
14254                                                        (bool) RExC_strict,
14255                                                        TRUE, /* Silence warnings
14256                                                                 for non-
14257                                                                 portables */
14258                                                        UTF);
14259 			    if (! valid) {
14260 				RExC_parse = p;	/* going to die anyway; point
14261 						   to exact spot of failure */
14262 				vFAIL(error_msg);
14263 			    }
14264                             UPDATE_WARNINGS_LOC(p - 1);
14265                             ender = result;
14266 
14267                             if (ender < 0x100) {
14268 #ifdef EBCDIC
14269                                 if (RExC_recode_x_to_native) {
14270                                     ender = LATIN1_TO_NATIVE(ender);
14271                                 }
14272 #endif
14273 			    }
14274 			    break;
14275 			}
14276 		    case 'c':
14277 			p++;
14278 			ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
14279                         UPDATE_WARNINGS_LOC(p);
14280                         p++;
14281 			break;
14282                     case '8': case '9': /* must be a backreference */
14283                         --p;
14284                         /* we have an escape like \8 which cannot be an octal escape
14285                          * so we exit the loop, and let the outer loop handle this
14286                          * escape which may or may not be a legitimate backref. */
14287                         goto loopdone;
14288                     case '1': case '2': case '3':case '4':
14289 		    case '5': case '6': case '7':
14290                         /* When we parse backslash escapes there is ambiguity
14291                          * between backreferences and octal escapes. Any escape
14292                          * from \1 - \9 is a backreference, any multi-digit
14293                          * escape which does not start with 0 and which when
14294                          * evaluated as decimal could refer to an already
14295                          * parsed capture buffer is a back reference. Anything
14296                          * else is octal.
14297                          *
14298                          * Note this implies that \118 could be interpreted as
14299                          * 118 OR as "\11" . "8" depending on whether there
14300                          * were 118 capture buffers defined already in the
14301                          * pattern.  */
14302 
14303                         /* NOTE, RExC_npar is 1 more than the actual number of
14304                          * parens we have seen so far, hence the "<" as opposed
14305                          * to "<=" */
14306                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14307                         {  /* Not to be treated as an octal constant, go
14308                                    find backref */
14309                             --p;
14310                             goto loopdone;
14311                         }
14312                         /* FALLTHROUGH */
14313                     case '0':
14314 			{
14315 			    I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14316 			    STRLEN numlen = 3;
14317 			    ender = grok_oct(p, &numlen, &flags, NULL);
14318 			    p += numlen;
14319                             if (   isDIGIT(*p)  /* like \08, \178 */
14320                                 && ckWARN(WARN_REGEXP)
14321                                 && numlen < 3)
14322                             {
14323 				reg_warn_non_literal_string(
14324                                          p + 1,
14325                                          form_short_octal_warning(p, numlen));
14326                             }
14327 			}
14328 			break;
14329 		    case '\0':
14330 			if (p >= RExC_end)
14331 			    FAIL("Trailing \\");
14332 			/* FALLTHROUGH */
14333 		    default:
14334 			if (isALPHANUMERIC(*p)) {
14335                             /* An alpha followed by '{' is going to fail next
14336                              * iteration, so don't output this warning in that
14337                              * case */
14338                             if (! isALPHA(*p) || *(p + 1) != '{') {
14339                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14340                                                   " passed through", p);
14341                             }
14342 			}
14343 			goto normal_default;
14344 		    } /* End of switch on '\' */
14345 		    break;
14346 		case '{':
14347                     /* Trying to gain new uses for '{' without breaking too
14348                      * much existing code is hard.  The solution currently
14349                      * adopted is:
14350                      *  1)  If there is no ambiguity that a '{' should always
14351                      *      be taken literally, at the start of a construct, we
14352                      *      just do so.
14353                      *  2)  If the literal '{' conflicts with our desired use
14354                      *      of it as a metacharacter, we die.  The deprecation
14355                      *      cycles for this have come and gone.
14356                      *  3)  If there is ambiguity, we raise a simple warning.
14357                      *      This could happen, for example, if the user
14358                      *      intended it to introduce a quantifier, but slightly
14359                      *      misspelled the quantifier.  Without this warning,
14360                      *      the quantifier would silently be taken as a literal
14361                      *      string of characters instead of a meta construct */
14362 		    if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14363                         if (      RExC_strict
14364                             || (  p > parse_start + 1
14365                                 && isALPHA_A(*(p - 1))
14366                                 && *(p - 2) == '\\')
14367                             || new_regcurly(p, RExC_end))
14368                         {
14369                             RExC_parse = p + 1;
14370                             vFAIL("Unescaped left brace in regex is "
14371                                   "illegal here");
14372                         }
14373                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
14374                                          " passed through");
14375 		    }
14376 		    goto normal_default;
14377                 case '}':
14378                 case ']':
14379                     if (p > RExC_parse && RExC_strict) {
14380                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14381                     }
14382 		    /*FALLTHROUGH*/
14383 		default:    /* A literal character */
14384 		  normal_default:
14385 		    if (! UTF8_IS_INVARIANT(*p) && UTF) {
14386 			STRLEN numlen;
14387 			ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14388 					       &numlen, UTF8_ALLOW_DEFAULT);
14389 			p += numlen;
14390 		    }
14391 		    else
14392 			ender = (U8) *p++;
14393 		    break;
14394 		} /* End of switch on the literal */
14395 
14396 		/* Here, have looked at the literal character, and <ender>
14397                  * contains its ordinal; <p> points to the character after it.
14398                  * */
14399 
14400                 if (ender > 255) {
14401                     REQUIRE_UTF8(flagp);
14402                 }
14403 
14404                 /* We need to check if the next non-ignored thing is a
14405                  * quantifier.  Move <p> to after anything that should be
14406                  * ignored, which, as a side effect, positions <p> for the next
14407                  * loop iteration */
14408                 skip_to_be_ignored_text(pRExC_state, &p,
14409                                         FALSE /* Don't force to /x */ );
14410 
14411                 /* If the next thing is a quantifier, it applies to this
14412                  * character only, which means that this character has to be in
14413                  * its own node and can't just be appended to the string in an
14414                  * existing node, so if there are already other characters in
14415                  * the node, close the node with just them, and set up to do
14416                  * this character again next time through, when it will be the
14417                  * only thing in its new node */
14418 
14419                 next_is_quantifier =    LIKELY(p < RExC_end)
14420                                      && UNLIKELY(ISMULT2(p));
14421 
14422                 if (next_is_quantifier && LIKELY(len)) {
14423                     p = oldp;
14424                     goto loopdone;
14425                 }
14426 
14427                 /* Ready to add 'ender' to the node */
14428 
14429                 if (! FOLD) {  /* The simple case, just append the literal */
14430 
14431                       not_fold_common:
14432                         if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14433                             *(s++) = (char) ender;
14434                         }
14435                         else {
14436                             U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14437                             added_len = (char *) new_s - s;
14438                             s = (char *) new_s;
14439 
14440                             if (ender > 255)  {
14441                                 requires_utf8_target = TRUE;
14442                             }
14443                         }
14444                 }
14445                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14446 
14447                     /* Here are folding under /l, and the code point is
14448                      * problematic.  If this is the first character in the
14449                      * node, change the node type to folding.   Otherwise, if
14450                      * this is the first problematic character, close up the
14451                      * existing node, so can start a new node with this one */
14452                     if (! len) {
14453                         node_type = EXACTFL;
14454                         RExC_contains_locale = 1;
14455                     }
14456                     else if (node_type == EXACT) {
14457                         p = oldp;
14458                         goto loopdone;
14459                     }
14460 
14461                     /* This problematic code point means we can't simplify
14462                      * things */
14463                     maybe_exactfu = FALSE;
14464 
14465                     /* Here, we are adding a problematic fold character.
14466                      * "Problematic" in this context means that its fold isn't
14467                      * known until runtime.  (The non-problematic code points
14468                      * are the above-Latin1 ones that fold to also all
14469                      * above-Latin1.  Their folds don't vary no matter what the
14470                      * locale is.) But here we have characters whose fold
14471                      * depends on the locale.  We just add in the unfolded
14472                      * character, and wait until runtime to fold it */
14473                     goto not_fold_common;
14474                 }
14475                 else /* regular fold; see if actually is in a fold */
14476                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14477                          || (ender > 255
14478                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14479                 {
14480                     /* Here, folding, but the character isn't in a fold.
14481                      *
14482                      * Start a new node if previous characters in the node were
14483                      * folded */
14484                     if (len && node_type != EXACT) {
14485                         p = oldp;
14486                         goto loopdone;
14487                     }
14488 
14489                     /* Here, continuing a node with non-folded characters.  Add
14490                      * this one */
14491                     goto not_fold_common;
14492                 }
14493                 else {  /* Here, does participate in some fold */
14494 
14495                     /* If this is the first character in the node, change its
14496                      * type to folding.  Otherwise, if this is the first
14497                      * folding character in the node, close up the existing
14498                      * node, so can start a new node with this one.  */
14499                     if (! len) {
14500                         node_type = compute_EXACTish(pRExC_state);
14501                     }
14502                     else if (node_type == EXACT) {
14503                         p = oldp;
14504                         goto loopdone;
14505                     }
14506 
14507                     if (UTF) {  /* Use the folded value */
14508                         if (UVCHR_IS_INVARIANT(ender)) {
14509                             *(s)++ = (U8) toFOLD(ender);
14510                         }
14511                         else {
14512                             ender = _to_uni_fold_flags(
14513                                     ender,
14514                                     (U8 *) s,
14515                                     &added_len,
14516                                     FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14517                                                     ? FOLD_FLAGS_NOMIX_ASCII
14518                                                     : 0));
14519                             s += added_len;
14520 
14521                             if (   ender > 255
14522                                 && LIKELY(ender != GREEK_SMALL_LETTER_MU))
14523                             {
14524                                 /* U+B5 folds to the MU, so its possible for a
14525                                  * non-UTF-8 target to match it */
14526                                 requires_utf8_target = TRUE;
14527                             }
14528                         }
14529                     }
14530                     else {
14531 
14532                         /* Here is non-UTF8.  First, see if the character's
14533                          * fold differs between /d and /u. */
14534                         if (PL_fold[ender] != PL_fold_latin1[ender]) {
14535                             maybe_exactfu = FALSE;
14536                         }
14537 
14538 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
14539    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
14540                                       || UNICODE_DOT_DOT_VERSION > 0)
14541 
14542                         /* On non-ancient Unicode versions, this includes the
14543                          * multi-char fold SHARP S to 'ss' */
14544 
14545                         if (   UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
14546                                  || (   isALPHA_FOLD_EQ(ender, 's')
14547                                      && len > 0
14548                                      && isALPHA_FOLD_EQ(*(s-1), 's')))
14549                         {
14550                             /* Here, we have one of the following:
14551                              *  a)  a SHARP S.  This folds to 'ss' only under
14552                              *      /u rules.  If we are in that situation,
14553                              *      fold the SHARP S to 'ss'.  See the comments
14554                              *      for join_exact() as to why we fold this
14555                              *      non-UTF at compile time, and no others.
14556                              *  b)  'ss'.  When under /u, there's nothing
14557                              *      special needed to be done here.  The
14558                              *      previous iteration handled the first 's',
14559                              *      and this iteration will handle the second.
14560                              *      If, on the otherhand it's not /u, we have
14561                              *      to exclude the possibility of moving to /u,
14562                              *      so that we won't generate an unwanted
14563                              *      match, unless, at runtime, the target
14564                              *      string is in UTF-8.
14565                              * */
14566 
14567                             has_ss = TRUE;
14568                             maybe_exactfu = FALSE;  /* Can't generate an
14569                                                        EXACTFU node (unless we
14570                                                        already are in one) */
14571                             if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14572                                 maybe_SIMPLE = 0;
14573                                 if (node_type == EXACTFU) {
14574                                     *(s++) = 's';
14575 
14576                                     /* Let the code below add in the extra 's' */
14577                                     ender = 's';
14578                                     added_len = 2;
14579                                 }
14580                             }
14581                         }
14582 #endif
14583 
14584                         else if (UNLIKELY(ender == MICRO_SIGN)) {
14585                             has_micro_sign = TRUE;
14586                         }
14587 
14588                         *(s++) = (DEPENDS_SEMANTICS)
14589                                  ? (char) toFOLD(ender)
14590 
14591                                    /* Under /u, the fold of any character in
14592                                     * the 0-255 range happens to be its
14593                                     * lowercase equivalent, except for LATIN
14594                                     * SMALL LETTER SHARP S, which was handled
14595                                     * above, and the MICRO SIGN, whose fold
14596                                     * requires UTF-8 to represent.  */
14597                                  : (char) toLOWER_L1(ender);
14598                     }
14599 		} /* End of adding current character to the node */
14600 
14601                 len += added_len;
14602 
14603 		if (next_is_quantifier) {
14604 
14605                     /* Here, the next input is a quantifier, and to get here,
14606                      * the current character is the only one in the node. */
14607                     goto loopdone;
14608 		}
14609 
14610 	    } /* End of loop through literal characters */
14611 
14612             /* Here we have either exhausted the input or ran out of room in
14613              * the node.  (If we encountered a character that can't be in the
14614              * node, transfer is made directly to <loopdone>, and so we
14615              * wouldn't have fallen off the end of the loop.)  In the latter
14616              * case, we artificially have to split the node into two, because
14617              * we just don't have enough space to hold everything.  This
14618              * creates a problem if the final character participates in a
14619              * multi-character fold in the non-final position, as a match that
14620              * should have occurred won't, due to the way nodes are matched,
14621              * and our artificial boundary.  So back off until we find a non-
14622              * problematic character -- one that isn't at the beginning or
14623              * middle of such a fold.  (Either it doesn't participate in any
14624              * folds, or appears only in the final position of all the folds it
14625              * does participate in.)  A better solution with far fewer false
14626              * positives, and that would fill the nodes more completely, would
14627              * be to actually have available all the multi-character folds to
14628              * test against, and to back-off only far enough to be sure that
14629              * this node isn't ending with a partial one.  <upper_parse> is set
14630              * further below (if we need to reparse the node) to include just
14631              * up through that final non-problematic character that this code
14632              * identifies, so when it is set to less than the full node, we can
14633              * skip the rest of this */
14634             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
14635                 PERL_UINT_FAST8_T backup_count = 0;
14636 
14637                 const STRLEN full_len = len;
14638 
14639 		assert(len >= MAX_NODE_STRING_SIZE);
14640 
14641                 /* Here, <s> points to just beyond where we have output the
14642                  * final character of the node.  Look backwards through the
14643                  * string until find a non- problematic character */
14644 
14645 		if (! UTF) {
14646 
14647                     /* This has no multi-char folds to non-UTF characters */
14648                     if (ASCII_FOLD_RESTRICTED) {
14649                         goto loopdone;
14650                     }
14651 
14652                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
14653                         backup_count++;
14654                     }
14655                     len = s - s0 + 1;
14656 		}
14657                 else {
14658 
14659                     /* Point to the first byte of the final character */
14660                     s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
14661 
14662                     while (s >= s0) {   /* Search backwards until find
14663                                            a non-problematic char */
14664                         if (UTF8_IS_INVARIANT(*s)) {
14665 
14666                             /* There are no ascii characters that participate
14667                              * in multi-char folds under /aa.  In EBCDIC, the
14668                              * non-ascii invariants are all control characters,
14669                              * so don't ever participate in any folds. */
14670                             if (ASCII_FOLD_RESTRICTED
14671                                 || ! IS_NON_FINAL_FOLD(*s))
14672                             {
14673                                 break;
14674                             }
14675                         }
14676                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
14677                             if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
14678                                                                   *s, *(s+1))))
14679                             {
14680                                 break;
14681                             }
14682                         }
14683                         else if (! _invlist_contains_cp(
14684                                         PL_NonFinalFold,
14685                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
14686                         {
14687                             break;
14688                         }
14689 
14690                         /* Here, the current character is problematic in that
14691                          * it does occur in the non-final position of some
14692                          * fold, so try the character before it, but have to
14693                          * special case the very first byte in the string, so
14694                          * we don't read outside the string */
14695                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
14696                         backup_count++;
14697                     } /* End of loop backwards through the string */
14698 
14699                     /* If there were only problematic characters in the string,
14700                      * <s> will point to before s0, in which case the length
14701                      * should be 0, otherwise include the length of the
14702                      * non-problematic character just found */
14703                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
14704 		}
14705 
14706                 /* Here, have found the final character, if any, that is
14707                  * non-problematic as far as ending the node without splitting
14708                  * it across a potential multi-char fold.  <len> contains the
14709                  * number of bytes in the node up-to and including that
14710                  * character, or is 0 if there is no such character, meaning
14711                  * the whole node contains only problematic characters.  In
14712                  * this case, give up and just take the node as-is.  We can't
14713                  * do any better */
14714                 if (len == 0) {
14715                     len = full_len;
14716 
14717                 } else {
14718 
14719                     /* Here, the node does contain some characters that aren't
14720                      * problematic.  If we didn't have to backup any, then the
14721                      * final character in the node is non-problematic, and we
14722                      * can take the node as-is */
14723                     if (backup_count == 0) {
14724                         goto loopdone;
14725                     }
14726                     else if (backup_count == 1) {
14727 
14728                         /* If the final character is problematic, but the
14729                          * penultimate is not, back-off that last character to
14730                          * later start a new node with it */
14731                         p = oldp;
14732                         goto loopdone;
14733                     }
14734 
14735                     /* Here, the final non-problematic character is earlier
14736                      * in the input than the penultimate character.  What we do
14737                      * is reparse from the beginning, going up only as far as
14738                      * this final ok one, thus guaranteeing that the node ends
14739                      * in an acceptable character.  The reason we reparse is
14740                      * that we know how far in the character is, but we don't
14741                      * know how to correlate its position with the input parse.
14742                      * An alternate implementation would be to build that
14743                      * correlation as we go along during the original parse,
14744                      * but that would entail extra work for every node, whereas
14745                      * this code gets executed only when the string is too
14746                      * large for the node, and the final two characters are
14747                      * problematic, an infrequent occurrence.  Yet another
14748                      * possible strategy would be to save the tail of the
14749                      * string, and the next time regatom is called, initialize
14750                      * with that.  The problem with this is that unless you
14751                      * back off one more character, you won't be guaranteed
14752                      * regatom will get called again, unless regbranch,
14753                      * regpiece ... are also changed.  If you do back off that
14754                      * extra character, so that there is input guaranteed to
14755                      * force calling regatom, you can't handle the case where
14756                      * just the first character in the node is acceptable.  I
14757                      * (khw) decided to try this method which doesn't have that
14758                      * pitfall; if performance issues are found, we can do a
14759                      * combination of the current approach plus that one */
14760                     upper_parse = len;
14761                     len = 0;
14762                     s = s0;
14763                     goto reparse;
14764                 }
14765 	    }   /* End of verifying node ends with an appropriate char */
14766 
14767           loopdone:   /* Jumped to when encounters something that shouldn't be
14768                          in the node */
14769 
14770             /* Free up any over-allocated space; cast is to silence bogus
14771              * warning in MS VC */
14772             change_engine_size(pRExC_state,
14773                                 - (Ptrdiff_t) (initial_size - STR_SZ(len)));
14774 
14775             /* I (khw) don't know if you can get here with zero length, but the
14776              * old code handled this situation by creating a zero-length EXACT
14777              * node.  Might as well be NOTHING instead */
14778             if (len == 0) {
14779                 OP(REGNODE_p(ret)) = NOTHING;
14780             }
14781             else {
14782 
14783                 /* If the node type is EXACT here, check to see if it
14784                  * should be EXACTL, or EXACT_ONLY8. */
14785                 if (node_type == EXACT) {
14786                     if (LOC) {
14787                         node_type = EXACTL;
14788                     }
14789                     else if (requires_utf8_target) {
14790                         node_type = EXACT_ONLY8;
14791                     }
14792                 } else if (FOLD) {
14793                     if (    UNLIKELY(has_micro_sign || has_ss)
14794                         && (node_type == EXACTFU || (   node_type == EXACTF
14795                                                      && maybe_exactfu)))
14796                     {   /* These two conditions are problematic in non-UTF-8
14797                            EXACTFU nodes. */
14798                         assert(! UTF);
14799                         node_type = EXACTFUP;
14800                     }
14801                     else if (node_type == EXACTFL) {
14802 
14803                         /* 'maybe_exactfu' is deliberately set above to
14804                          * indicate this node type, where all code points in it
14805                          * are above 255 */
14806                         if (maybe_exactfu) {
14807                             node_type = EXACTFLU8;
14808                         }
14809                         else if (UNLIKELY(
14810                              _invlist_contains_cp(PL_HasMultiCharFold, ender)))
14811                         {
14812                             /* A character that folds to more than one will
14813                              * match multiple characters, so can't be SIMPLE.
14814                              * We don't have to worry about this with EXACTFLU8
14815                              * nodes just above, as they have already been
14816                              * folded (since the fold doesn't vary at run
14817                              * time).  Here, if the final character in the node
14818                              * folds to multiple, it can't be simple.  (This
14819                              * only has an effect if the node has only a single
14820                              * character, hence the final one, as elsewhere we
14821                              * turn off simple for nodes whose length > 1 */
14822                             maybe_SIMPLE = 0;
14823                         }
14824                     }
14825                     else if (node_type == EXACTF) {  /* Means is /di */
14826 
14827                         /* If 'maybe_exactfu' is clear, then we need to stay
14828                          * /di.  If it is set, it means there are no code
14829                          * points that match differently depending on UTF8ness
14830                          * of the target string, so it can become an EXACTFU
14831                          * node */
14832                         if (! maybe_exactfu) {
14833                             RExC_seen_d_op = TRUE;
14834                         }
14835                         else if (   isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
14836                                  || isALPHA_FOLD_EQ(ender, 's'))
14837                         {
14838                             /* But, if the node begins or ends in an 's' we
14839                              * have to defer changing it into an EXACTFU, as
14840                              * the node could later get joined with another one
14841                              * that ends or begins with 's' creating an 'ss'
14842                              * sequence which would then wrongly match the
14843                              * sharp s without the target being UTF-8.  We
14844                              * create a special node that we resolve later when
14845                              * we join nodes together */
14846 
14847                             node_type = EXACTFU_S_EDGE;
14848                         }
14849                         else {
14850                             node_type = EXACTFU;
14851                         }
14852                     }
14853 
14854                     if (requires_utf8_target && node_type == EXACTFU) {
14855                         node_type = EXACTFU_ONLY8;
14856                     }
14857                 }
14858 
14859                 OP(REGNODE_p(ret)) = node_type;
14860                 STR_LEN(REGNODE_p(ret)) = len;
14861                 RExC_emit += STR_SZ(len);
14862 
14863                 /* If the node isn't a single character, it can't be SIMPLE */
14864                 if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
14865                     maybe_SIMPLE = 0;
14866                 }
14867 
14868                 *flagp |= HASWIDTH | maybe_SIMPLE;
14869             }
14870 
14871             Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
14872             RExC_parse = p;
14873 
14874 	    {
14875 		/* len is STRLEN which is unsigned, need to copy to signed */
14876 		IV iv = len;
14877 		if (iv < 0)
14878 		    vFAIL("Internal disaster");
14879 	    }
14880 
14881 	} /* End of label 'defchar:' */
14882 	break;
14883     } /* End of giant switch on input character */
14884 
14885     /* Position parse to next real character */
14886     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14887                                             FALSE /* Don't force to /x */ );
14888     if (   *RExC_parse == '{'
14889         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
14890     {
14891         if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
14892             RExC_parse++;
14893             vFAIL("Unescaped left brace in regex is illegal here");
14894         }
14895         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
14896                                   " passed through");
14897     }
14898 
14899     return(ret);
14900 }
14901 
14902 
14903 STATIC void
S_populate_ANYOF_from_invlist(pTHX_ regnode * node,SV ** invlist_ptr)14904 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
14905 {
14906     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
14907      * sets up the bitmap and any flags, removing those code points from the
14908      * inversion list, setting it to NULL should it become completely empty */
14909 
14910     dVAR;
14911 
14912     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
14913     assert(PL_regkind[OP(node)] == ANYOF);
14914 
14915     /* There is no bitmap for this node type */
14916     if (OP(node) == ANYOFH) {
14917         return;
14918     }
14919 
14920     ANYOF_BITMAP_ZERO(node);
14921     if (*invlist_ptr) {
14922 
14923 	/* This gets set if we actually need to modify things */
14924 	bool change_invlist = FALSE;
14925 
14926 	UV start, end;
14927 
14928 	/* Start looking through *invlist_ptr */
14929 	invlist_iterinit(*invlist_ptr);
14930 	while (invlist_iternext(*invlist_ptr, &start, &end)) {
14931 	    UV high;
14932 	    int i;
14933 
14934             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
14935                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
14936             }
14937 
14938 	    /* Quit if are above what we should change */
14939 	    if (start >= NUM_ANYOF_CODE_POINTS) {
14940 		break;
14941 	    }
14942 
14943 	    change_invlist = TRUE;
14944 
14945 	    /* Set all the bits in the range, up to the max that we are doing */
14946 	    high = (end < NUM_ANYOF_CODE_POINTS - 1)
14947                    ? end
14948                    : NUM_ANYOF_CODE_POINTS - 1;
14949 	    for (i = start; i <= (int) high; i++) {
14950 		if (! ANYOF_BITMAP_TEST(node, i)) {
14951 		    ANYOF_BITMAP_SET(node, i);
14952 		}
14953 	    }
14954 	}
14955 	invlist_iterfinish(*invlist_ptr);
14956 
14957         /* Done with loop; remove any code points that are in the bitmap from
14958          * *invlist_ptr; similarly for code points above the bitmap if we have
14959          * a flag to match all of them anyways */
14960 	if (change_invlist) {
14961 	    _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
14962 	}
14963         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
14964 	    _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
14965 	}
14966 
14967 	/* If have completely emptied it, remove it completely */
14968 	if (_invlist_len(*invlist_ptr) == 0) {
14969 	    SvREFCNT_dec_NN(*invlist_ptr);
14970 	    *invlist_ptr = NULL;
14971 	}
14972     }
14973 }
14974 
14975 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
14976    Character classes ([:foo:]) can also be negated ([:^foo:]).
14977    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
14978    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
14979    but trigger failures because they are currently unimplemented. */
14980 
14981 #define POSIXCC_DONE(c)   ((c) == ':')
14982 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
14983 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
14984 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
14985 
14986 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
14987 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
14988 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
14989 
14990 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
14991 
14992 /* 'posix_warnings' and 'warn_text' are names of variables in the following
14993  * routine. q.v. */
14994 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
14995         if (posix_warnings) {                                               \
14996             if (! RExC_warn_text ) RExC_warn_text =                         \
14997                                          (AV *) sv_2mortal((SV *) newAV()); \
14998             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
14999                                              WARNING_PREFIX                 \
15000                                              text                           \
15001                                              REPORT_LOCATION,               \
15002                                              REPORT_LOCATION_ARGS(p)));     \
15003         }                                                                   \
15004     } STMT_END
15005 #define CLEAR_POSIX_WARNINGS()                                              \
15006     STMT_START {                                                            \
15007         if (posix_warnings && RExC_warn_text)                               \
15008             av_clear(RExC_warn_text);                                       \
15009     } STMT_END
15010 
15011 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
15012     STMT_START {                                                            \
15013         CLEAR_POSIX_WARNINGS();                                             \
15014         return ret;                                                         \
15015     } STMT_END
15016 
15017 STATIC int
S_handle_possible_posix(pTHX_ RExC_state_t * pRExC_state,const char * const s,char ** updated_parse_ptr,AV ** posix_warnings,const bool check_only)15018 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15019 
15020     const char * const s,      /* Where the putative posix class begins.
15021                                   Normally, this is one past the '['.  This
15022                                   parameter exists so it can be somewhere
15023                                   besides RExC_parse. */
15024     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15025                                   NULL */
15026     AV ** posix_warnings,      /* Where to place any generated warnings, or
15027                                   NULL */
15028     const bool check_only      /* Don't die if error */
15029 )
15030 {
15031     /* This parses what the caller thinks may be one of the three POSIX
15032      * constructs:
15033      *  1) a character class, like [:blank:]
15034      *  2) a collating symbol, like [. .]
15035      *  3) an equivalence class, like [= =]
15036      * In the latter two cases, it croaks if it finds a syntactically legal
15037      * one, as these are not handled by Perl.
15038      *
15039      * The main purpose is to look for a POSIX character class.  It returns:
15040      *  a) the class number
15041      *      if it is a completely syntactically and semantically legal class.
15042      *      'updated_parse_ptr', if not NULL, is set to point to just after the
15043      *      closing ']' of the class
15044      *  b) OOB_NAMEDCLASS
15045      *      if it appears that one of the three POSIX constructs was meant, but
15046      *      its specification was somehow defective.  'updated_parse_ptr', if
15047      *      not NULL, is set to point to the character just after the end
15048      *      character of the class.  See below for handling of warnings.
15049      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15050      *      if it  doesn't appear that a POSIX construct was intended.
15051      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
15052      *      raised.
15053      *
15054      * In b) there may be errors or warnings generated.  If 'check_only' is
15055      * TRUE, then any errors are discarded.  Warnings are returned to the
15056      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
15057      * instead it is NULL, warnings are suppressed.
15058      *
15059      * The reason for this function, and its complexity is that a bracketed
15060      * character class can contain just about anything.  But it's easy to
15061      * mistype the very specific posix class syntax but yielding a valid
15062      * regular bracketed class, so it silently gets compiled into something
15063      * quite unintended.
15064      *
15065      * The solution adopted here maintains backward compatibility except that
15066      * it adds a warning if it looks like a posix class was intended but
15067      * improperly specified.  The warning is not raised unless what is input
15068      * very closely resembles one of the 14 legal posix classes.  To do this,
15069      * it uses fuzzy parsing.  It calculates how many single-character edits it
15070      * would take to transform what was input into a legal posix class.  Only
15071      * if that number is quite small does it think that the intention was a
15072      * posix class.  Obviously these are heuristics, and there will be cases
15073      * where it errs on one side or another, and they can be tweaked as
15074      * experience informs.
15075      *
15076      * The syntax for a legal posix class is:
15077      *
15078      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15079      *
15080      * What this routine considers syntactically to be an intended posix class
15081      * is this (the comments indicate some restrictions that the pattern
15082      * doesn't show):
15083      *
15084      *  qr/(?x: \[?                         # The left bracket, possibly
15085      *                                      # omitted
15086      *          \h*                         # possibly followed by blanks
15087      *          (?: \^ \h* )?               # possibly a misplaced caret
15088      *          [:;]?                       # The opening class character,
15089      *                                      # possibly omitted.  A typo
15090      *                                      # semi-colon can also be used.
15091      *          \h*
15092      *          \^?                         # possibly a correctly placed
15093      *                                      # caret, but not if there was also
15094      *                                      # a misplaced one
15095      *          \h*
15096      *          .{3,15}                     # The class name.  If there are
15097      *                                      # deviations from the legal syntax,
15098      *                                      # its edit distance must be close
15099      *                                      # to a real class name in order
15100      *                                      # for it to be considered to be
15101      *                                      # an intended posix class.
15102      *          \h*
15103      *          [[:punct:]]?                # The closing class character,
15104      *                                      # possibly omitted.  If not a colon
15105      *                                      # nor semi colon, the class name
15106      *                                      # must be even closer to a valid
15107      *                                      # one
15108      *          \h*
15109      *          \]?                         # The right bracket, possibly
15110      *                                      # omitted.
15111      *     )/
15112      *
15113      * In the above, \h must be ASCII-only.
15114      *
15115      * These are heuristics, and can be tweaked as field experience dictates.
15116      * There will be cases when someone didn't intend to specify a posix class
15117      * that this warns as being so.  The goal is to minimize these, while
15118      * maximizing the catching of things intended to be a posix class that
15119      * aren't parsed as such.
15120      */
15121 
15122     const char* p             = s;
15123     const char * const e      = RExC_end;
15124     unsigned complement       = 0;      /* If to complement the class */
15125     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
15126     bool has_opening_bracket  = FALSE;
15127     bool has_opening_colon    = FALSE;
15128     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
15129                                                    valid class */
15130     const char * possible_end = NULL;   /* used for a 2nd parse pass */
15131     const char* name_start;             /* ptr to class name first char */
15132 
15133     /* If the number of single-character typos the input name is away from a
15134      * legal name is no more than this number, it is considered to have meant
15135      * the legal name */
15136     int max_distance          = 2;
15137 
15138     /* to store the name.  The size determines the maximum length before we
15139      * decide that no posix class was intended.  Should be at least
15140      * sizeof("alphanumeric") */
15141     UV input_text[15];
15142     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15143 
15144     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15145 
15146     CLEAR_POSIX_WARNINGS();
15147 
15148     if (p >= e) {
15149         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15150     }
15151 
15152     if (*(p - 1) != '[') {
15153         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15154         found_problem = TRUE;
15155     }
15156     else {
15157         has_opening_bracket = TRUE;
15158     }
15159 
15160     /* They could be confused and think you can put spaces between the
15161      * components */
15162     if (isBLANK(*p)) {
15163         found_problem = TRUE;
15164 
15165         do {
15166             p++;
15167         } while (p < e && isBLANK(*p));
15168 
15169         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15170     }
15171 
15172     /* For [. .] and [= =].  These are quite different internally from [: :],
15173      * so they are handled separately.  */
15174     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15175                                             and 1 for at least one char in it
15176                                           */
15177     {
15178         const char open_char  = *p;
15179         const char * temp_ptr = p + 1;
15180 
15181         /* These two constructs are not handled by perl, and if we find a
15182          * syntactically valid one, we croak.  khw, who wrote this code, finds
15183          * this explanation of them very unclear:
15184          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15185          * And searching the rest of the internet wasn't very helpful either.
15186          * It looks like just about any byte can be in these constructs,
15187          * depending on the locale.  But unless the pattern is being compiled
15188          * under /l, which is very rare, Perl runs under the C or POSIX locale.
15189          * In that case, it looks like [= =] isn't allowed at all, and that
15190          * [. .] could be any single code point, but for longer strings the
15191          * constituent characters would have to be the ASCII alphabetics plus
15192          * the minus-hyphen.  Any sensible locale definition would limit itself
15193          * to these.  And any portable one definitely should.  Trying to parse
15194          * the general case is a nightmare (see [perl #127604]).  So, this code
15195          * looks only for interiors of these constructs that match:
15196          *      qr/.|[-\w]{2,}/
15197          * Using \w relaxes the apparent rules a little, without adding much
15198          * danger of mistaking something else for one of these constructs.
15199          *
15200          * [. .] in some implementations described on the internet is usable to
15201          * escape a character that otherwise is special in bracketed character
15202          * classes.  For example [.].] means a literal right bracket instead of
15203          * the ending of the class
15204          *
15205          * [= =] can legitimately contain a [. .] construct, but we don't
15206          * handle this case, as that [. .] construct will later get parsed
15207          * itself and croak then.  And [= =] is checked for even when not under
15208          * /l, as Perl has long done so.
15209          *
15210          * The code below relies on there being a trailing NUL, so it doesn't
15211          * have to keep checking if the parse ptr < e.
15212          */
15213         if (temp_ptr[1] == open_char) {
15214             temp_ptr++;
15215         }
15216         else while (    temp_ptr < e
15217                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15218         {
15219             temp_ptr++;
15220         }
15221 
15222         if (*temp_ptr == open_char) {
15223             temp_ptr++;
15224             if (*temp_ptr == ']') {
15225                 temp_ptr++;
15226                 if (! found_problem && ! check_only) {
15227                     RExC_parse = (char *) temp_ptr;
15228                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
15229                             "extensions", open_char, open_char);
15230                 }
15231 
15232                 /* Here, the syntax wasn't completely valid, or else the call
15233                  * is to check-only */
15234                 if (updated_parse_ptr) {
15235                     *updated_parse_ptr = (char *) temp_ptr;
15236                 }
15237 
15238                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15239             }
15240         }
15241 
15242         /* If we find something that started out to look like one of these
15243          * constructs, but isn't, we continue below so that it can be checked
15244          * for being a class name with a typo of '.' or '=' instead of a colon.
15245          * */
15246     }
15247 
15248     /* Here, we think there is a possibility that a [: :] class was meant, and
15249      * we have the first real character.  It could be they think the '^' comes
15250      * first */
15251     if (*p == '^') {
15252         found_problem = TRUE;
15253         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15254         complement = 1;
15255         p++;
15256 
15257         if (isBLANK(*p)) {
15258             found_problem = TRUE;
15259 
15260             do {
15261                 p++;
15262             } while (p < e && isBLANK(*p));
15263 
15264             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15265         }
15266     }
15267 
15268     /* But the first character should be a colon, which they could have easily
15269      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15270      * distinguish from a colon, so treat that as a colon).  */
15271     if (*p == ':') {
15272         p++;
15273         has_opening_colon = TRUE;
15274     }
15275     else if (*p == ';') {
15276         found_problem = TRUE;
15277         p++;
15278         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15279         has_opening_colon = TRUE;
15280     }
15281     else {
15282         found_problem = TRUE;
15283         ADD_POSIX_WARNING(p, "there must be a starting ':'");
15284 
15285         /* Consider an initial punctuation (not one of the recognized ones) to
15286          * be a left terminator */
15287         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15288             p++;
15289         }
15290     }
15291 
15292     /* They may think that you can put spaces between the components */
15293     if (isBLANK(*p)) {
15294         found_problem = TRUE;
15295 
15296         do {
15297             p++;
15298         } while (p < e && isBLANK(*p));
15299 
15300         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15301     }
15302 
15303     if (*p == '^') {
15304 
15305         /* We consider something like [^:^alnum:]] to not have been intended to
15306          * be a posix class, but XXX maybe we should */
15307         if (complement) {
15308             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15309         }
15310 
15311         complement = 1;
15312         p++;
15313     }
15314 
15315     /* Again, they may think that you can put spaces between the components */
15316     if (isBLANK(*p)) {
15317         found_problem = TRUE;
15318 
15319         do {
15320             p++;
15321         } while (p < e && isBLANK(*p));
15322 
15323         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15324     }
15325 
15326     if (*p == ']') {
15327 
15328         /* XXX This ']' may be a typo, and something else was meant.  But
15329          * treating it as such creates enough complications, that that
15330          * possibility isn't currently considered here.  So we assume that the
15331          * ']' is what is intended, and if we've already found an initial '[',
15332          * this leaves this construct looking like [:] or [:^], which almost
15333          * certainly weren't intended to be posix classes */
15334         if (has_opening_bracket) {
15335             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15336         }
15337 
15338         /* But this function can be called when we parse the colon for
15339          * something like qr/[alpha:]]/, so we back up to look for the
15340          * beginning */
15341         p--;
15342 
15343         if (*p == ';') {
15344             found_problem = TRUE;
15345             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15346         }
15347         else if (*p != ':') {
15348 
15349             /* XXX We are currently very restrictive here, so this code doesn't
15350              * consider the possibility that, say, /[alpha.]]/ was intended to
15351              * be a posix class. */
15352             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15353         }
15354 
15355         /* Here we have something like 'foo:]'.  There was no initial colon,
15356          * and we back up over 'foo.  XXX Unlike the going forward case, we
15357          * don't handle typos of non-word chars in the middle */
15358         has_opening_colon = FALSE;
15359         p--;
15360 
15361         while (p > RExC_start && isWORDCHAR(*p)) {
15362             p--;
15363         }
15364         p++;
15365 
15366         /* Here, we have positioned ourselves to where we think the first
15367          * character in the potential class is */
15368     }
15369 
15370     /* Now the interior really starts.  There are certain key characters that
15371      * can end the interior, or these could just be typos.  To catch both
15372      * cases, we may have to do two passes.  In the first pass, we keep on
15373      * going unless we come to a sequence that matches
15374      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
15375      * This means it takes a sequence to end the pass, so two typos in a row if
15376      * that wasn't what was intended.  If the class is perfectly formed, just
15377      * this one pass is needed.  We also stop if there are too many characters
15378      * being accumulated, but this number is deliberately set higher than any
15379      * real class.  It is set high enough so that someone who thinks that
15380      * 'alphanumeric' is a correct name would get warned that it wasn't.
15381      * While doing the pass, we keep track of where the key characters were in
15382      * it.  If we don't find an end to the class, and one of the key characters
15383      * was found, we redo the pass, but stop when we get to that character.
15384      * Thus the key character was considered a typo in the first pass, but a
15385      * terminator in the second.  If two key characters are found, we stop at
15386      * the second one in the first pass.  Again this can miss two typos, but
15387      * catches a single one
15388      *
15389      * In the first pass, 'possible_end' starts as NULL, and then gets set to
15390      * point to the first key character.  For the second pass, it starts as -1.
15391      * */
15392 
15393     name_start = p;
15394   parse_name:
15395     {
15396         bool has_blank               = FALSE;
15397         bool has_upper               = FALSE;
15398         bool has_terminating_colon   = FALSE;
15399         bool has_terminating_bracket = FALSE;
15400         bool has_semi_colon          = FALSE;
15401         unsigned int name_len        = 0;
15402         int punct_count              = 0;
15403 
15404         while (p < e) {
15405 
15406             /* Squeeze out blanks when looking up the class name below */
15407             if (isBLANK(*p) ) {
15408                 has_blank = TRUE;
15409                 found_problem = TRUE;
15410                 p++;
15411                 continue;
15412             }
15413 
15414             /* The name will end with a punctuation */
15415             if (isPUNCT(*p)) {
15416                 const char * peek = p + 1;
15417 
15418                 /* Treat any non-']' punctuation followed by a ']' (possibly
15419                  * with intervening blanks) as trying to terminate the class.
15420                  * ']]' is very likely to mean a class was intended (but
15421                  * missing the colon), but the warning message that gets
15422                  * generated shows the error position better if we exit the
15423                  * loop at the bottom (eventually), so skip it here. */
15424                 if (*p != ']') {
15425                     if (peek < e && isBLANK(*peek)) {
15426                         has_blank = TRUE;
15427                         found_problem = TRUE;
15428                         do {
15429                             peek++;
15430                         } while (peek < e && isBLANK(*peek));
15431                     }
15432 
15433                     if (peek < e && *peek == ']') {
15434                         has_terminating_bracket = TRUE;
15435                         if (*p == ':') {
15436                             has_terminating_colon = TRUE;
15437                         }
15438                         else if (*p == ';') {
15439                             has_semi_colon = TRUE;
15440                             has_terminating_colon = TRUE;
15441                         }
15442                         else {
15443                             found_problem = TRUE;
15444                         }
15445                         p = peek + 1;
15446                         goto try_posix;
15447                     }
15448                 }
15449 
15450                 /* Here we have punctuation we thought didn't end the class.
15451                  * Keep track of the position of the key characters that are
15452                  * more likely to have been class-enders */
15453                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15454 
15455                     /* Allow just one such possible class-ender not actually
15456                      * ending the class. */
15457                     if (possible_end) {
15458                         break;
15459                     }
15460                     possible_end = p;
15461                 }
15462 
15463                 /* If we have too many punctuation characters, no use in
15464                  * keeping going */
15465                 if (++punct_count > max_distance) {
15466                     break;
15467                 }
15468 
15469                 /* Treat the punctuation as a typo. */
15470                 input_text[name_len++] = *p;
15471                 p++;
15472             }
15473             else if (isUPPER(*p)) { /* Use lowercase for lookup */
15474                 input_text[name_len++] = toLOWER(*p);
15475                 has_upper = TRUE;
15476                 found_problem = TRUE;
15477                 p++;
15478             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15479                 input_text[name_len++] = *p;
15480                 p++;
15481             }
15482             else {
15483                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15484                 p+= UTF8SKIP(p);
15485             }
15486 
15487             /* The declaration of 'input_text' is how long we allow a potential
15488              * class name to be, before saying they didn't mean a class name at
15489              * all */
15490             if (name_len >= C_ARRAY_LENGTH(input_text)) {
15491                 break;
15492             }
15493         }
15494 
15495         /* We get to here when the possible class name hasn't been properly
15496          * terminated before:
15497          *   1) we ran off the end of the pattern; or
15498          *   2) found two characters, each of which might have been intended to
15499          *      be the name's terminator
15500          *   3) found so many punctuation characters in the purported name,
15501          *      that the edit distance to a valid one is exceeded
15502          *   4) we decided it was more characters than anyone could have
15503          *      intended to be one. */
15504 
15505         found_problem = TRUE;
15506 
15507         /* In the final two cases, we know that looking up what we've
15508          * accumulated won't lead to a match, even a fuzzy one. */
15509         if (   name_len >= C_ARRAY_LENGTH(input_text)
15510             || punct_count > max_distance)
15511         {
15512             /* If there was an intermediate key character that could have been
15513              * an intended end, redo the parse, but stop there */
15514             if (possible_end && possible_end != (char *) -1) {
15515                 possible_end = (char *) -1; /* Special signal value to say
15516                                                we've done a first pass */
15517                 p = name_start;
15518                 goto parse_name;
15519             }
15520 
15521             /* Otherwise, it can't have meant to have been a class */
15522             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15523         }
15524 
15525         /* If we ran off the end, and the final character was a punctuation
15526          * one, back up one, to look at that final one just below.  Later, we
15527          * will restore the parse pointer if appropriate */
15528         if (name_len && p == e && isPUNCT(*(p-1))) {
15529             p--;
15530             name_len--;
15531         }
15532 
15533         if (p < e && isPUNCT(*p)) {
15534             if (*p == ']') {
15535                 has_terminating_bracket = TRUE;
15536 
15537                 /* If this is a 2nd ']', and the first one is just below this
15538                  * one, consider that to be the real terminator.  This gives a
15539                  * uniform and better positioning for the warning message  */
15540                 if (   possible_end
15541                     && possible_end != (char *) -1
15542                     && *possible_end == ']'
15543                     && name_len && input_text[name_len - 1] == ']')
15544                 {
15545                     name_len--;
15546                     p = possible_end;
15547 
15548                     /* And this is actually equivalent to having done the 2nd
15549                      * pass now, so set it to not try again */
15550                     possible_end = (char *) -1;
15551                 }
15552             }
15553             else {
15554                 if (*p == ':') {
15555                     has_terminating_colon = TRUE;
15556                 }
15557                 else if (*p == ';') {
15558                     has_semi_colon = TRUE;
15559                     has_terminating_colon = TRUE;
15560                 }
15561                 p++;
15562             }
15563         }
15564 
15565     try_posix:
15566 
15567         /* Here, we have a class name to look up.  We can short circuit the
15568          * stuff below for short names that can't possibly be meant to be a
15569          * class name.  (We can do this on the first pass, as any second pass
15570          * will yield an even shorter name) */
15571         if (name_len < 3) {
15572             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15573         }
15574 
15575         /* Find which class it is.  Initially switch on the length of the name.
15576          * */
15577         switch (name_len) {
15578             case 4:
15579                 if (memEQs(name_start, 4, "word")) {
15580                     /* this is not POSIX, this is the Perl \w */
15581                     class_number = ANYOF_WORDCHAR;
15582                 }
15583                 break;
15584             case 5:
15585                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
15586                  *                        graph lower print punct space upper
15587                  * Offset 4 gives the best switch position.  */
15588                 switch (name_start[4]) {
15589                     case 'a':
15590                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
15591                             class_number = ANYOF_ALPHA;
15592                         break;
15593                     case 'e':
15594                         if (memBEGINs(name_start, 5, "spac")) /* space */
15595                             class_number = ANYOF_SPACE;
15596                         break;
15597                     case 'h':
15598                         if (memBEGINs(name_start, 5, "grap")) /* graph */
15599                             class_number = ANYOF_GRAPH;
15600                         break;
15601                     case 'i':
15602                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
15603                             class_number = ANYOF_ASCII;
15604                         break;
15605                     case 'k':
15606                         if (memBEGINs(name_start, 5, "blan")) /* blank */
15607                             class_number = ANYOF_BLANK;
15608                         break;
15609                     case 'l':
15610                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
15611                             class_number = ANYOF_CNTRL;
15612                         break;
15613                     case 'm':
15614                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
15615                             class_number = ANYOF_ALPHANUMERIC;
15616                         break;
15617                     case 'r':
15618                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
15619                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
15620                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
15621                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
15622                         break;
15623                     case 't':
15624                         if (memBEGINs(name_start, 5, "digi")) /* digit */
15625                             class_number = ANYOF_DIGIT;
15626                         else if (memBEGINs(name_start, 5, "prin")) /* print */
15627                             class_number = ANYOF_PRINT;
15628                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
15629                             class_number = ANYOF_PUNCT;
15630                         break;
15631                 }
15632                 break;
15633             case 6:
15634                 if (memEQs(name_start, 6, "xdigit"))
15635                     class_number = ANYOF_XDIGIT;
15636                 break;
15637         }
15638 
15639         /* If the name exactly matches a posix class name the class number will
15640          * here be set to it, and the input almost certainly was meant to be a
15641          * posix class, so we can skip further checking.  If instead the syntax
15642          * is exactly correct, but the name isn't one of the legal ones, we
15643          * will return that as an error below.  But if neither of these apply,
15644          * it could be that no posix class was intended at all, or that one
15645          * was, but there was a typo.  We tease these apart by doing fuzzy
15646          * matching on the name */
15647         if (class_number == OOB_NAMEDCLASS && found_problem) {
15648             const UV posix_names[][6] = {
15649                                                 { 'a', 'l', 'n', 'u', 'm' },
15650                                                 { 'a', 'l', 'p', 'h', 'a' },
15651                                                 { 'a', 's', 'c', 'i', 'i' },
15652                                                 { 'b', 'l', 'a', 'n', 'k' },
15653                                                 { 'c', 'n', 't', 'r', 'l' },
15654                                                 { 'd', 'i', 'g', 'i', 't' },
15655                                                 { 'g', 'r', 'a', 'p', 'h' },
15656                                                 { 'l', 'o', 'w', 'e', 'r' },
15657                                                 { 'p', 'r', 'i', 'n', 't' },
15658                                                 { 'p', 'u', 'n', 'c', 't' },
15659                                                 { 's', 'p', 'a', 'c', 'e' },
15660                                                 { 'u', 'p', 'p', 'e', 'r' },
15661                                                 { 'w', 'o', 'r', 'd' },
15662                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
15663                                             };
15664             /* The names of the above all have added NULs to make them the same
15665              * size, so we need to also have the real lengths */
15666             const UV posix_name_lengths[] = {
15667                                                 sizeof("alnum") - 1,
15668                                                 sizeof("alpha") - 1,
15669                                                 sizeof("ascii") - 1,
15670                                                 sizeof("blank") - 1,
15671                                                 sizeof("cntrl") - 1,
15672                                                 sizeof("digit") - 1,
15673                                                 sizeof("graph") - 1,
15674                                                 sizeof("lower") - 1,
15675                                                 sizeof("print") - 1,
15676                                                 sizeof("punct") - 1,
15677                                                 sizeof("space") - 1,
15678                                                 sizeof("upper") - 1,
15679                                                 sizeof("word")  - 1,
15680                                                 sizeof("xdigit")- 1
15681                                             };
15682             unsigned int i;
15683             int temp_max = max_distance;    /* Use a temporary, so if we
15684                                                reparse, we haven't changed the
15685                                                outer one */
15686 
15687             /* Use a smaller max edit distance if we are missing one of the
15688              * delimiters */
15689             if (   has_opening_bracket + has_opening_colon < 2
15690                 || has_terminating_bracket + has_terminating_colon < 2)
15691             {
15692                 temp_max--;
15693             }
15694 
15695             /* See if the input name is close to a legal one */
15696             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
15697 
15698                 /* Short circuit call if the lengths are too far apart to be
15699                  * able to match */
15700                 if (abs( (int) (name_len - posix_name_lengths[i]))
15701                     > temp_max)
15702                 {
15703                     continue;
15704                 }
15705 
15706                 if (edit_distance(input_text,
15707                                   posix_names[i],
15708                                   name_len,
15709                                   posix_name_lengths[i],
15710                                   temp_max
15711                                  )
15712                     > -1)
15713                 { /* If it is close, it probably was intended to be a class */
15714                     goto probably_meant_to_be;
15715                 }
15716             }
15717 
15718             /* Here the input name is not close enough to a valid class name
15719              * for us to consider it to be intended to be a posix class.  If
15720              * we haven't already done so, and the parse found a character that
15721              * could have been terminators for the name, but which we absorbed
15722              * as typos during the first pass, repeat the parse, signalling it
15723              * to stop at that character */
15724             if (possible_end && possible_end != (char *) -1) {
15725                 possible_end = (char *) -1;
15726                 p = name_start;
15727                 goto parse_name;
15728             }
15729 
15730             /* Here neither pass found a close-enough class name */
15731             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15732         }
15733 
15734     probably_meant_to_be:
15735 
15736         /* Here we think that a posix specification was intended.  Update any
15737          * parse pointer */
15738         if (updated_parse_ptr) {
15739             *updated_parse_ptr = (char *) p;
15740         }
15741 
15742         /* If a posix class name was intended but incorrectly specified, we
15743          * output or return the warnings */
15744         if (found_problem) {
15745 
15746             /* We set flags for these issues in the parse loop above instead of
15747              * adding them to the list of warnings, because we can parse it
15748              * twice, and we only want one warning instance */
15749             if (has_upper) {
15750                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
15751             }
15752             if (has_blank) {
15753                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15754             }
15755             if (has_semi_colon) {
15756                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15757             }
15758             else if (! has_terminating_colon) {
15759                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
15760             }
15761             if (! has_terminating_bracket) {
15762                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
15763             }
15764 
15765             if (   posix_warnings
15766                 && RExC_warn_text
15767                 && av_top_index(RExC_warn_text) > -1)
15768             {
15769                 *posix_warnings = RExC_warn_text;
15770             }
15771         }
15772         else if (class_number != OOB_NAMEDCLASS) {
15773             /* If it is a known class, return the class.  The class number
15774              * #defines are structured so each complement is +1 to the normal
15775              * one */
15776             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
15777         }
15778         else if (! check_only) {
15779 
15780             /* Here, it is an unrecognized class.  This is an error (unless the
15781             * call is to check only, which we've already handled above) */
15782             const char * const complement_string = (complement)
15783                                                    ? "^"
15784                                                    : "";
15785             RExC_parse = (char *) p;
15786             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
15787                         complement_string,
15788                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
15789         }
15790     }
15791 
15792     return OOB_NAMEDCLASS;
15793 }
15794 #undef ADD_POSIX_WARNING
15795 
15796 STATIC unsigned  int
S_regex_set_precedence(const U8 my_operator)15797 S_regex_set_precedence(const U8 my_operator) {
15798 
15799     /* Returns the precedence in the (?[...]) construct of the input operator,
15800      * specified by its character representation.  The precedence follows
15801      * general Perl rules, but it extends this so that ')' and ']' have (low)
15802      * precedence even though they aren't really operators */
15803 
15804     switch (my_operator) {
15805         case '!':
15806             return 5;
15807         case '&':
15808             return 4;
15809         case '^':
15810         case '|':
15811         case '+':
15812         case '-':
15813             return 3;
15814         case ')':
15815             return 2;
15816         case ']':
15817             return 1;
15818     }
15819 
15820     NOT_REACHED; /* NOTREACHED */
15821     return 0;   /* Silence compiler warning */
15822 }
15823 
15824 STATIC regnode_offset
S_handle_regex_sets(pTHX_ RExC_state_t * pRExC_state,SV ** return_invlist,I32 * flagp,U32 depth,char * const oregcomp_parse)15825 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
15826                     I32 *flagp, U32 depth,
15827                     char * const oregcomp_parse)
15828 {
15829     /* Handle the (?[...]) construct to do set operations */
15830 
15831     U8 curchar;                     /* Current character being parsed */
15832     UV start, end;	            /* End points of code point ranges */
15833     SV* final = NULL;               /* The end result inversion list */
15834     SV* result_string;              /* 'final' stringified */
15835     AV* stack;                      /* stack of operators and operands not yet
15836                                        resolved */
15837     AV* fence_stack = NULL;         /* A stack containing the positions in
15838                                        'stack' of where the undealt-with left
15839                                        parens would be if they were actually
15840                                        put there */
15841     /* The 'volatile' is a workaround for an optimiser bug
15842      * in Solaris Studio 12.3. See RT #127455 */
15843     volatile IV fence = 0;          /* Position of where most recent undealt-
15844                                        with left paren in stack is; -1 if none.
15845                                      */
15846     STRLEN len;                     /* Temporary */
15847     regnode_offset node;                  /* Temporary, and final regnode returned by
15848                                        this function */
15849     const bool save_fold = FOLD;    /* Temporary */
15850     char *save_end, *save_parse;    /* Temporaries */
15851     const bool in_locale = LOC;     /* we turn off /l during processing */
15852 
15853     GET_RE_DEBUG_FLAGS_DECL;
15854 
15855     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
15856 
15857     DEBUG_PARSE("xcls");
15858 
15859     if (in_locale) {
15860         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
15861     }
15862 
15863     /* The use of this operator implies /u.  This is required so that the
15864      * compile time values are valid in all runtime cases */
15865     REQUIRE_UNI_RULES(flagp, 0);
15866 
15867     ckWARNexperimental(RExC_parse,
15868                        WARN_EXPERIMENTAL__REGEX_SETS,
15869                        "The regex_sets feature is experimental");
15870 
15871     /* Everything in this construct is a metacharacter.  Operands begin with
15872      * either a '\' (for an escape sequence), or a '[' for a bracketed
15873      * character class.  Any other character should be an operator, or
15874      * parenthesis for grouping.  Both types of operands are handled by calling
15875      * regclass() to parse them.  It is called with a parameter to indicate to
15876      * return the computed inversion list.  The parsing here is implemented via
15877      * a stack.  Each entry on the stack is a single character representing one
15878      * of the operators; or else a pointer to an operand inversion list. */
15879 
15880 #define IS_OPERATOR(a) SvIOK(a)
15881 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
15882 
15883     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
15884      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
15885      * with pronouncing it called it Reverse Polish instead, but now that YOU
15886      * know how to pronounce it you can use the correct term, thus giving due
15887      * credit to the person who invented it, and impressing your geek friends.
15888      * Wikipedia says that the pronounciation of "Ł" has been changing so that
15889      * it is now more like an English initial W (as in wonk) than an L.)
15890      *
15891      * This means that, for example, 'a | b & c' is stored on the stack as
15892      *
15893      * c  [4]
15894      * b  [3]
15895      * &  [2]
15896      * a  [1]
15897      * |  [0]
15898      *
15899      * where the numbers in brackets give the stack [array] element number.
15900      * In this implementation, parentheses are not stored on the stack.
15901      * Instead a '(' creates a "fence" so that the part of the stack below the
15902      * fence is invisible except to the corresponding ')' (this allows us to
15903      * replace testing for parens, by using instead subtraction of the fence
15904      * position).  As new operands are processed they are pushed onto the stack
15905      * (except as noted in the next paragraph).  New operators of higher
15906      * precedence than the current final one are inserted on the stack before
15907      * the lhs operand (so that when the rhs is pushed next, everything will be
15908      * in the correct positions shown above.  When an operator of equal or
15909      * lower precedence is encountered in parsing, all the stacked operations
15910      * of equal or higher precedence are evaluated, leaving the result as the
15911      * top entry on the stack.  This makes higher precedence operations
15912      * evaluate before lower precedence ones, and causes operations of equal
15913      * precedence to left associate.
15914      *
15915      * The only unary operator '!' is immediately pushed onto the stack when
15916      * encountered.  When an operand is encountered, if the top of the stack is
15917      * a '!", the complement is immediately performed, and the '!' popped.  The
15918      * resulting value is treated as a new operand, and the logic in the
15919      * previous paragraph is executed.  Thus in the expression
15920      *      [a] + ! [b]
15921      * the stack looks like
15922      *
15923      * !
15924      * a
15925      * +
15926      *
15927      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
15928      * becomes
15929      *
15930      * !b
15931      * a
15932      * +
15933      *
15934      * A ')' is treated as an operator with lower precedence than all the
15935      * aforementioned ones, which causes all operations on the stack above the
15936      * corresponding '(' to be evaluated down to a single resultant operand.
15937      * Then the fence for the '(' is removed, and the operand goes through the
15938      * algorithm above, without the fence.
15939      *
15940      * A separate stack is kept of the fence positions, so that the position of
15941      * the latest so-far unbalanced '(' is at the top of it.
15942      *
15943      * The ']' ending the construct is treated as the lowest operator of all,
15944      * so that everything gets evaluated down to a single operand, which is the
15945      * result */
15946 
15947     sv_2mortal((SV *)(stack = newAV()));
15948     sv_2mortal((SV *)(fence_stack = newAV()));
15949 
15950     while (RExC_parse < RExC_end) {
15951         I32 top_index;              /* Index of top-most element in 'stack' */
15952         SV** top_ptr;               /* Pointer to top 'stack' element */
15953         SV* current = NULL;         /* To contain the current inversion list
15954                                        operand */
15955         SV* only_to_avoid_leaks;
15956 
15957         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15958                                 TRUE /* Force /x */ );
15959         if (RExC_parse >= RExC_end) {   /* Fail */
15960             break;
15961         }
15962 
15963         curchar = UCHARAT(RExC_parse);
15964 
15965 redo_curchar:
15966 
15967 #ifdef ENABLE_REGEX_SETS_DEBUGGING
15968                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
15969         DEBUG_U(dump_regex_sets_structures(pRExC_state,
15970                                            stack, fence, fence_stack));
15971 #endif
15972 
15973         top_index = av_tindex_skip_len_mg(stack);
15974 
15975         switch (curchar) {
15976             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
15977             char stacked_operator;  /* The topmost operator on the 'stack'. */
15978             SV* lhs;                /* Operand to the left of the operator */
15979             SV* rhs;                /* Operand to the right of the operator */
15980             SV* fence_ptr;          /* Pointer to top element of the fence
15981                                        stack */
15982 
15983             case '(':
15984 
15985                 if (   RExC_parse < RExC_end - 2
15986                     && UCHARAT(RExC_parse + 1) == '?'
15987                     && UCHARAT(RExC_parse + 2) == '^')
15988                 {
15989                     /* If is a '(?', could be an embedded '(?^flags:(?[...])'.
15990                      * This happens when we have some thing like
15991                      *
15992                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
15993                      *   ...
15994                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
15995                      *
15996                      * Here we would be handling the interpolated
15997                      * '$thai_or_lao'.  We handle this by a recursive call to
15998                      * ourselves which returns the inversion list the
15999                      * interpolated expression evaluates to.  We use the flags
16000                      * from the interpolated pattern. */
16001                     U32 save_flags = RExC_flags;
16002                     const char * save_parse;
16003 
16004                     RExC_parse += 2;        /* Skip past the '(?' */
16005                     save_parse = RExC_parse;
16006 
16007                     /* Parse the flags for the '(?'.  We already know the first
16008                      * flag to parse is a '^' */
16009                     parse_lparen_question_flags(pRExC_state);
16010 
16011                     if (   RExC_parse >= RExC_end - 4
16012                         || UCHARAT(RExC_parse) != ':'
16013                         || UCHARAT(++RExC_parse) != '('
16014                         || UCHARAT(++RExC_parse) != '?'
16015                         || UCHARAT(++RExC_parse) != '[')
16016                     {
16017 
16018                         /* In combination with the above, this moves the
16019                          * pointer to the point just after the first erroneous
16020                          * character. */
16021                         if (RExC_parse >= RExC_end - 4) {
16022                             RExC_parse = RExC_end;
16023                         }
16024                         else if (RExC_parse != save_parse) {
16025                             RExC_parse += (UTF)
16026                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
16027                                           : 1;
16028                         }
16029                         vFAIL("Expecting '(?flags:(?[...'");
16030                     }
16031 
16032                     /* Recurse, with the meat of the embedded expression */
16033                     RExC_parse++;
16034                     if (! handle_regex_sets(pRExC_state, &current, flagp,
16035                                                     depth+1, oregcomp_parse))
16036                     {
16037                         RETURN_FAIL_ON_RESTART(*flagp, flagp);
16038                     }
16039 
16040                     /* Here, 'current' contains the embedded expression's
16041                      * inversion list, and RExC_parse points to the trailing
16042                      * ']'; the next character should be the ')' */
16043                     RExC_parse++;
16044                     if (UCHARAT(RExC_parse) != ')')
16045                         vFAIL("Expecting close paren for nested extended charclass");
16046 
16047                     /* Then the ')' matching the original '(' handled by this
16048                      * case: statement */
16049                     RExC_parse++;
16050                     if (UCHARAT(RExC_parse) != ')')
16051                         vFAIL("Expecting close paren for wrapper for nested extended charclass");
16052 
16053                     RExC_flags = save_flags;
16054                     goto handle_operand;
16055                 }
16056 
16057                 /* A regular '('.  Look behind for illegal syntax */
16058                 if (top_index - fence >= 0) {
16059                     /* If the top entry on the stack is an operator, it had
16060                      * better be a '!', otherwise the entry below the top
16061                      * operand should be an operator */
16062                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
16063                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16064                         || (   IS_OPERAND(*top_ptr)
16065                             && (   top_index - fence < 1
16066                                 || ! (stacked_ptr = av_fetch(stack,
16067                                                              top_index - 1,
16068                                                              FALSE))
16069                                 || ! IS_OPERATOR(*stacked_ptr))))
16070                     {
16071                         RExC_parse++;
16072                         vFAIL("Unexpected '(' with no preceding operator");
16073                     }
16074                 }
16075 
16076                 /* Stack the position of this undealt-with left paren */
16077                 av_push(fence_stack, newSViv(fence));
16078                 fence = top_index + 1;
16079                 break;
16080 
16081             case '\\':
16082                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16083                  * multi-char folds are allowed.  */
16084                 if (!regclass(pRExC_state, flagp, depth+1,
16085                               TRUE, /* means parse just the next thing */
16086                               FALSE, /* don't allow multi-char folds */
16087                               FALSE, /* don't silence non-portable warnings.  */
16088                               TRUE,  /* strict */
16089                               FALSE, /* Require return to be an ANYOF */
16090                               &current))
16091                 {
16092                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16093                     goto regclass_failed;
16094                 }
16095 
16096                 /* regclass() will return with parsing just the \ sequence,
16097                  * leaving the parse pointer at the next thing to parse */
16098                 RExC_parse--;
16099                 goto handle_operand;
16100 
16101             case '[':   /* Is a bracketed character class */
16102             {
16103                 /* See if this is a [:posix:] class. */
16104                 bool is_posix_class = (OOB_NAMEDCLASS
16105                             < handle_possible_posix(pRExC_state,
16106                                                 RExC_parse + 1,
16107                                                 NULL,
16108                                                 NULL,
16109                                                 TRUE /* checking only */));
16110                 /* If it is a posix class, leave the parse pointer at the '['
16111                  * to fool regclass() into thinking it is part of a
16112                  * '[[:posix:]]'. */
16113                 if (! is_posix_class) {
16114                     RExC_parse++;
16115                 }
16116 
16117                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16118                  * multi-char folds are allowed.  */
16119                 if (!regclass(pRExC_state, flagp, depth+1,
16120                                 is_posix_class, /* parse the whole char
16121                                                     class only if not a
16122                                                     posix class */
16123                                 FALSE, /* don't allow multi-char folds */
16124                                 TRUE, /* silence non-portable warnings. */
16125                                 TRUE, /* strict */
16126                                 FALSE, /* Require return to be an ANYOF */
16127                                 &current))
16128                 {
16129                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16130                     goto regclass_failed;
16131                 }
16132 
16133                 if (! current) {
16134                     break;
16135                 }
16136 
16137                 /* function call leaves parse pointing to the ']', except if we
16138                  * faked it */
16139                 if (is_posix_class) {
16140                     RExC_parse--;
16141                 }
16142 
16143                 goto handle_operand;
16144             }
16145 
16146             case ']':
16147                 if (top_index >= 1) {
16148                     goto join_operators;
16149                 }
16150 
16151                 /* Only a single operand on the stack: are done */
16152                 goto done;
16153 
16154             case ')':
16155                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16156                     if (UCHARAT(RExC_parse - 1) == ']')  {
16157                         break;
16158                     }
16159                     RExC_parse++;
16160                     vFAIL("Unexpected ')'");
16161                 }
16162 
16163                 /* If nothing after the fence, is missing an operand */
16164                 if (top_index - fence < 0) {
16165                     RExC_parse++;
16166                     goto bad_syntax;
16167                 }
16168                 /* If at least two things on the stack, treat this as an
16169                   * operator */
16170                 if (top_index - fence >= 1) {
16171                     goto join_operators;
16172                 }
16173 
16174                 /* Here only a single thing on the fenced stack, and there is a
16175                  * fence.  Get rid of it */
16176                 fence_ptr = av_pop(fence_stack);
16177                 assert(fence_ptr);
16178                 fence = SvIV(fence_ptr);
16179                 SvREFCNT_dec_NN(fence_ptr);
16180                 fence_ptr = NULL;
16181 
16182                 if (fence < 0) {
16183                     fence = 0;
16184                 }
16185 
16186                 /* Having gotten rid of the fence, we pop the operand at the
16187                  * stack top and process it as a newly encountered operand */
16188                 current = av_pop(stack);
16189                 if (IS_OPERAND(current)) {
16190                     goto handle_operand;
16191                 }
16192 
16193                 RExC_parse++;
16194                 goto bad_syntax;
16195 
16196             case '&':
16197             case '|':
16198             case '+':
16199             case '-':
16200             case '^':
16201 
16202                 /* These binary operators should have a left operand already
16203                  * parsed */
16204                 if (   top_index - fence < 0
16205                     || top_index - fence == 1
16206                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16207                     || ! IS_OPERAND(*top_ptr))
16208                 {
16209                     goto unexpected_binary;
16210                 }
16211 
16212                 /* If only the one operand is on the part of the stack visible
16213                  * to us, we just place this operator in the proper position */
16214                 if (top_index - fence < 2) {
16215 
16216                     /* Place the operator before the operand */
16217 
16218                     SV* lhs = av_pop(stack);
16219                     av_push(stack, newSVuv(curchar));
16220                     av_push(stack, lhs);
16221                     break;
16222                 }
16223 
16224                 /* But if there is something else on the stack, we need to
16225                  * process it before this new operator if and only if the
16226                  * stacked operation has equal or higher precedence than the
16227                  * new one */
16228 
16229              join_operators:
16230 
16231                 /* The operator on the stack is supposed to be below both its
16232                  * operands */
16233                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16234                     || IS_OPERAND(*stacked_ptr))
16235                 {
16236                     /* But if not, it's legal and indicates we are completely
16237                      * done if and only if we're currently processing a ']',
16238                      * which should be the final thing in the expression */
16239                     if (curchar == ']') {
16240                         goto done;
16241                     }
16242 
16243                   unexpected_binary:
16244                     RExC_parse++;
16245                     vFAIL2("Unexpected binary operator '%c' with no "
16246                            "preceding operand", curchar);
16247                 }
16248                 stacked_operator = (char) SvUV(*stacked_ptr);
16249 
16250                 if (regex_set_precedence(curchar)
16251                     > regex_set_precedence(stacked_operator))
16252                 {
16253                     /* Here, the new operator has higher precedence than the
16254                      * stacked one.  This means we need to add the new one to
16255                      * the stack to await its rhs operand (and maybe more
16256                      * stuff).  We put it before the lhs operand, leaving
16257                      * untouched the stacked operator and everything below it
16258                      * */
16259                     lhs = av_pop(stack);
16260                     assert(IS_OPERAND(lhs));
16261 
16262                     av_push(stack, newSVuv(curchar));
16263                     av_push(stack, lhs);
16264                     break;
16265                 }
16266 
16267                 /* Here, the new operator has equal or lower precedence than
16268                  * what's already there.  This means the operation already
16269                  * there should be performed now, before the new one. */
16270 
16271                 rhs = av_pop(stack);
16272                 if (! IS_OPERAND(rhs)) {
16273 
16274                     /* This can happen when a ! is not followed by an operand,
16275                      * like in /(?[\t &!])/ */
16276                     goto bad_syntax;
16277                 }
16278 
16279                 lhs = av_pop(stack);
16280 
16281                 if (! IS_OPERAND(lhs)) {
16282 
16283                     /* This can happen when there is an empty (), like in
16284                      * /(?[[0]+()+])/ */
16285                     goto bad_syntax;
16286                 }
16287 
16288                 switch (stacked_operator) {
16289                     case '&':
16290                         _invlist_intersection(lhs, rhs, &rhs);
16291                         break;
16292 
16293                     case '|':
16294                     case '+':
16295                         _invlist_union(lhs, rhs, &rhs);
16296                         break;
16297 
16298                     case '-':
16299                         _invlist_subtract(lhs, rhs, &rhs);
16300                         break;
16301 
16302                     case '^':   /* The union minus the intersection */
16303                     {
16304                         SV* i = NULL;
16305                         SV* u = NULL;
16306 
16307                         _invlist_union(lhs, rhs, &u);
16308                         _invlist_intersection(lhs, rhs, &i);
16309                         _invlist_subtract(u, i, &rhs);
16310                         SvREFCNT_dec_NN(i);
16311                         SvREFCNT_dec_NN(u);
16312                         break;
16313                     }
16314                 }
16315                 SvREFCNT_dec(lhs);
16316 
16317                 /* Here, the higher precedence operation has been done, and the
16318                  * result is in 'rhs'.  We overwrite the stacked operator with
16319                  * the result.  Then we redo this code to either push the new
16320                  * operator onto the stack or perform any higher precedence
16321                  * stacked operation */
16322                 only_to_avoid_leaks = av_pop(stack);
16323                 SvREFCNT_dec(only_to_avoid_leaks);
16324                 av_push(stack, rhs);
16325                 goto redo_curchar;
16326 
16327             case '!':   /* Highest priority, right associative */
16328 
16329                 /* If what's already at the top of the stack is another '!",
16330                  * they just cancel each other out */
16331                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
16332                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16333                 {
16334                     only_to_avoid_leaks = av_pop(stack);
16335                     SvREFCNT_dec(only_to_avoid_leaks);
16336                 }
16337                 else { /* Otherwise, since it's right associative, just push
16338                           onto the stack */
16339                     av_push(stack, newSVuv(curchar));
16340                 }
16341                 break;
16342 
16343             default:
16344                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16345                 if (RExC_parse >= RExC_end) {
16346                     break;
16347                 }
16348                 vFAIL("Unexpected character");
16349 
16350           handle_operand:
16351 
16352             /* Here 'current' is the operand.  If something is already on the
16353              * stack, we have to check if it is a !.  But first, the code above
16354              * may have altered the stack in the time since we earlier set
16355              * 'top_index'.  */
16356 
16357             top_index = av_tindex_skip_len_mg(stack);
16358             if (top_index - fence >= 0) {
16359                 /* If the top entry on the stack is an operator, it had better
16360                  * be a '!', otherwise the entry below the top operand should
16361                  * be an operator */
16362                 top_ptr = av_fetch(stack, top_index, FALSE);
16363                 assert(top_ptr);
16364                 if (IS_OPERATOR(*top_ptr)) {
16365 
16366                     /* The only permissible operator at the top of the stack is
16367                      * '!', which is applied immediately to this operand. */
16368                     curchar = (char) SvUV(*top_ptr);
16369                     if (curchar != '!') {
16370                         SvREFCNT_dec(current);
16371                         vFAIL2("Unexpected binary operator '%c' with no "
16372                                 "preceding operand", curchar);
16373                     }
16374 
16375                     _invlist_invert(current);
16376 
16377                     only_to_avoid_leaks = av_pop(stack);
16378                     SvREFCNT_dec(only_to_avoid_leaks);
16379 
16380                     /* And we redo with the inverted operand.  This allows
16381                      * handling multiple ! in a row */
16382                     goto handle_operand;
16383                 }
16384                           /* Single operand is ok only for the non-binary ')'
16385                            * operator */
16386                 else if ((top_index - fence == 0 && curchar != ')')
16387                          || (top_index - fence > 0
16388                              && (! (stacked_ptr = av_fetch(stack,
16389                                                            top_index - 1,
16390                                                            FALSE))
16391                                  || IS_OPERAND(*stacked_ptr))))
16392                 {
16393                     SvREFCNT_dec(current);
16394                     vFAIL("Operand with no preceding operator");
16395                 }
16396             }
16397 
16398             /* Here there was nothing on the stack or the top element was
16399              * another operand.  Just add this new one */
16400             av_push(stack, current);
16401 
16402         } /* End of switch on next parse token */
16403 
16404         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16405     } /* End of loop parsing through the construct */
16406 
16407     vFAIL("Syntax error in (?[...])");
16408 
16409   done:
16410 
16411     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16412         if (RExC_parse < RExC_end) {
16413             RExC_parse++;
16414         }
16415 
16416         vFAIL("Unexpected ']' with no following ')' in (?[...");
16417     }
16418 
16419     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16420         vFAIL("Unmatched (");
16421     }
16422 
16423     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
16424         || ((final = av_pop(stack)) == NULL)
16425         || ! IS_OPERAND(final)
16426         || ! is_invlist(final)
16427         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
16428     {
16429       bad_syntax:
16430         SvREFCNT_dec(final);
16431         vFAIL("Incomplete expression within '(?[ ])'");
16432     }
16433 
16434     /* Here, 'final' is the resultant inversion list from evaluating the
16435      * expression.  Return it if so requested */
16436     if (return_invlist) {
16437         *return_invlist = final;
16438         return END;
16439     }
16440 
16441     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
16442      * expecting a string of ranges and individual code points */
16443     invlist_iterinit(final);
16444     result_string = newSVpvs("");
16445     while (invlist_iternext(final, &start, &end)) {
16446         if (start == end) {
16447             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16448         }
16449         else {
16450             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
16451                                                      start,          end);
16452         }
16453     }
16454 
16455     /* About to generate an ANYOF (or similar) node from the inversion list we
16456      * have calculated */
16457     save_parse = RExC_parse;
16458     RExC_parse = SvPV(result_string, len);
16459     save_end = RExC_end;
16460     RExC_end = RExC_parse + len;
16461     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16462 
16463     /* We turn off folding around the call, as the class we have constructed
16464      * already has all folding taken into consideration, and we don't want
16465      * regclass() to add to that */
16466     RExC_flags &= ~RXf_PMf_FOLD;
16467     /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16468      * folds are allowed.  */
16469     node = regclass(pRExC_state, flagp, depth+1,
16470                     FALSE, /* means parse the whole char class */
16471                     FALSE, /* don't allow multi-char folds */
16472                     TRUE, /* silence non-portable warnings.  The above may very
16473                              well have generated non-portable code points, but
16474                              they're valid on this machine */
16475                     FALSE, /* similarly, no need for strict */
16476                     FALSE, /* Require return to be an ANYOF */
16477                     NULL
16478                 );
16479 
16480     RESTORE_WARNINGS;
16481     RExC_parse = save_parse + 1;
16482     RExC_end = save_end;
16483     SvREFCNT_dec_NN(final);
16484     SvREFCNT_dec_NN(result_string);
16485 
16486     if (save_fold) {
16487         RExC_flags |= RXf_PMf_FOLD;
16488     }
16489 
16490     if (!node) {
16491         RETURN_FAIL_ON_RESTART(*flagp, flagp);
16492         goto regclass_failed;
16493     }
16494 
16495     /* Fix up the node type if we are in locale.  (We have pretended we are
16496      * under /u for the purposes of regclass(), as this construct will only
16497      * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
16498      * as to cause any warnings about bad locales to be output in regexec.c),
16499      * and add the flag that indicates to check if not in a UTF-8 locale.  The
16500      * reason we above forbid optimization into something other than an ANYOF
16501      * node is simply to minimize the number of code changes in regexec.c.
16502      * Otherwise we would have to create new EXACTish node types and deal with
16503      * them.  This decision could be revisited should this construct become
16504      * popular.
16505      *
16506      * (One might think we could look at the resulting ANYOF node and suppress
16507      * the flag if everything is above 255, as those would be UTF-8 only,
16508      * but this isn't true, as the components that led to that result could
16509      * have been locale-affected, and just happen to cancel each other out
16510      * under UTF-8 locales.) */
16511     if (in_locale) {
16512         set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16513 
16514         assert(OP(REGNODE_p(node)) == ANYOF);
16515 
16516         OP(REGNODE_p(node)) = ANYOFL;
16517         ANYOF_FLAGS(REGNODE_p(node))
16518                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16519     }
16520 
16521     nextchar(pRExC_state);
16522     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16523     return node;
16524 
16525   regclass_failed:
16526     FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
16527                                                                 (UV) *flagp);
16528 }
16529 
16530 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16531 
16532 STATIC void
S_dump_regex_sets_structures(pTHX_ RExC_state_t * pRExC_state,AV * stack,const IV fence,AV * fence_stack)16533 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16534                              AV * stack, const IV fence, AV * fence_stack)
16535 {   /* Dumps the stacks in handle_regex_sets() */
16536 
16537     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16538     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16539     SSize_t i;
16540 
16541     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16542 
16543     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16544 
16545     if (stack_top < 0) {
16546         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16547     }
16548     else {
16549         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16550         for (i = stack_top; i >= 0; i--) {
16551             SV ** element_ptr = av_fetch(stack, i, FALSE);
16552             if (! element_ptr) {
16553             }
16554 
16555             if (IS_OPERATOR(*element_ptr)) {
16556                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
16557                                             (int) i, (int) SvIV(*element_ptr));
16558             }
16559             else {
16560                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
16561                 sv_dump(*element_ptr);
16562             }
16563         }
16564     }
16565 
16566     if (fence_stack_top < 0) {
16567         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
16568     }
16569     else {
16570         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
16571         for (i = fence_stack_top; i >= 0; i--) {
16572             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
16573             if (! element_ptr) {
16574             }
16575 
16576             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
16577                                             (int) i, (int) SvIV(*element_ptr));
16578         }
16579     }
16580 }
16581 
16582 #endif
16583 
16584 #undef IS_OPERATOR
16585 #undef IS_OPERAND
16586 
16587 STATIC void
S_add_above_Latin1_folds(pTHX_ RExC_state_t * pRExC_state,const U8 cp,SV ** invlist)16588 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
16589 {
16590     /* This adds the Latin1/above-Latin1 folding rules.
16591      *
16592      * This should be called only for a Latin1-range code points, cp, which is
16593      * known to be involved in a simple fold with other code points above
16594      * Latin1.  It would give false results if /aa has been specified.
16595      * Multi-char folds are outside the scope of this, and must be handled
16596      * specially. */
16597 
16598     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
16599 
16600     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
16601 
16602     /* The rules that are valid for all Unicode versions are hard-coded in */
16603     switch (cp) {
16604         case 'k':
16605         case 'K':
16606           *invlist =
16607              add_cp_to_invlist(*invlist, KELVIN_SIGN);
16608             break;
16609         case 's':
16610         case 'S':
16611           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
16612             break;
16613         case MICRO_SIGN:
16614           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
16615           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
16616             break;
16617         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
16618         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
16619           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
16620             break;
16621         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
16622           *invlist = add_cp_to_invlist(*invlist,
16623                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
16624             break;
16625 
16626         default:    /* Other code points are checked against the data for the
16627                        current Unicode version */
16628           {
16629             Size_t folds_count;
16630             unsigned int first_fold;
16631             const unsigned int * remaining_folds;
16632             UV folded_cp;
16633 
16634             if (isASCII(cp)) {
16635                 folded_cp = toFOLD(cp);
16636             }
16637             else {
16638                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
16639                 Size_t dummy_len;
16640                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
16641             }
16642 
16643             if (folded_cp > 255) {
16644                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
16645             }
16646 
16647             folds_count = _inverse_folds(folded_cp, &first_fold,
16648                                                     &remaining_folds);
16649             if (folds_count == 0) {
16650 
16651                 /* Use deprecated warning to increase the chances of this being
16652                  * output */
16653                 ckWARN2reg_d(RExC_parse,
16654                         "Perl folding rules are not up-to-date for 0x%02X;"
16655                         " please use the perlbug utility to report;", cp);
16656             }
16657             else {
16658                 unsigned int i;
16659 
16660                 if (first_fold > 255) {
16661                     *invlist = add_cp_to_invlist(*invlist, first_fold);
16662                 }
16663                 for (i = 0; i < folds_count - 1; i++) {
16664                     if (remaining_folds[i] > 255) {
16665                         *invlist = add_cp_to_invlist(*invlist,
16666                                                     remaining_folds[i]);
16667                     }
16668                 }
16669             }
16670             break;
16671          }
16672     }
16673 }
16674 
16675 STATIC void
S_output_posix_warnings(pTHX_ RExC_state_t * pRExC_state,AV * posix_warnings)16676 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
16677 {
16678     /* Output the elements of the array given by '*posix_warnings' as REGEXP
16679      * warnings. */
16680 
16681     SV * msg;
16682     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
16683 
16684     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
16685 
16686     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
16687         return;
16688     }
16689 
16690     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
16691         if (first_is_fatal) {           /* Avoid leaking this */
16692             av_undef(posix_warnings);   /* This isn't necessary if the
16693                                             array is mortal, but is a
16694                                             fail-safe */
16695             (void) sv_2mortal(msg);
16696             PREPARE_TO_DIE;
16697         }
16698         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
16699         SvREFCNT_dec_NN(msg);
16700     }
16701 
16702     UPDATE_WARNINGS_LOC(RExC_parse);
16703 }
16704 
16705 STATIC AV *
S_add_multi_match(pTHX_ AV * multi_char_matches,SV * multi_string,const STRLEN cp_count)16706 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
16707 {
16708     /* This adds the string scalar <multi_string> to the array
16709      * <multi_char_matches>.  <multi_string> is known to have exactly
16710      * <cp_count> code points in it.  This is used when constructing a
16711      * bracketed character class and we find something that needs to match more
16712      * than a single character.
16713      *
16714      * <multi_char_matches> is actually an array of arrays.  Each top-level
16715      * element is an array that contains all the strings known so far that are
16716      * the same length.  And that length (in number of code points) is the same
16717      * as the index of the top-level array.  Hence, the [2] element is an
16718      * array, each element thereof is a string containing TWO code points;
16719      * while element [3] is for strings of THREE characters, and so on.  Since
16720      * this is for multi-char strings there can never be a [0] nor [1] element.
16721      *
16722      * When we rewrite the character class below, we will do so such that the
16723      * longest strings are written first, so that it prefers the longest
16724      * matching strings first.  This is done even if it turns out that any
16725      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
16726      * Christiansen has agreed that this is ok.  This makes the test for the
16727      * ligature 'ffi' come before the test for 'ff', for example */
16728 
16729     AV* this_array;
16730     AV** this_array_ptr;
16731 
16732     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
16733 
16734     if (! multi_char_matches) {
16735         multi_char_matches = newAV();
16736     }
16737 
16738     if (av_exists(multi_char_matches, cp_count)) {
16739         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
16740         this_array = *this_array_ptr;
16741     }
16742     else {
16743         this_array = newAV();
16744         av_store(multi_char_matches, cp_count,
16745                  (SV*) this_array);
16746     }
16747     av_push(this_array, multi_string);
16748 
16749     return multi_char_matches;
16750 }
16751 
16752 /* The names of properties whose definitions are not known at compile time are
16753  * stored in this SV, after a constant heading.  So if the length has been
16754  * changed since initialization, then there is a run-time definition. */
16755 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
16756                                         (SvCUR(listsv) != initial_listsv_len)
16757 
16758 /* There is a restricted set of white space characters that are legal when
16759  * ignoring white space in a bracketed character class.  This generates the
16760  * code to skip them.
16761  *
16762  * There is a line below that uses the same white space criteria but is outside
16763  * this macro.  Both here and there must use the same definition */
16764 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
16765     STMT_START {                                                        \
16766         if (do_skip) {                                                  \
16767             while (isBLANK_A(UCHARAT(p)))                               \
16768             {                                                           \
16769                 p++;                                                    \
16770             }                                                           \
16771         }                                                               \
16772     } STMT_END
16773 
16774 STATIC regnode_offset
S_regclass(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,U32 depth,const bool stop_at_1,bool allow_mutiple_chars,const bool silence_non_portable,const bool strict,bool optimizable,SV ** ret_invlist)16775 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
16776                  const bool stop_at_1,  /* Just parse the next thing, don't
16777                                            look for a full character class */
16778                  bool allow_mutiple_chars,
16779                  const bool silence_non_portable,   /* Don't output warnings
16780                                                        about too large
16781                                                        characters */
16782                  const bool strict,
16783                  bool optimizable,                  /* ? Allow a non-ANYOF return
16784                                                        node */
16785                  SV** ret_invlist  /* Return an inversion list, not a node */
16786           )
16787 {
16788     /* parse a bracketed class specification.  Most of these will produce an
16789      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
16790      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
16791      * under /i with multi-character folds: it will be rewritten following the
16792      * paradigm of this example, where the <multi-fold>s are characters which
16793      * fold to multiple character sequences:
16794      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
16795      * gets effectively rewritten as:
16796      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
16797      * reg() gets called (recursively) on the rewritten version, and this
16798      * function will return what it constructs.  (Actually the <multi-fold>s
16799      * aren't physically removed from the [abcdefghi], it's just that they are
16800      * ignored in the recursion by means of a flag:
16801      * <RExC_in_multi_char_class>.)
16802      *
16803      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
16804      * characters, with the corresponding bit set if that character is in the
16805      * list.  For characters above this, an inversion list is used.  There
16806      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
16807      * determinable at compile time
16808      *
16809      * On success, returns the offset at which any next node should be placed
16810      * into the regex engine program being compiled.
16811      *
16812      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
16813      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
16814      * UTF-8
16815      */
16816 
16817     dVAR;
16818     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
16819     IV range = 0;
16820     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
16821     regnode_offset ret = -1;    /* Initialized to an illegal value */
16822     STRLEN numlen;
16823     int namedclass = OOB_NAMEDCLASS;
16824     char *rangebegin = NULL;
16825     SV *listsv = NULL;      /* List of \p{user-defined} whose definitions
16826                                aren't available at the time this was called */
16827     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
16828 				      than just initialized.  */
16829     SV* properties = NULL;    /* Code points that match \p{} \P{} */
16830     SV* posixes = NULL;     /* Code points that match classes like [:word:],
16831                                extended beyond the Latin1 range.  These have to
16832                                be kept separate from other code points for much
16833                                of this function because their handling  is
16834                                different under /i, and for most classes under
16835                                /d as well */
16836     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
16837                                separate for a while from the non-complemented
16838                                versions because of complications with /d
16839                                matching */
16840     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
16841                                   treated more simply than the general case,
16842                                   leading to less compilation and execution
16843                                   work */
16844     UV element_count = 0;   /* Number of distinct elements in the class.
16845 			       Optimizations may be possible if this is tiny */
16846     AV * multi_char_matches = NULL; /* Code points that fold to more than one
16847                                        character; used under /i */
16848     UV n;
16849     char * stop_ptr = RExC_end;    /* where to stop parsing */
16850 
16851     /* ignore unescaped whitespace? */
16852     const bool skip_white = cBOOL(   ret_invlist
16853                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
16854 
16855     /* inversion list of code points this node matches only when the target
16856      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
16857      * /d) */
16858     SV* upper_latin1_only_utf8_matches = NULL;
16859 
16860     /* Inversion list of code points this node matches regardless of things
16861      * like locale, folding, utf8ness of the target string */
16862     SV* cp_list = NULL;
16863 
16864     /* Like cp_list, but code points on this list need to be checked for things
16865      * that fold to/from them under /i */
16866     SV* cp_foldable_list = NULL;
16867 
16868     /* Like cp_list, but code points on this list are valid only when the
16869      * runtime locale is UTF-8 */
16870     SV* only_utf8_locale_list = NULL;
16871 
16872     /* In a range, if one of the endpoints is non-character-set portable,
16873      * meaning that it hard-codes a code point that may mean a different
16874      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
16875      * mnemonic '\t' which each mean the same character no matter which
16876      * character set the platform is on. */
16877     unsigned int non_portable_endpoint = 0;
16878 
16879     /* Is the range unicode? which means on a platform that isn't 1-1 native
16880      * to Unicode (i.e. non-ASCII), each code point in it should be considered
16881      * to be a Unicode value.  */
16882     bool unicode_range = FALSE;
16883     bool invert = FALSE;    /* Is this class to be complemented */
16884 
16885     bool warn_super = ALWAYS_WARN_SUPER;
16886 
16887     const char * orig_parse = RExC_parse;
16888 
16889     /* This variable is used to mark where the end in the input is of something
16890      * that looks like a POSIX construct but isn't.  During the parse, when
16891      * something looks like it could be such a construct is encountered, it is
16892      * checked for being one, but not if we've already checked this area of the
16893      * input.  Only after this position is reached do we check again */
16894     char *not_posix_region_end = RExC_parse - 1;
16895 
16896     AV* posix_warnings = NULL;
16897     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
16898     U8 op = END;    /* The returned node-type, initialized to an impossible
16899                        one.  */
16900     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
16901     U32 posixl = 0;       /* bit field of posix classes matched under /l */
16902 
16903 
16904 /* Flags as to what things aren't knowable until runtime.  (Note that these are
16905  * mutually exclusive.) */
16906 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
16907                                             haven't been defined as of yet */
16908 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
16909                                             UTF-8 or not */
16910 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
16911                                             what gets folded */
16912     U32 has_runtime_dependency = 0;     /* OR of the above flags */
16913 
16914     GET_RE_DEBUG_FLAGS_DECL;
16915 
16916     PERL_ARGS_ASSERT_REGCLASS;
16917 #ifndef DEBUGGING
16918     PERL_UNUSED_ARG(depth);
16919 #endif
16920 
16921 
16922     /* If wants an inversion list returned, we can't optimize to something
16923      * else. */
16924     if (ret_invlist) {
16925         optimizable = FALSE;
16926     }
16927 
16928     DEBUG_PARSE("clas");
16929 
16930 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
16931     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
16932                                    && UNICODE_DOT_DOT_VERSION == 0)
16933     allow_mutiple_chars = FALSE;
16934 #endif
16935 
16936     /* We include the /i status at the beginning of this so that we can
16937      * know it at runtime */
16938     listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
16939     initial_listsv_len = SvCUR(listsv);
16940     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
16941 
16942     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16943 
16944     assert(RExC_parse <= RExC_end);
16945 
16946     if (UCHARAT(RExC_parse) == '^') {	/* Complement the class */
16947 	RExC_parse++;
16948         invert = TRUE;
16949         allow_mutiple_chars = FALSE;
16950         MARK_NAUGHTY(1);
16951         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16952     }
16953 
16954     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
16955     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
16956         int maybe_class = handle_possible_posix(pRExC_state,
16957                                                 RExC_parse,
16958                                                 &not_posix_region_end,
16959                                                 NULL,
16960                                                 TRUE /* checking only */);
16961         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
16962             ckWARN4reg(not_posix_region_end,
16963                     "POSIX syntax [%c %c] belongs inside character classes%s",
16964                     *RExC_parse, *RExC_parse,
16965                     (maybe_class == OOB_NAMEDCLASS)
16966                     ? ((POSIXCC_NOTYET(*RExC_parse))
16967                         ? " (but this one isn't implemented)"
16968                         : " (but this one isn't fully valid)")
16969                     : ""
16970                     );
16971         }
16972     }
16973 
16974     /* If the caller wants us to just parse a single element, accomplish this
16975      * by faking the loop ending condition */
16976     if (stop_at_1 && RExC_end > RExC_parse) {
16977         stop_ptr = RExC_parse + 1;
16978     }
16979 
16980     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
16981     if (UCHARAT(RExC_parse) == ']')
16982 	goto charclassloop;
16983 
16984     while (1) {
16985 
16986         if (   posix_warnings
16987             && av_tindex_skip_len_mg(posix_warnings) >= 0
16988             && RExC_parse > not_posix_region_end)
16989         {
16990             /* Warnings about posix class issues are considered tentative until
16991              * we are far enough along in the parse that we can no longer
16992              * change our mind, at which point we output them.  This is done
16993              * each time through the loop so that a later class won't zap them
16994              * before they have been dealt with. */
16995             output_posix_warnings(pRExC_state, posix_warnings);
16996         }
16997 
16998         if  (RExC_parse >= stop_ptr) {
16999             break;
17000         }
17001 
17002         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17003 
17004         if  (UCHARAT(RExC_parse) == ']') {
17005             break;
17006         }
17007 
17008       charclassloop:
17009 
17010 	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17011         save_value = value;
17012         save_prevvalue = prevvalue;
17013 
17014 	if (!range) {
17015 	    rangebegin = RExC_parse;
17016 	    element_count++;
17017             non_portable_endpoint = 0;
17018 	}
17019 	if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17020 	    value = utf8n_to_uvchr((U8*)RExC_parse,
17021 				   RExC_end - RExC_parse,
17022 				   &numlen, UTF8_ALLOW_DEFAULT);
17023 	    RExC_parse += numlen;
17024 	}
17025 	else
17026 	    value = UCHARAT(RExC_parse++);
17027 
17028         if (value == '[') {
17029             char * posix_class_end;
17030             namedclass = handle_possible_posix(pRExC_state,
17031                                                RExC_parse,
17032                                                &posix_class_end,
17033                                                do_posix_warnings ? &posix_warnings : NULL,
17034                                                FALSE    /* die if error */);
17035             if (namedclass > OOB_NAMEDCLASS) {
17036 
17037                 /* If there was an earlier attempt to parse this particular
17038                  * posix class, and it failed, it was a false alarm, as this
17039                  * successful one proves */
17040                 if (   posix_warnings
17041                     && av_tindex_skip_len_mg(posix_warnings) >= 0
17042                     && not_posix_region_end >= RExC_parse
17043                     && not_posix_region_end <= posix_class_end)
17044                 {
17045                     av_undef(posix_warnings);
17046                 }
17047 
17048                 RExC_parse = posix_class_end;
17049             }
17050             else if (namedclass == OOB_NAMEDCLASS) {
17051                 not_posix_region_end = posix_class_end;
17052             }
17053             else {
17054                 namedclass = OOB_NAMEDCLASS;
17055             }
17056         }
17057         else if (   RExC_parse - 1 > not_posix_region_end
17058                  && MAYBE_POSIXCC(value))
17059         {
17060             (void) handle_possible_posix(
17061                         pRExC_state,
17062                         RExC_parse - 1,  /* -1 because parse has already been
17063                                             advanced */
17064                         &not_posix_region_end,
17065                         do_posix_warnings ? &posix_warnings : NULL,
17066                         TRUE /* checking only */);
17067         }
17068         else if (  strict && ! skip_white
17069                  && (   _generic_isCC(value, _CC_VERTSPACE)
17070                      || is_VERTWS_cp_high(value)))
17071         {
17072             vFAIL("Literal vertical space in [] is illegal except under /x");
17073         }
17074         else if (value == '\\') {
17075             /* Is a backslash; get the code point of the char after it */
17076 
17077             if (RExC_parse >= RExC_end) {
17078                 vFAIL("Unmatched [");
17079             }
17080 
17081 	    if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17082 		value = utf8n_to_uvchr((U8*)RExC_parse,
17083 				   RExC_end - RExC_parse,
17084 				   &numlen, UTF8_ALLOW_DEFAULT);
17085 		RExC_parse += numlen;
17086 	    }
17087 	    else
17088 		value = UCHARAT(RExC_parse++);
17089 
17090 	    /* Some compilers cannot handle switching on 64-bit integer
17091 	     * values, therefore value cannot be an UV.  Yes, this will
17092 	     * be a problem later if we want switch on Unicode.
17093 	     * A similar issue a little bit later when switching on
17094 	     * namedclass. --jhi */
17095 
17096             /* If the \ is escaping white space when white space is being
17097              * skipped, it means that that white space is wanted literally, and
17098              * is already in 'value'.  Otherwise, need to translate the escape
17099              * into what it signifies. */
17100             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17101 
17102 	    case 'w':	namedclass = ANYOF_WORDCHAR;	break;
17103 	    case 'W':	namedclass = ANYOF_NWORDCHAR;	break;
17104 	    case 's':	namedclass = ANYOF_SPACE;	break;
17105 	    case 'S':	namedclass = ANYOF_NSPACE;	break;
17106 	    case 'd':	namedclass = ANYOF_DIGIT;	break;
17107 	    case 'D':	namedclass = ANYOF_NDIGIT;	break;
17108 	    case 'v':	namedclass = ANYOF_VERTWS;	break;
17109 	    case 'V':	namedclass = ANYOF_NVERTWS;	break;
17110 	    case 'h':	namedclass = ANYOF_HORIZWS;	break;
17111 	    case 'H':	namedclass = ANYOF_NHORIZWS;	break;
17112             case 'N':  /* Handle \N{NAME} in class */
17113                 {
17114                     const char * const backslash_N_beg = RExC_parse - 2;
17115                     int cp_count;
17116 
17117                     if (! grok_bslash_N(pRExC_state,
17118                                         NULL,      /* No regnode */
17119                                         &value,    /* Yes single value */
17120                                         &cp_count, /* Multiple code pt count */
17121                                         flagp,
17122                                         strict,
17123                                         depth)
17124                     ) {
17125 
17126                         if (*flagp & NEED_UTF8)
17127                             FAIL("panic: grok_bslash_N set NEED_UTF8");
17128 
17129                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17130 
17131                         if (cp_count < 0) {
17132                             vFAIL("\\N in a character class must be a named character: \\N{...}");
17133                         }
17134                         else if (cp_count == 0) {
17135                             ckWARNreg(RExC_parse,
17136                               "Ignoring zero length \\N{} in character class");
17137                         }
17138                         else { /* cp_count > 1 */
17139                             assert(cp_count > 1);
17140                             if (! RExC_in_multi_char_class) {
17141                                 if ( ! allow_mutiple_chars
17142                                     || invert
17143                                     || range
17144                                     || *RExC_parse == '-')
17145                                 {
17146                                     if (strict) {
17147                                         RExC_parse--;
17148                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
17149                                     }
17150                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17151                                     break; /* <value> contains the first code
17152                                               point. Drop out of the switch to
17153                                               process it */
17154                                 }
17155                                 else {
17156                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
17157                                                  RExC_parse - backslash_N_beg);
17158                                     multi_char_matches
17159                                         = add_multi_match(multi_char_matches,
17160                                                           multi_char_N,
17161                                                           cp_count);
17162                                 }
17163                             }
17164                         } /* End of cp_count != 1 */
17165 
17166                         /* This element should not be processed further in this
17167                          * class */
17168                         element_count--;
17169                         value = save_value;
17170                         prevvalue = save_prevvalue;
17171                         continue;   /* Back to top of loop to get next char */
17172                     }
17173 
17174                     /* Here, is a single code point, and <value> contains it */
17175                     unicode_range = TRUE;   /* \N{} are Unicode */
17176                 }
17177                 break;
17178 	    case 'p':
17179 	    case 'P':
17180 		{
17181 		char *e;
17182 
17183 		/* \p means they want Unicode semantics */
17184 		REQUIRE_UNI_RULES(flagp, 0);
17185 
17186 		if (RExC_parse >= RExC_end)
17187 		    vFAIL2("Empty \\%c", (U8)value);
17188 		if (*RExC_parse == '{') {
17189 		    const U8 c = (U8)value;
17190 		    e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17191                     if (!e) {
17192                         RExC_parse++;
17193                         vFAIL2("Missing right brace on \\%c{}", c);
17194                     }
17195 
17196                     RExC_parse++;
17197 
17198                     /* White space is allowed adjacent to the braces and after
17199                      * any '^', even when not under /x */
17200                     while (isSPACE(*RExC_parse)) {
17201                          RExC_parse++;
17202 		    }
17203 
17204 		    if (UCHARAT(RExC_parse) == '^') {
17205 
17206                         /* toggle.  (The rhs xor gets the single bit that
17207                          * differs between P and p; the other xor inverts just
17208                          * that bit) */
17209                         value ^= 'P' ^ 'p';
17210 
17211                         RExC_parse++;
17212                         while (isSPACE(*RExC_parse)) {
17213                             RExC_parse++;
17214                         }
17215                     }
17216 
17217                     if (e == RExC_parse)
17218                         vFAIL2("Empty \\%c{}", c);
17219 
17220 		    n = e - RExC_parse;
17221 		    while (isSPACE(*(RExC_parse + n - 1)))
17222 		        n--;
17223 
17224 		}   /* The \p isn't immediately followed by a '{' */
17225 		else if (! isALPHA(*RExC_parse)) {
17226                     RExC_parse += (UTF)
17227                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17228                                   : 1;
17229                     vFAIL2("Character following \\%c must be '{' or a "
17230                            "single-character Unicode property name",
17231                            (U8) value);
17232                 }
17233                 else {
17234 		    e = RExC_parse;
17235 		    n = 1;
17236 		}
17237 		{
17238                     char* name = RExC_parse;
17239 
17240                     /* Any message returned about expanding the definition */
17241                     SV* msg = newSVpvs_flags("", SVs_TEMP);
17242 
17243                     /* If set TRUE, the property is user-defined as opposed to
17244                      * official Unicode */
17245                     bool user_defined = FALSE;
17246 
17247                     SV * prop_definition = parse_uniprop_string(
17248                                             name, n, UTF, FOLD,
17249                                             FALSE, /* This is compile-time */
17250 
17251                                             /* We can't defer this defn when
17252                                              * the full result is required in
17253                                              * this call */
17254                                             ! cBOOL(ret_invlist),
17255 
17256                                             &user_defined,
17257                                             msg,
17258                                             0 /* Base level */
17259                                            );
17260                     if (SvCUR(msg)) {   /* Assumes any error causes a msg */
17261                         assert(prop_definition == NULL);
17262                         RExC_parse = e + 1;
17263                         if (SvUTF8(msg)) {  /* msg being UTF-8 makes the whole
17264                                                thing so, or else the display is
17265                                                mojibake */
17266                             RExC_utf8 = TRUE;
17267                         }
17268 			/* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17269                         vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17270                                     SvCUR(msg), SvPVX(msg)));
17271                     }
17272 
17273                     if (! is_invlist(prop_definition)) {
17274 
17275                         /* Here, the definition isn't known, so we have gotten
17276                          * returned a string that will be evaluated if and when
17277                          * encountered at runtime.  We add it to the list of
17278                          * such properties, along with whether it should be
17279                          * complemented or not */
17280                         if (value == 'P') {
17281                             sv_catpvs(listsv, "!");
17282                         }
17283                         else {
17284                             sv_catpvs(listsv, "+");
17285                         }
17286                         sv_catsv(listsv, prop_definition);
17287 
17288                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17289 
17290                         /* We don't know yet what this matches, so have to flag
17291                          * it */
17292                         anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17293                     }
17294                     else {
17295                         assert (prop_definition && is_invlist(prop_definition));
17296 
17297                         /* Here we do have the complete property definition
17298                          *
17299                          * Temporary workaround for [perl #133136].  For this
17300                          * precise input that is in the .t that is failing,
17301                          * load utf8.pm, which is what the test wants, so that
17302                          * that .t passes */
17303                         if (     memEQs(RExC_start, e + 1 - RExC_start,
17304                                         "foo\\p{Alnum}")
17305                             && ! hv_common(GvHVn(PL_incgv),
17306                                            NULL,
17307                                            "utf8.pm", sizeof("utf8.pm") - 1,
17308                                            0, HV_FETCH_ISEXISTS, NULL, 0))
17309                         {
17310                             require_pv("utf8.pm");
17311                         }
17312 
17313                         if (! user_defined &&
17314                             /* We warn on matching an above-Unicode code point
17315                              * if the match would return true, except don't
17316                              * warn for \p{All}, which has exactly one element
17317                              * = 0 */
17318                             (_invlist_contains_cp(prop_definition, 0x110000)
17319                                 && (! (_invlist_len(prop_definition) == 1
17320                                        && *invlist_array(prop_definition) == 0))))
17321                         {
17322                             warn_super = TRUE;
17323                         }
17324 
17325                         /* Invert if asking for the complement */
17326                         if (value == 'P') {
17327 			    _invlist_union_complement_2nd(properties,
17328                                                           prop_definition,
17329                                                           &properties);
17330                         }
17331                         else {
17332                             _invlist_union(properties, prop_definition, &properties);
17333 			}
17334                     }
17335                 }
17336 
17337 		RExC_parse = e + 1;
17338                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
17339                                                 named */
17340 		}
17341 		break;
17342 	    case 'n':	value = '\n';			break;
17343 	    case 'r':	value = '\r';			break;
17344 	    case 't':	value = '\t';			break;
17345 	    case 'f':	value = '\f';			break;
17346 	    case 'b':	value = '\b';			break;
17347 	    case 'e':	value = ESC_NATIVE;             break;
17348 	    case 'a':	value = '\a';                   break;
17349 	    case 'o':
17350 		RExC_parse--;	/* function expects to be pointed at the 'o' */
17351 		{
17352 		    const char* error_msg;
17353 		    bool valid = grok_bslash_o(&RExC_parse,
17354                                                RExC_end,
17355 					       &value,
17356 					       &error_msg,
17357                                                TO_OUTPUT_WARNINGS(RExC_parse),
17358                                                strict,
17359                                                silence_non_portable,
17360                                                UTF);
17361 		    if (! valid) {
17362 			vFAIL(error_msg);
17363 		    }
17364                     UPDATE_WARNINGS_LOC(RExC_parse - 1);
17365 		}
17366                 non_portable_endpoint++;
17367 		break;
17368 	    case 'x':
17369 		RExC_parse--;	/* function expects to be pointed at the 'x' */
17370 		{
17371 		    const char* error_msg;
17372 		    bool valid = grok_bslash_x(&RExC_parse,
17373                                                RExC_end,
17374 					       &value,
17375 					       &error_msg,
17376 					       TO_OUTPUT_WARNINGS(RExC_parse),
17377                                                strict,
17378                                                silence_non_portable,
17379                                                UTF);
17380                     if (! valid) {
17381 			vFAIL(error_msg);
17382 		    }
17383                     UPDATE_WARNINGS_LOC(RExC_parse - 1);
17384 		}
17385                 non_portable_endpoint++;
17386 		break;
17387 	    case 'c':
17388 		value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
17389                 UPDATE_WARNINGS_LOC(RExC_parse);
17390 		RExC_parse++;
17391                 non_portable_endpoint++;
17392 		break;
17393 	    case '0': case '1': case '2': case '3': case '4':
17394 	    case '5': case '6': case '7':
17395 		{
17396 		    /* Take 1-3 octal digits */
17397 		    I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
17398                     numlen = (strict) ? 4 : 3;
17399                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17400 		    RExC_parse += numlen;
17401                     if (numlen != 3) {
17402                         if (strict) {
17403                             RExC_parse += (UTF)
17404                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17405                                           : 1;
17406                             vFAIL("Need exactly 3 octal digits");
17407                         }
17408                         else if (   numlen < 3 /* like \08, \178 */
17409                                  && RExC_parse < RExC_end
17410                                  && isDIGIT(*RExC_parse)
17411                                  && ckWARN(WARN_REGEXP))
17412                         {
17413                             reg_warn_non_literal_string(
17414                                  RExC_parse + 1,
17415                                  form_short_octal_warning(RExC_parse, numlen));
17416                         }
17417                     }
17418                     non_portable_endpoint++;
17419 		    break;
17420 		}
17421 	    default:
17422 		/* Allow \_ to not give an error */
17423 		if (isWORDCHAR(value) && value != '_') {
17424                     if (strict) {
17425                         vFAIL2("Unrecognized escape \\%c in character class",
17426                                (int)value);
17427                     }
17428                     else {
17429                         ckWARN2reg(RExC_parse,
17430                             "Unrecognized escape \\%c in character class passed through",
17431                             (int)value);
17432                     }
17433 		}
17434 		break;
17435 	    }   /* End of switch on char following backslash */
17436 	} /* end of handling backslash escape sequences */
17437 
17438         /* Here, we have the current token in 'value' */
17439 
17440 	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
17441             U8 classnum;
17442 
17443 	    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
17444 	     * literal, as is the character that began the false range, i.e.
17445 	     * the 'a' in the examples */
17446 	    if (range) {
17447                 const int w = (RExC_parse >= rangebegin)
17448                                 ? RExC_parse - rangebegin
17449                                 : 0;
17450                 if (strict) {
17451                     vFAIL2utf8f(
17452                         "False [] range \"%" UTF8f "\"",
17453                         UTF8fARG(UTF, w, rangebegin));
17454                 }
17455                 else {
17456                     ckWARN2reg(RExC_parse,
17457                         "False [] range \"%" UTF8f "\"",
17458                         UTF8fARG(UTF, w, rangebegin));
17459                     cp_list = add_cp_to_invlist(cp_list, '-');
17460                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
17461                                                             prevvalue);
17462                 }
17463 
17464 		range = 0; /* this was not a true range */
17465                 element_count += 2; /* So counts for three values */
17466 	    }
17467 
17468             classnum = namedclass_to_classnum(namedclass);
17469 
17470 	    if (LOC && namedclass < ANYOF_POSIXL_MAX
17471 #ifndef HAS_ISASCII
17472                 && classnum != _CC_ASCII
17473 #endif
17474             ) {
17475                 SV* scratch_list = NULL;
17476 
17477                 /* What the Posix classes (like \w, [:space:]) match isn't
17478                  * generally knowable under locale until actual match time.  A
17479                  * special node is used for these which has extra space for a
17480                  * bitmap, with a bit reserved for each named class that is to
17481                  * be matched against.  (This isn't needed for \p{} and
17482                  * pseudo-classes, as they are not affected by locale, and
17483                  * hence are dealt with separately.)  However, if a named class
17484                  * and its complement are both present, then it matches
17485                  * everything, and there is no runtime dependency.  Odd numbers
17486                  * are the complements of the next lower number, so xor works.
17487                  * (Note that something like [\w\D] should match everything,
17488                  * because \d should be a proper subset of \w.  But rather than
17489                  * trust that the locale is well behaved, we leave this to
17490                  * runtime to sort out) */
17491                 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
17492                     cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
17493                     POSIXL_ZERO(posixl);
17494                     has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
17495                     anyof_flags &= ~ANYOF_MATCHES_POSIXL;
17496                     continue;   /* We could ignore the rest of the class, but
17497                                    best to parse it for any errors */
17498                 }
17499                 else { /* Here, isn't the complement of any already parsed
17500                           class */
17501                     POSIXL_SET(posixl, namedclass);
17502                     has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
17503                     anyof_flags |= ANYOF_MATCHES_POSIXL;
17504 
17505                     /* The above-Latin1 characters are not subject to locale
17506                      * rules.  Just add them to the unconditionally-matched
17507                      * list */
17508 
17509                     /* Get the list of the above-Latin1 code points this
17510                      * matches */
17511                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
17512                                             PL_XPosix_ptrs[classnum],
17513 
17514                                             /* Odd numbers are complements,
17515                                              * like NDIGIT, NASCII, ... */
17516                                             namedclass % 2 != 0,
17517                                             &scratch_list);
17518                     /* Checking if 'cp_list' is NULL first saves an extra
17519                      * clone.  Its reference count will be decremented at the
17520                      * next union, etc, or if this is the only instance, at the
17521                      * end of the routine */
17522                     if (! cp_list) {
17523                         cp_list = scratch_list;
17524                     }
17525                     else {
17526                         _invlist_union(cp_list, scratch_list, &cp_list);
17527                         SvREFCNT_dec_NN(scratch_list);
17528                     }
17529                     continue;   /* Go get next character */
17530                 }
17531             }
17532             else {
17533 
17534                 /* Here, is not /l, or is a POSIX class for which /l doesn't
17535                  * matter (or is a Unicode property, which is skipped here). */
17536                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
17537                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
17538 
17539                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
17540                          * nor /l make a difference in what these match,
17541                          * therefore we just add what they match to cp_list. */
17542                         if (classnum != _CC_VERTSPACE) {
17543                             assert(   namedclass == ANYOF_HORIZWS
17544                                    || namedclass == ANYOF_NHORIZWS);
17545 
17546                             /* It turns out that \h is just a synonym for
17547                              * XPosixBlank */
17548                             classnum = _CC_BLANK;
17549                         }
17550 
17551                         _invlist_union_maybe_complement_2nd(
17552                                 cp_list,
17553                                 PL_XPosix_ptrs[classnum],
17554                                 namedclass % 2 != 0,    /* Complement if odd
17555                                                           (NHORIZWS, NVERTWS)
17556                                                         */
17557                                 &cp_list);
17558                     }
17559                 }
17560                 else if (   AT_LEAST_UNI_SEMANTICS
17561                          || classnum == _CC_ASCII
17562                          || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
17563                                                    || classnum == _CC_XDIGIT)))
17564                 {
17565                     /* We usually have to worry about /d affecting what POSIX
17566                      * classes match, with special code needed because we won't
17567                      * know until runtime what all matches.  But there is no
17568                      * extra work needed under /u and /a; and [:ascii:] is
17569                      * unaffected by /d; and :digit: and :xdigit: don't have
17570                      * runtime differences under /d.  So we can special case
17571                      * these, and avoid some extra work below, and at runtime.
17572                      * */
17573                     _invlist_union_maybe_complement_2nd(
17574                                                      simple_posixes,
17575                                                       ((AT_LEAST_ASCII_RESTRICTED)
17576                                                        ? PL_Posix_ptrs[classnum]
17577                                                        : PL_XPosix_ptrs[classnum]),
17578                                                      namedclass % 2 != 0,
17579                                                      &simple_posixes);
17580                 }
17581                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
17582                            complement and use nposixes */
17583                     SV** posixes_ptr = namedclass % 2 == 0
17584                                        ? &posixes
17585                                        : &nposixes;
17586                     _invlist_union_maybe_complement_2nd(
17587                                                      *posixes_ptr,
17588                                                      PL_XPosix_ptrs[classnum],
17589                                                      namedclass % 2 != 0,
17590                                                      posixes_ptr);
17591                 }
17592 	    }
17593 	} /* end of namedclass \blah */
17594 
17595         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17596 
17597         /* If 'range' is set, 'value' is the ending of a range--check its
17598          * validity.  (If value isn't a single code point in the case of a
17599          * range, we should have figured that out above in the code that
17600          * catches false ranges).  Later, we will handle each individual code
17601          * point in the range.  If 'range' isn't set, this could be the
17602          * beginning of a range, so check for that by looking ahead to see if
17603          * the next real character to be processed is the range indicator--the
17604          * minus sign */
17605 
17606 	if (range) {
17607 #ifdef EBCDIC
17608             /* For unicode ranges, we have to test that the Unicode as opposed
17609              * to the native values are not decreasing.  (Above 255, there is
17610              * no difference between native and Unicode) */
17611 	    if (unicode_range && prevvalue < 255 && value < 255) {
17612                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
17613                     goto backwards_range;
17614                 }
17615             }
17616             else
17617 #endif
17618 	    if (prevvalue > value) /* b-a */ {
17619 		int w;
17620 #ifdef EBCDIC
17621               backwards_range:
17622 #endif
17623                 w = RExC_parse - rangebegin;
17624                 vFAIL2utf8f(
17625                     "Invalid [] range \"%" UTF8f "\"",
17626                     UTF8fARG(UTF, w, rangebegin));
17627                 NOT_REACHED; /* NOTREACHED */
17628 	    }
17629 	}
17630 	else {
17631             prevvalue = value; /* save the beginning of the potential range */
17632             if (! stop_at_1     /* Can't be a range if parsing just one thing */
17633                 && *RExC_parse == '-')
17634             {
17635                 char* next_char_ptr = RExC_parse + 1;
17636 
17637                 /* Get the next real char after the '-' */
17638                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
17639 
17640                 /* If the '-' is at the end of the class (just before the ']',
17641                  * it is a literal minus; otherwise it is a range */
17642                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
17643                     RExC_parse = next_char_ptr;
17644 
17645                     /* a bad range like \w-, [:word:]- ? */
17646                     if (namedclass > OOB_NAMEDCLASS) {
17647                         if (strict || ckWARN(WARN_REGEXP)) {
17648                             const int w = RExC_parse >= rangebegin
17649                                           ?  RExC_parse - rangebegin
17650                                           : 0;
17651                             if (strict) {
17652                                 vFAIL4("False [] range \"%*.*s\"",
17653                                     w, w, rangebegin);
17654                             }
17655                             else {
17656                                 vWARN4(RExC_parse,
17657                                     "False [] range \"%*.*s\"",
17658                                     w, w, rangebegin);
17659                             }
17660                         }
17661                         cp_list = add_cp_to_invlist(cp_list, '-');
17662                         element_count++;
17663                     } else
17664                         range = 1;	/* yeah, it's a range! */
17665                     continue;	/* but do it the next time */
17666                 }
17667 	    }
17668 	}
17669 
17670         if (namedclass > OOB_NAMEDCLASS) {
17671             continue;
17672         }
17673 
17674         /* Here, we have a single value this time through the loop, and
17675          * <prevvalue> is the beginning of the range, if any; or <value> if
17676          * not. */
17677 
17678 	/* non-Latin1 code point implies unicode semantics. */
17679 	if (value > 255) {
17680             REQUIRE_UNI_RULES(flagp, 0);
17681 	}
17682 
17683         /* Ready to process either the single value, or the completed range.
17684          * For single-valued non-inverted ranges, we consider the possibility
17685          * of multi-char folds.  (We made a conscious decision to not do this
17686          * for the other cases because it can often lead to non-intuitive
17687          * results.  For example, you have the peculiar case that:
17688          *  "s s" =~ /^[^\xDF]+$/i => Y
17689          *  "ss"  =~ /^[^\xDF]+$/i => N
17690          *
17691          * See [perl #89750] */
17692         if (FOLD && allow_mutiple_chars && value == prevvalue) {
17693             if (    value == LATIN_SMALL_LETTER_SHARP_S
17694                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
17695                                                         value)))
17696             {
17697                 /* Here <value> is indeed a multi-char fold.  Get what it is */
17698 
17699                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17700                 STRLEN foldlen;
17701 
17702                 UV folded = _to_uni_fold_flags(
17703                                 value,
17704                                 foldbuf,
17705                                 &foldlen,
17706                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
17707                                                    ? FOLD_FLAGS_NOMIX_ASCII
17708                                                    : 0)
17709                                 );
17710 
17711                 /* Here, <folded> should be the first character of the
17712                  * multi-char fold of <value>, with <foldbuf> containing the
17713                  * whole thing.  But, if this fold is not allowed (because of
17714                  * the flags), <fold> will be the same as <value>, and should
17715                  * be processed like any other character, so skip the special
17716                  * handling */
17717                 if (folded != value) {
17718 
17719                     /* Skip if we are recursed, currently parsing the class
17720                      * again.  Otherwise add this character to the list of
17721                      * multi-char folds. */
17722                     if (! RExC_in_multi_char_class) {
17723                         STRLEN cp_count = utf8_length(foldbuf,
17724                                                       foldbuf + foldlen);
17725                         SV* multi_fold = sv_2mortal(newSVpvs(""));
17726 
17727                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
17728 
17729                         multi_char_matches
17730                                         = add_multi_match(multi_char_matches,
17731                                                           multi_fold,
17732                                                           cp_count);
17733 
17734                     }
17735 
17736                     /* This element should not be processed further in this
17737                      * class */
17738                     element_count--;
17739                     value = save_value;
17740                     prevvalue = save_prevvalue;
17741                     continue;
17742                 }
17743             }
17744         }
17745 
17746         if (strict && ckWARN(WARN_REGEXP)) {
17747             if (range) {
17748 
17749                 /* If the range starts above 255, everything is portable and
17750                  * likely to be so for any forseeable character set, so don't
17751                  * warn. */
17752                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
17753                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
17754                 }
17755                 else if (prevvalue != value) {
17756 
17757                     /* Under strict, ranges that stop and/or end in an ASCII
17758                      * printable should have each end point be a portable value
17759                      * for it (preferably like 'A', but we don't warn if it is
17760                      * a (portable) Unicode name or code point), and the range
17761                      * must be be all digits or all letters of the same case.
17762                      * Otherwise, the range is non-portable and unclear as to
17763                      * what it contains */
17764                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
17765                         && (          non_portable_endpoint
17766                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
17767                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
17768                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
17769                     ))) {
17770                         vWARN(RExC_parse, "Ranges of ASCII printables should"
17771                                           " be some subset of \"0-9\","
17772                                           " \"A-Z\", or \"a-z\"");
17773                     }
17774                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
17775                         SSize_t index_start;
17776                         SSize_t index_final;
17777 
17778                         /* But the nature of Unicode and languages mean we
17779                          * can't do the same checks for above-ASCII ranges,
17780                          * except in the case of digit ones.  These should
17781                          * contain only digits from the same group of 10.  The
17782                          * ASCII case is handled just above.  Hence here, the
17783                          * range could be a range of digits.  First some
17784                          * unlikely special cases.  Grandfather in that a range
17785                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
17786                          * if its starting value is one of the 10 digits prior
17787                          * to it.  This is because it is an alternate way of
17788                          * writing 19D1, and some people may expect it to be in
17789                          * that group.  But it is bad, because it won't give
17790                          * the expected results.  In Unicode 5.2 it was
17791                          * considered to be in that group (of 11, hence), but
17792                          * this was fixed in the next version */
17793 
17794                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
17795                             goto warn_bad_digit_range;
17796                         }
17797                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
17798                                           &&     value <= 0x1D7FF))
17799                         {
17800                             /* This is the only other case currently in Unicode
17801                              * where the algorithm below fails.  The code
17802                              * points just above are the end points of a single
17803                              * range containing only decimal digits.  It is 5
17804                              * different series of 0-9.  All other ranges of
17805                              * digits currently in Unicode are just a single
17806                              * series.  (And mktables will notify us if a later
17807                              * Unicode version breaks this.)
17808                              *
17809                              * If the range being checked is at most 9 long,
17810                              * and the digit values represented are in
17811                              * numerical order, they are from the same series.
17812                              * */
17813                             if (         value - prevvalue > 9
17814                                 ||    (((    value - 0x1D7CE) % 10)
17815                                      <= (prevvalue - 0x1D7CE) % 10))
17816                             {
17817                                 goto warn_bad_digit_range;
17818                             }
17819                         }
17820                         else {
17821 
17822                             /* For all other ranges of digits in Unicode, the
17823                              * algorithm is just to check if both end points
17824                              * are in the same series, which is the same range.
17825                              * */
17826                             index_start = _invlist_search(
17827                                                     PL_XPosix_ptrs[_CC_DIGIT],
17828                                                     prevvalue);
17829 
17830                             /* Warn if the range starts and ends with a digit,
17831                              * and they are not in the same group of 10. */
17832                             if (   index_start >= 0
17833                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
17834                                 && (index_final =
17835                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
17836                                                     value)) != index_start
17837                                 && index_final >= 0
17838                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
17839                             {
17840                               warn_bad_digit_range:
17841                                 vWARN(RExC_parse, "Ranges of digits should be"
17842                                                   " from the same group of"
17843                                                   " 10");
17844                             }
17845                         }
17846                     }
17847                 }
17848             }
17849             if ((! range || prevvalue == value) && non_portable_endpoint) {
17850                 if (isPRINT_A(value)) {
17851                     char literal[3];
17852                     unsigned d = 0;
17853                     if (isBACKSLASHED_PUNCT(value)) {
17854                         literal[d++] = '\\';
17855                     }
17856                     literal[d++] = (char) value;
17857                     literal[d++] = '\0';
17858 
17859                     vWARN4(RExC_parse,
17860                            "\"%.*s\" is more clearly written simply as \"%s\"",
17861                            (int) (RExC_parse - rangebegin),
17862                            rangebegin,
17863                            literal
17864                         );
17865                 }
17866                 else if isMNEMONIC_CNTRL(value) {
17867                     vWARN4(RExC_parse,
17868                            "\"%.*s\" is more clearly written simply as \"%s\"",
17869                            (int) (RExC_parse - rangebegin),
17870                            rangebegin,
17871                            cntrl_to_mnemonic((U8) value)
17872                         );
17873                 }
17874             }
17875         }
17876 
17877         /* Deal with this element of the class */
17878 
17879 #ifndef EBCDIC
17880         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17881                                                     prevvalue, value);
17882 #else
17883         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
17884          * that don't require special handling, we can just add the range like
17885          * we do for ASCII platforms */
17886         if ((UNLIKELY(prevvalue == 0) && value >= 255)
17887             || ! (prevvalue < 256
17888                     && (unicode_range
17889                         || (! non_portable_endpoint
17890                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
17891                                 || (isUPPER_A(prevvalue)
17892                                     && isUPPER_A(value)))))))
17893         {
17894             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17895                                                         prevvalue, value);
17896         }
17897         else {
17898             /* Here, requires special handling.  This can be because it is a
17899              * range whose code points are considered to be Unicode, and so
17900              * must be individually translated into native, or because its a
17901              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
17902              * EBCDIC, but we have defined them to include only the "expected"
17903              * upper or lower case ASCII alphabetics.  Subranges above 255 are
17904              * the same in native and Unicode, so can be added as a range */
17905             U8 start = NATIVE_TO_LATIN1(prevvalue);
17906             unsigned j;
17907             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
17908             for (j = start; j <= end; j++) {
17909                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
17910             }
17911             if (value > 255) {
17912                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17913                                                             256, value);
17914             }
17915         }
17916 #endif
17917 
17918 	range = 0; /* this range (if it was one) is done now */
17919     } /* End of loop through all the text within the brackets */
17920 
17921     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
17922         output_posix_warnings(pRExC_state, posix_warnings);
17923     }
17924 
17925     /* If anything in the class expands to more than one character, we have to
17926      * deal with them by building up a substitute parse string, and recursively
17927      * calling reg() on it, instead of proceeding */
17928     if (multi_char_matches) {
17929 	SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
17930         I32 cp_count;
17931 	STRLEN len;
17932 	char *save_end = RExC_end;
17933 	char *save_parse = RExC_parse;
17934 	char *save_start = RExC_start;
17935         Size_t constructed_prefix_len = 0; /* This gives the length of the
17936                                               constructed portion of the
17937                                               substitute parse. */
17938         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
17939                                        a "|" */
17940         I32 reg_flags;
17941 
17942         assert(! invert);
17943         /* Only one level of recursion allowed */
17944         assert(RExC_copy_start_in_constructed == RExC_precomp);
17945 
17946 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
17947            because too confusing */
17948         if (invert) {
17949             sv_catpvs(substitute_parse, "(?:");
17950         }
17951 #endif
17952 
17953         /* Look at the longest folds first */
17954         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
17955                         cp_count > 0;
17956                         cp_count--)
17957         {
17958 
17959             if (av_exists(multi_char_matches, cp_count)) {
17960                 AV** this_array_ptr;
17961                 SV* this_sequence;
17962 
17963                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
17964                                                  cp_count, FALSE);
17965                 while ((this_sequence = av_pop(*this_array_ptr)) !=
17966                                                                 &PL_sv_undef)
17967                 {
17968                     if (! first_time) {
17969                         sv_catpvs(substitute_parse, "|");
17970                     }
17971                     first_time = FALSE;
17972 
17973                     sv_catpv(substitute_parse, SvPVX(this_sequence));
17974                 }
17975             }
17976         }
17977 
17978         /* If the character class contains anything else besides these
17979          * multi-character folds, have to include it in recursive parsing */
17980         if (element_count) {
17981             sv_catpvs(substitute_parse, "|[");
17982             constructed_prefix_len = SvCUR(substitute_parse);
17983             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
17984 
17985             /* Put in a closing ']' only if not going off the end, as otherwise
17986              * we are adding something that really isn't there */
17987             if (RExC_parse < RExC_end) {
17988                 sv_catpvs(substitute_parse, "]");
17989             }
17990         }
17991 
17992         sv_catpvs(substitute_parse, ")");
17993 #if 0
17994         if (invert) {
17995             /* This is a way to get the parse to skip forward a whole named
17996              * sequence instead of matching the 2nd character when it fails the
17997              * first */
17998             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
17999         }
18000 #endif
18001 
18002         /* Set up the data structure so that any errors will be properly
18003          * reported.  See the comments at the definition of
18004          * REPORT_LOCATION_ARGS for details */
18005         RExC_copy_start_in_input = (char *) orig_parse;
18006 	RExC_start = RExC_parse = SvPV(substitute_parse, len);
18007         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18008 	RExC_end = RExC_parse + len;
18009         RExC_in_multi_char_class = 1;
18010 
18011 	ret = reg(pRExC_state, 1, &reg_flags, depth+1);
18012 
18013         *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
18014 
18015         /* And restore so can parse the rest of the pattern */
18016         RExC_parse = save_parse;
18017 	RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18018 	RExC_end = save_end;
18019 	RExC_in_multi_char_class = 0;
18020         SvREFCNT_dec_NN(multi_char_matches);
18021         return ret;
18022     }
18023 
18024     /* If folding, we calculate all characters that could fold to or from the
18025      * ones already on the list */
18026     if (cp_foldable_list) {
18027         if (FOLD) {
18028             UV start, end;	/* End points of code point ranges */
18029 
18030             SV* fold_intersection = NULL;
18031             SV** use_list;
18032 
18033             /* Our calculated list will be for Unicode rules.  For locale
18034              * matching, we have to keep a separate list that is consulted at
18035              * runtime only when the locale indicates Unicode rules (and we
18036              * don't include potential matches in the ASCII/Latin1 range, as
18037              * any code point could fold to any other, based on the run-time
18038              * locale).   For non-locale, we just use the general list */
18039             if (LOC) {
18040                 use_list = &only_utf8_locale_list;
18041             }
18042             else {
18043                 use_list = &cp_list;
18044             }
18045 
18046             /* Only the characters in this class that participate in folds need
18047              * be checked.  Get the intersection of this class and all the
18048              * possible characters that are foldable.  This can quickly narrow
18049              * down a large class */
18050             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18051                                   &fold_intersection);
18052 
18053             /* Now look at the foldable characters in this class individually */
18054             invlist_iterinit(fold_intersection);
18055             while (invlist_iternext(fold_intersection, &start, &end)) {
18056                 UV j;
18057                 UV folded;
18058 
18059                 /* Look at every character in the range */
18060                 for (j = start; j <= end; j++) {
18061                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18062                     STRLEN foldlen;
18063                     unsigned int k;
18064                     Size_t folds_count;
18065                     unsigned int first_fold;
18066                     const unsigned int * remaining_folds;
18067 
18068                     if (j < 256) {
18069 
18070                         /* Under /l, we don't know what code points below 256
18071                          * fold to, except we do know the MICRO SIGN folds to
18072                          * an above-255 character if the locale is UTF-8, so we
18073                          * add it to the special list (in *use_list)  Otherwise
18074                          * we know now what things can match, though some folds
18075                          * are valid under /d only if the target is UTF-8.
18076                          * Those go in a separate list */
18077                         if (      IS_IN_SOME_FOLD_L1(j)
18078                             && ! (LOC && j != MICRO_SIGN))
18079                         {
18080 
18081                             /* ASCII is always matched; non-ASCII is matched
18082                              * only under Unicode rules (which could happen
18083                              * under /l if the locale is a UTF-8 one */
18084                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18085                                 *use_list = add_cp_to_invlist(*use_list,
18086                                                             PL_fold_latin1[j]);
18087                             }
18088                             else if (j != PL_fold_latin1[j]) {
18089                                 upper_latin1_only_utf8_matches
18090                                         = add_cp_to_invlist(
18091                                                 upper_latin1_only_utf8_matches,
18092                                                 PL_fold_latin1[j]);
18093                             }
18094                         }
18095 
18096                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18097                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18098                         {
18099                             add_above_Latin1_folds(pRExC_state,
18100                                                    (U8) j,
18101                                                    use_list);
18102                         }
18103                         continue;
18104                     }
18105 
18106                     /* Here is an above Latin1 character.  We don't have the
18107                      * rules hard-coded for it.  First, get its fold.  This is
18108                      * the simple fold, as the multi-character folds have been
18109                      * handled earlier and separated out */
18110                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18111                                                         (ASCII_FOLD_RESTRICTED)
18112                                                         ? FOLD_FLAGS_NOMIX_ASCII
18113                                                         : 0);
18114 
18115                     /* Single character fold of above Latin1.  Add everything
18116                      * in its fold closure to the list that this node should
18117                      * match. */
18118                     folds_count = _inverse_folds(folded, &first_fold,
18119                                                     &remaining_folds);
18120                     for (k = 0; k <= folds_count; k++) {
18121                         UV c = (k == 0)     /* First time through use itself */
18122                                 ? folded
18123                                 : (k == 1)  /* 2nd time use, the first fold */
18124                                    ? first_fold
18125 
18126                                      /* Then the remaining ones */
18127                                    : remaining_folds[k-2];
18128 
18129                         /* /aa doesn't allow folds between ASCII and non- */
18130                         if ((   ASCII_FOLD_RESTRICTED
18131                             && (isASCII(c) != isASCII(j))))
18132                         {
18133                             continue;
18134                         }
18135 
18136                         /* Folds under /l which cross the 255/256 boundary are
18137                          * added to a separate list.  (These are valid only
18138                          * when the locale is UTF-8.) */
18139                         if (c < 256 && LOC) {
18140                             *use_list = add_cp_to_invlist(*use_list, c);
18141                             continue;
18142                         }
18143 
18144                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18145                         {
18146                             cp_list = add_cp_to_invlist(cp_list, c);
18147                         }
18148                         else {
18149                             /* Similarly folds involving non-ascii Latin1
18150                              * characters under /d are added to their list */
18151                             upper_latin1_only_utf8_matches
18152                                     = add_cp_to_invlist(
18153                                                 upper_latin1_only_utf8_matches,
18154                                                 c);
18155                         }
18156                     }
18157                 }
18158             }
18159             SvREFCNT_dec_NN(fold_intersection);
18160         }
18161 
18162         /* Now that we have finished adding all the folds, there is no reason
18163          * to keep the foldable list separate */
18164         _invlist_union(cp_list, cp_foldable_list, &cp_list);
18165 	SvREFCNT_dec_NN(cp_foldable_list);
18166     }
18167 
18168     /* And combine the result (if any) with any inversion lists from posix
18169      * classes.  The lists are kept separate up to now because we don't want to
18170      * fold the classes */
18171     if (simple_posixes) {   /* These are the classes known to be unaffected by
18172                                /a, /aa, and /d */
18173         if (cp_list) {
18174             _invlist_union(cp_list, simple_posixes, &cp_list);
18175             SvREFCNT_dec_NN(simple_posixes);
18176         }
18177         else {
18178             cp_list = simple_posixes;
18179         }
18180     }
18181     if (posixes || nposixes) {
18182         if (! DEPENDS_SEMANTICS) {
18183 
18184             /* For everything but /d, we can just add the current 'posixes' and
18185              * 'nposixes' to the main list */
18186             if (posixes) {
18187                 if (cp_list) {
18188                     _invlist_union(cp_list, posixes, &cp_list);
18189                     SvREFCNT_dec_NN(posixes);
18190                 }
18191                 else {
18192                     cp_list = posixes;
18193                 }
18194             }
18195             if (nposixes) {
18196                 if (cp_list) {
18197                     _invlist_union(cp_list, nposixes, &cp_list);
18198                     SvREFCNT_dec_NN(nposixes);
18199                 }
18200                 else {
18201                     cp_list = nposixes;
18202                 }
18203             }
18204         }
18205         else {
18206             /* Under /d, things like \w match upper Latin1 characters only if
18207              * the target string is in UTF-8.  But things like \W match all the
18208              * upper Latin1 characters if the target string is not in UTF-8.
18209              *
18210              * Handle the case with something like \W separately */
18211             if (nposixes) {
18212                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18213 
18214                 /* A complemented posix class matches all upper Latin1
18215                  * characters if not in UTF-8.  And it matches just certain
18216                  * ones when in UTF-8.  That means those certain ones are
18217                  * matched regardless, so can just be added to the
18218                  * unconditional list */
18219                 if (cp_list) {
18220                     _invlist_union(cp_list, nposixes, &cp_list);
18221                     SvREFCNT_dec_NN(nposixes);
18222                     nposixes = NULL;
18223                 }
18224                 else {
18225                     cp_list = nposixes;
18226                 }
18227 
18228                 /* Likewise for 'posixes' */
18229                 _invlist_union(posixes, cp_list, &cp_list);
18230                 SvREFCNT_dec(posixes);
18231 
18232                 /* Likewise for anything else in the range that matched only
18233                  * under UTF-8 */
18234                 if (upper_latin1_only_utf8_matches) {
18235                     _invlist_union(cp_list,
18236                                    upper_latin1_only_utf8_matches,
18237                                    &cp_list);
18238                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18239                     upper_latin1_only_utf8_matches = NULL;
18240                 }
18241 
18242                 /* If we don't match all the upper Latin1 characters regardless
18243                  * of UTF-8ness, we have to set a flag to match the rest when
18244                  * not in UTF-8 */
18245                 _invlist_subtract(only_non_utf8_list, cp_list,
18246                                   &only_non_utf8_list);
18247                 if (_invlist_len(only_non_utf8_list) != 0) {
18248                     anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18249                 }
18250                 SvREFCNT_dec_NN(only_non_utf8_list);
18251             }
18252             else {
18253                 /* Here there were no complemented posix classes.  That means
18254                  * the upper Latin1 characters in 'posixes' match only when the
18255                  * target string is in UTF-8.  So we have to add them to the
18256                  * list of those types of code points, while adding the
18257                  * remainder to the unconditional list.
18258                  *
18259                  * First calculate what they are */
18260                 SV* nonascii_but_latin1_properties = NULL;
18261                 _invlist_intersection(posixes, PL_UpperLatin1,
18262                                       &nonascii_but_latin1_properties);
18263 
18264                 /* And add them to the final list of such characters. */
18265                 _invlist_union(upper_latin1_only_utf8_matches,
18266                                nonascii_but_latin1_properties,
18267                                &upper_latin1_only_utf8_matches);
18268 
18269                 /* Remove them from what now becomes the unconditional list */
18270                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18271                                   &posixes);
18272 
18273                 /* And add those unconditional ones to the final list */
18274                 if (cp_list) {
18275                     _invlist_union(cp_list, posixes, &cp_list);
18276                     SvREFCNT_dec_NN(posixes);
18277                     posixes = NULL;
18278                 }
18279                 else {
18280                     cp_list = posixes;
18281                 }
18282 
18283                 SvREFCNT_dec(nonascii_but_latin1_properties);
18284 
18285                 /* Get rid of any characters from the conditional list that we
18286                  * now know are matched unconditionally, which may make that
18287                  * list empty */
18288                 _invlist_subtract(upper_latin1_only_utf8_matches,
18289                                   cp_list,
18290                                   &upper_latin1_only_utf8_matches);
18291                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18292                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18293                     upper_latin1_only_utf8_matches = NULL;
18294                 }
18295             }
18296         }
18297     }
18298 
18299     /* And combine the result (if any) with any inversion list from properties.
18300      * The lists are kept separate up to now so that we can distinguish the two
18301      * in regards to matching above-Unicode.  A run-time warning is generated
18302      * if a Unicode property is matched against a non-Unicode code point. But,
18303      * we allow user-defined properties to match anything, without any warning,
18304      * and we also suppress the warning if there is a portion of the character
18305      * class that isn't a Unicode property, and which matches above Unicode, \W
18306      * or [\x{110000}] for example.
18307      * (Note that in this case, unlike the Posix one above, there is no
18308      * <upper_latin1_only_utf8_matches>, because having a Unicode property
18309      * forces Unicode semantics */
18310     if (properties) {
18311         if (cp_list) {
18312 
18313             /* If it matters to the final outcome, see if a non-property
18314              * component of the class matches above Unicode.  If so, the
18315              * warning gets suppressed.  This is true even if just a single
18316              * such code point is specified, as, though not strictly correct if
18317              * another such code point is matched against, the fact that they
18318              * are using above-Unicode code points indicates they should know
18319              * the issues involved */
18320             if (warn_super) {
18321                 warn_super = ! (invert
18322                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18323             }
18324 
18325             _invlist_union(properties, cp_list, &cp_list);
18326             SvREFCNT_dec_NN(properties);
18327         }
18328         else {
18329             cp_list = properties;
18330         }
18331 
18332         if (warn_super) {
18333             anyof_flags
18334              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18335 
18336             /* Because an ANYOF node is the only one that warns, this node
18337              * can't be optimized into something else */
18338             optimizable = FALSE;
18339         }
18340     }
18341 
18342     /* Here, we have calculated what code points should be in the character
18343      * class.
18344      *
18345      * Now we can see about various optimizations.  Fold calculation (which we
18346      * did above) needs to take place before inversion.  Otherwise /[^k]/i
18347      * would invert to include K, which under /i would match k, which it
18348      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
18349      * folded until runtime */
18350 
18351     /* If we didn't do folding, it's because some information isn't available
18352      * until runtime; set the run-time fold flag for these  We know to set the
18353      * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18354      * at least one 0-255 range code point */
18355     if (LOC && FOLD) {
18356 
18357         /* Some things on the list might be unconditionally included because of
18358          * other components.  Remove them, and clean up the list if it goes to
18359          * 0 elements */
18360         if (only_utf8_locale_list && cp_list) {
18361             _invlist_subtract(only_utf8_locale_list, cp_list,
18362                               &only_utf8_locale_list);
18363 
18364             if (_invlist_len(only_utf8_locale_list) == 0) {
18365                 SvREFCNT_dec_NN(only_utf8_locale_list);
18366                 only_utf8_locale_list = NULL;
18367             }
18368         }
18369         if (    only_utf8_locale_list
18370             || (cp_list && (   _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18371                             || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18372         {
18373             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18374             anyof_flags
18375                  |= ANYOFL_FOLD
18376                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18377         }
18378         else if (cp_list) { /* Look to see if a 0-255 code point is in list */
18379             UV start, end;
18380             invlist_iterinit(cp_list);
18381             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
18382                 anyof_flags |= ANYOFL_FOLD;
18383                 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18384             }
18385             invlist_iterfinish(cp_list);
18386         }
18387     }
18388     else if (   DEPENDS_SEMANTICS
18389              && (    upper_latin1_only_utf8_matches
18390                  || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18391     {
18392         RExC_seen_d_op = TRUE;
18393         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18394     }
18395 
18396     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18397      * compile time. */
18398     if (     cp_list
18399         &&   invert
18400         && ! has_runtime_dependency)
18401     {
18402         _invlist_invert(cp_list);
18403 
18404 	/* Clear the invert flag since have just done it here */
18405 	invert = FALSE;
18406     }
18407 
18408     if (ret_invlist) {
18409         *ret_invlist = cp_list;
18410 
18411         return RExC_emit;
18412     }
18413 
18414     /* All possible optimizations below still have these characteristics.
18415      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18416      * routine) */
18417     *flagp |= HASWIDTH|SIMPLE;
18418 
18419     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
18420         RExC_contains_locale = 1;
18421     }
18422 
18423     /* Some character classes are equivalent to other nodes.  Such nodes take
18424      * up less room, and some nodes require fewer operations to execute, than
18425      * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
18426      * improve efficiency. */
18427 
18428     if (optimizable) {
18429         PERL_UINT_FAST8_T i;
18430         Size_t partial_cp_count = 0;
18431         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
18432         UV   end[MAX_FOLD_FROMS+1] = { 0 };
18433 
18434         if (cp_list) { /* Count the code points in enough ranges that we would
18435                           see all the ones possible in any fold in this version
18436                           of Unicode */
18437 
18438             invlist_iterinit(cp_list);
18439             for (i = 0; i <= MAX_FOLD_FROMS; i++) {
18440                 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
18441                     break;
18442                 }
18443                 partial_cp_count += end[i] - start[i] + 1;
18444             }
18445 
18446             invlist_iterfinish(cp_list);
18447         }
18448 
18449         /* If we know at compile time that this matches every possible code
18450          * point, any run-time dependencies don't matter */
18451         if (start[0] == 0 && end[0] == UV_MAX) {
18452             if (invert) {
18453                 ret = reganode(pRExC_state, OPFAIL, 0);
18454             }
18455             else {
18456                 ret = reg_node(pRExC_state, SANY);
18457                 MARK_NAUGHTY(1);
18458             }
18459             goto not_anyof;
18460         }
18461 
18462         /* Similarly, for /l posix classes, if both a class and its
18463          * complement match, any run-time dependencies don't matter */
18464         if (posixl) {
18465             for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
18466                                                         namedclass += 2)
18467             {
18468                 if (   POSIXL_TEST(posixl, namedclass)      /* class */
18469                     && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
18470                 {
18471                     if (invert) {
18472                         ret = reganode(pRExC_state, OPFAIL, 0);
18473                     }
18474                     else {
18475                         ret = reg_node(pRExC_state, SANY);
18476                         MARK_NAUGHTY(1);
18477                     }
18478                     goto not_anyof;
18479                 }
18480             }
18481             /* For well-behaved locales, some classes are subsets of others,
18482              * so complementing the subset and including the non-complemented
18483              * superset should match everything, like [\D[:alnum:]], and
18484              * [[:^alpha:][:alnum:]], but some implementations of locales are
18485              * buggy, and khw thinks its a bad idea to have optimization change
18486              * behavior, even if it avoids an OS bug in a given case */
18487 
18488 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
18489 
18490             /* If is a single posix /l class, can optimize to just that op.
18491              * Such a node will not match anything in the Latin1 range, as that
18492              * is not determinable until runtime, but will match whatever the
18493              * class does outside that range.  (Note that some classes won't
18494              * match anything outside the range, like [:ascii:]) */
18495             if (    isSINGLE_BIT_SET(posixl)
18496                 && (partial_cp_count == 0 || start[0] > 255))
18497             {
18498                 U8 classnum;
18499                 SV * class_above_latin1 = NULL;
18500                 bool already_inverted;
18501                 bool are_equivalent;
18502 
18503                 /* Compute which bit is set, which is the same thing as, e.g.,
18504                  * ANYOF_CNTRL.  From
18505                  * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
18506                  * */
18507                 static const int MultiplyDeBruijnBitPosition2[32] =
18508                     {
18509                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
18510                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
18511                     };
18512 
18513                 namedclass = MultiplyDeBruijnBitPosition2[(posixl
18514                                                           * 0x077CB531U) >> 27];
18515                 classnum = namedclass_to_classnum(namedclass);
18516 
18517                 /* The named classes are such that the inverted number is one
18518                  * larger than the non-inverted one */
18519                 already_inverted = namedclass
18520                                  - classnum_to_namedclass(classnum);
18521 
18522                 /* Create an inversion list of the official property, inverted
18523                  * if the constructed node list is inverted, and restricted to
18524                  * only the above latin1 code points, which are the only ones
18525                  * known at compile time */
18526                 _invlist_intersection_maybe_complement_2nd(
18527                                                     PL_AboveLatin1,
18528                                                     PL_XPosix_ptrs[classnum],
18529                                                     already_inverted,
18530                                                     &class_above_latin1);
18531                 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
18532                                                                         FALSE);
18533                 SvREFCNT_dec_NN(class_above_latin1);
18534 
18535                 if (are_equivalent) {
18536 
18537                     /* Resolve the run-time inversion flag with this possibly
18538                      * inverted class */
18539                     invert = invert ^ already_inverted;
18540 
18541                     ret = reg_node(pRExC_state,
18542                                    POSIXL + invert * (NPOSIXL - POSIXL));
18543                     FLAGS(REGNODE_p(ret)) = classnum;
18544                     goto not_anyof;
18545                 }
18546             }
18547         }
18548 
18549         /* khw can't think of any other possible transformation involving
18550          * these. */
18551         if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
18552             goto is_anyof;
18553         }
18554 
18555         if (! has_runtime_dependency) {
18556 
18557             /* If the list is empty, nothing matches.  This happens, for
18558              * example, when a Unicode property that doesn't match anything is
18559              * the only element in the character class (perluniprops.pod notes
18560              * such properties). */
18561             if (partial_cp_count == 0) {
18562                 if (invert) {
18563                     ret = reg_node(pRExC_state, SANY);
18564                 }
18565                 else {
18566                     ret = reganode(pRExC_state, OPFAIL, 0);
18567                 }
18568 
18569                 goto not_anyof;
18570             }
18571 
18572             /* If matches everything but \n */
18573             if (   start[0] == 0 && end[0] == '\n' - 1
18574                 && start[1] == '\n' + 1 && end[1] == UV_MAX)
18575             {
18576                 assert (! invert);
18577                 ret = reg_node(pRExC_state, REG_ANY);
18578                 MARK_NAUGHTY(1);
18579                 goto not_anyof;
18580             }
18581         }
18582 
18583         /* Next see if can optimize classes that contain just a few code points
18584          * into an EXACTish node.  The reason to do this is to let the
18585          * optimizer join this node with adjacent EXACTish ones.
18586          *
18587          * An EXACTFish node can be generated even if not under /i, and vice
18588          * versa.  But care must be taken.  An EXACTFish node has to be such
18589          * that it only matches precisely the code points in the class, but we
18590          * want to generate the least restrictive one that does that, to
18591          * increase the odds of being able to join with an adjacent node.  For
18592          * example, if the class contains [kK], we have to make it an EXACTFAA
18593          * node to prevent the KELVIN SIGN from matching.  Whether we are under
18594          * /i or not is irrelevant in this case.  Less obvious is the pattern
18595          * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
18596          * supposed to match the single character U+0149 LATIN SMALL LETTER N
18597          * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
18598          * that includes \X{02BC}, there is a multi-char fold that does, and so
18599          * the node generated for it must be an EXACTFish one.  On the other
18600          * hand qr/:/i should generate a plain EXACT node since the colon
18601          * participates in no fold whatsoever, and having it EXACT tells the
18602          * optimizer the target string cannot match unless it has a colon in
18603          * it.
18604          *
18605          * We don't typically generate an EXACTish node if doing so would
18606          * require changing the pattern to UTF-8, as that affects /d and
18607          * otherwise is slower.  However, under /i, not changing to UTF-8 can
18608          * miss some potential multi-character folds.  We calculate the
18609          * EXACTish node, and then decide if something would be missed if we
18610          * don't upgrade */
18611         if (   ! posixl
18612             && ! invert
18613 
18614                 /* Only try if there are no more code points in the class than
18615                  * in the max possible fold */
18616             &&   partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1
18617 
18618             && (start[0] < 256 || UTF || FOLD))
18619         {
18620             if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
18621             {
18622                 /* We can always make a single code point class into an
18623                  * EXACTish node. */
18624 
18625                 if (LOC) {
18626 
18627                     /* Here is /l:  Use EXACTL, except /li indicates EXACTFL,
18628                      * as that means there is a fold not known until runtime so
18629                      * shows as only a single code point here. */
18630                     op = (FOLD) ? EXACTFL : EXACTL;
18631                 }
18632                 else if (! FOLD) { /* Not /l and not /i */
18633                     op = (start[0] < 256) ? EXACT : EXACT_ONLY8;
18634                 }
18635                 else if (start[0] < 256) { /* /i, not /l, and the code point is
18636                                               small */
18637 
18638                     /* Under /i, it gets a little tricky.  A code point that
18639                      * doesn't participate in a fold should be an EXACT node.
18640                      * We know this one isn't the result of a simple fold, or
18641                      * there'd be more than one code point in the list, but it
18642                      * could be part of a multi- character fold.  In that case
18643                      * we better not create an EXACT node, as we would wrongly
18644                      * be telling the optimizer that this code point must be in
18645                      * the target string, and that is wrong.  This is because
18646                      * if the sequence around this code point forms a
18647                      * multi-char fold, what needs to be in the string could be
18648                      * the code point that folds to the sequence.
18649                      *
18650                      * This handles the case of below-255 code points, as we
18651                      * have an easy look up for those.  The next clause handles
18652                      * the above-256 one */
18653                     op = IS_IN_SOME_FOLD_L1(start[0])
18654                          ? EXACTFU
18655                          : EXACT;
18656                 }
18657                 else {  /* /i, larger code point.  Since we are under /i, and
18658                            have just this code point, we know that it can't
18659                            fold to something else, so PL_InMultiCharFold
18660                            applies to it */
18661                     op = _invlist_contains_cp(PL_InMultiCharFold,
18662                                               start[0])
18663                          ? EXACTFU_ONLY8
18664                          : EXACT_ONLY8;
18665                 }
18666 
18667                 value = start[0];
18668             }
18669             else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
18670                      && _invlist_contains_cp(PL_in_some_fold, start[0]))
18671             {
18672                 /* Here, the only runtime dependency, if any, is from /d, and
18673                  * the class matches more than one code point, and the lowest
18674                  * code point participates in some fold.  It might be that the
18675                  * other code points are /i equivalent to this one, and hence
18676                  * they would representable by an EXACTFish node.  Above, we
18677                  * eliminated classes that contain too many code points to be
18678                  * EXACTFish, with the test for MAX_FOLD_FROMS
18679                  *
18680                  * First, special case the ASCII fold pairs, like 'B' and 'b'.
18681                  * We do this because we have EXACTFAA at our disposal for the
18682                  * ASCII range */
18683                 if (partial_cp_count == 2 && isASCII(start[0])) {
18684 
18685                     /* The only ASCII characters that participate in folds are
18686                      * alphabetics */
18687                     assert(isALPHA(start[0]));
18688                     if (   end[0] == start[0]   /* First range is a single
18689                                                    character, so 2nd exists */
18690                         && isALPHA_FOLD_EQ(start[0], start[1]))
18691                     {
18692 
18693                         /* Here, is part of an ASCII fold pair */
18694 
18695                         if (   ASCII_FOLD_RESTRICTED
18696                             || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
18697                         {
18698                             /* If the second clause just above was true, it
18699                              * means we can't be under /i, or else the list
18700                              * would have included more than this fold pair.
18701                              * Therefore we have to exclude the possibility of
18702                              * whatever else it is that folds to these, by
18703                              * using EXACTFAA */
18704                             op = EXACTFAA;
18705                         }
18706                         else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
18707 
18708                             /* Here, there's no simple fold that start[0] is part
18709                              * of, but there is a multi-character one.  If we
18710                              * are not under /i, we want to exclude that
18711                              * possibility; if under /i, we want to include it
18712                              * */
18713                             op = (FOLD) ? EXACTFU : EXACTFAA;
18714                         }
18715                         else {
18716 
18717                             /* Here, the only possible fold start[0] particpates in
18718                              * is with start[1].  /i or not isn't relevant */
18719                             op = EXACTFU;
18720                         }
18721 
18722                         value = toFOLD(start[0]);
18723                     }
18724                 }
18725                 else if (  ! upper_latin1_only_utf8_matches
18726                          || (   _invlist_len(upper_latin1_only_utf8_matches)
18727                                                                           == 2
18728                              && PL_fold_latin1[
18729                                invlist_highest(upper_latin1_only_utf8_matches)]
18730                              == start[0]))
18731                 {
18732                     /* Here, the smallest character is non-ascii or there are
18733                      * more than 2 code points matched by this node.  Also, we
18734                      * either don't have /d UTF-8 dependent matches, or if we
18735                      * do, they look like they could be a single character that
18736                      * is the fold of the lowest one in the always-match list.
18737                      * This test quickly excludes most of the false positives
18738                      * when there are /d UTF-8 depdendent matches.  These are
18739                      * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
18740                      * SMALL LETTER A WITH GRAVE iff the target string is
18741                      * UTF-8.  (We don't have to worry above about exceeding
18742                      * the array bounds of PL_fold_latin1[] because any code
18743                      * point in 'upper_latin1_only_utf8_matches' is below 256.)
18744                      *
18745                      * EXACTFAA would apply only to pairs (hence exactly 2 code
18746                      * points) in the ASCII range, so we can't use it here to
18747                      * artificially restrict the fold domain, so we check if
18748                      * the class does or does not match some EXACTFish node.
18749                      * Further, if we aren't under /i, and and the folded-to
18750                      * character is part of a multi-character fold, we can't do
18751                      * this optimization, as the sequence around it could be
18752                      * that multi-character fold, and we don't here know the
18753                      * context, so we have to assume it is that multi-char
18754                      * fold, to prevent potential bugs.
18755                      *
18756                      * To do the general case, we first find the fold of the
18757                      * lowest code point (which may be higher than the lowest
18758                      * one), then find everything that folds to it.  (The data
18759                      * structure we have only maps from the folded code points,
18760                      * so we have to do the earlier step.) */
18761 
18762                     Size_t foldlen;
18763                     U8 foldbuf[UTF8_MAXBYTES_CASE];
18764                     UV folded = _to_uni_fold_flags(start[0],
18765                                                         foldbuf, &foldlen, 0);
18766                     unsigned int first_fold;
18767                     const unsigned int * remaining_folds;
18768                     Size_t folds_to_this_cp_count = _inverse_folds(
18769                                                             folded,
18770                                                             &first_fold,
18771                                                             &remaining_folds);
18772                     Size_t folds_count = folds_to_this_cp_count + 1;
18773                     SV * fold_list = _new_invlist(folds_count);
18774                     unsigned int i;
18775 
18776                     /* If there are UTF-8 dependent matches, create a temporary
18777                      * list of what this node matches, including them. */
18778                     SV * all_cp_list = NULL;
18779                     SV ** use_this_list = &cp_list;
18780 
18781                     if (upper_latin1_only_utf8_matches) {
18782                         all_cp_list = _new_invlist(0);
18783                         use_this_list = &all_cp_list;
18784                         _invlist_union(cp_list,
18785                                        upper_latin1_only_utf8_matches,
18786                                        use_this_list);
18787                     }
18788 
18789                     /* Having gotten everything that participates in the fold
18790                      * containing the lowest code point, we turn that into an
18791                      * inversion list, making sure everything is included. */
18792                     fold_list = add_cp_to_invlist(fold_list, start[0]);
18793                     fold_list = add_cp_to_invlist(fold_list, folded);
18794                     if (folds_to_this_cp_count > 0) {
18795                         fold_list = add_cp_to_invlist(fold_list, first_fold);
18796                         for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
18797                             fold_list = add_cp_to_invlist(fold_list,
18798                                                         remaining_folds[i]);
18799                         }
18800                     }
18801 
18802                     /* If the fold list is identical to what's in this ANYOF
18803                      * node, the node can be represented by an EXACTFish one
18804                      * instead */
18805                     if (_invlistEQ(*use_this_list, fold_list,
18806                                    0 /* Don't complement */ )
18807                     ) {
18808 
18809                         /* But, we have to be careful, as mentioned above.
18810                          * Just the right sequence of characters could match
18811                          * this if it is part of a multi-character fold.  That
18812                          * IS what we want if we are under /i.  But it ISN'T
18813                          * what we want if not under /i, as it could match when
18814                          * it shouldn't.  So, when we aren't under /i and this
18815                          * character participates in a multi-char fold, we
18816                          * don't optimize into an EXACTFish node.  So, for each
18817                          * case below we have to check if we are folding
18818                          * and if not, if it is not part of a multi-char fold.
18819                          * */
18820                         if (start[0] > 255) {    /* Highish code point */
18821                             if (FOLD || ! _invlist_contains_cp(
18822                                             PL_InMultiCharFold, folded))
18823                             {
18824                                 op = (LOC)
18825                                      ? EXACTFLU8
18826                                      : (ASCII_FOLD_RESTRICTED)
18827                                        ? EXACTFAA
18828                                        : EXACTFU_ONLY8;
18829                                 value = folded;
18830                             }
18831                         }   /* Below, the lowest code point < 256 */
18832                         else if (    FOLD
18833                                  &&  folded == 's'
18834                                  &&  DEPENDS_SEMANTICS)
18835                         {   /* An EXACTF node containing a single character
18836                                 's', can be an EXACTFU if it doesn't get
18837                                 joined with an adjacent 's' */
18838                             op = EXACTFU_S_EDGE;
18839                             value = folded;
18840                         }
18841                         else if (    FOLD
18842                                 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
18843                         {
18844                             if (upper_latin1_only_utf8_matches) {
18845                                 op = EXACTF;
18846 
18847                                 /* We can't use the fold, as that only matches
18848                                  * under UTF-8 */
18849                                 value = start[0];
18850                             }
18851                             else if (     UNLIKELY(start[0] == MICRO_SIGN)
18852                                      && ! UTF)
18853                             {   /* EXACTFUP is a special node for this
18854                                    character */
18855                                 op = (ASCII_FOLD_RESTRICTED)
18856                                      ? EXACTFAA
18857                                      : EXACTFUP;
18858                                 value = MICRO_SIGN;
18859                             }
18860                             else if (     ASCII_FOLD_RESTRICTED
18861                                      && ! isASCII(start[0]))
18862                             {   /* For ASCII under /iaa, we can use EXACTFU
18863                                    below */
18864                                 op = EXACTFAA;
18865                                 value = folded;
18866                             }
18867                             else {
18868                                 op = EXACTFU;
18869                                 value = folded;
18870                             }
18871                         }
18872                     }
18873 
18874                     SvREFCNT_dec_NN(fold_list);
18875                     SvREFCNT_dec(all_cp_list);
18876                 }
18877             }
18878 
18879             if (op != END) {
18880 
18881                 /* Here, we have calculated what EXACTish node we would use.
18882                  * But we don't use it if it would require converting the
18883                  * pattern to UTF-8, unless not using it could cause us to miss
18884                  * some folds (hence be buggy) */
18885 
18886                 if (! UTF && value > 255) {
18887                     SV * in_multis = NULL;
18888 
18889                     assert(FOLD);
18890 
18891                     /* If there is no code point that is part of a multi-char
18892                      * fold, then there aren't any matches, so we don't do this
18893                      * optimization.  Otherwise, it could match depending on
18894                      * the context around us, so we do upgrade */
18895                     _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis);
18896                     if (UNLIKELY(_invlist_len(in_multis) != 0)) {
18897                         REQUIRE_UTF8(flagp);
18898                     }
18899                     else {
18900                         op = END;
18901                     }
18902                 }
18903 
18904                 if (op != END) {
18905                     U8 len = (UTF) ? UVCHR_SKIP(value) : 1;
18906 
18907                     ret = regnode_guts(pRExC_state, op, len, "exact");
18908                     FILL_NODE(ret, op);
18909                     RExC_emit += 1 + STR_SZ(len);
18910                     STR_LEN(REGNODE_p(ret)) = len;
18911                     if (len == 1) {
18912                         *STRING(REGNODE_p(ret)) = (U8) value;
18913                     }
18914                     else {
18915                         uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
18916                     }
18917                     goto not_anyof;
18918                 }
18919             }
18920         }
18921 
18922         if (! has_runtime_dependency) {
18923 
18924             /* See if this can be turned into an ANYOFM node.  Think about the
18925              * bit patterns in two different bytes.  In some positions, the
18926              * bits in each will be 1; and in other positions both will be 0;
18927              * and in some positions the bit will be 1 in one byte, and 0 in
18928              * the other.  Let 'n' be the number of positions where the bits
18929              * differ.  We create a mask which has exactly 'n' 0 bits, each in
18930              * a position where the two bytes differ.  Now take the set of all
18931              * bytes that when ANDed with the mask yield the same result.  That
18932              * set has 2**n elements, and is representable by just two 8 bit
18933              * numbers: the result and the mask.  Importantly, matching the set
18934              * can be vectorized by creating a word full of the result bytes,
18935              * and a word full of the mask bytes, yielding a significant speed
18936              * up.  Here, see if this node matches such a set.  As a concrete
18937              * example consider [01], and the byte representing '0' which is
18938              * 0x30 on ASCII machines.  It has the bits 0011 0000.  Take the
18939              * mask 1111 1110.  If we AND 0x31 and 0x30 with that mask we get
18940              * 0x30.  Any other bytes ANDed yield something else.  So [01],
18941              * which is a common usage, is optimizable into ANYOFM, and can
18942              * benefit from the speed up.  We can only do this on UTF-8
18943              * invariant bytes, because they have the same bit patterns under
18944              * UTF-8 as not. */
18945             PERL_UINT_FAST8_T inverted = 0;
18946 #ifdef EBCDIC
18947             const PERL_UINT_FAST8_T max_permissible = 0xFF;
18948 #else
18949             const PERL_UINT_FAST8_T max_permissible = 0x7F;
18950 #endif
18951             /* If doesn't fit the criteria for ANYOFM, invert and try again.
18952              * If that works we will instead later generate an NANYOFM, and
18953              * invert back when through */
18954             if (invlist_highest(cp_list) > max_permissible) {
18955                 _invlist_invert(cp_list);
18956                 inverted = 1;
18957             }
18958 
18959             if (invlist_highest(cp_list) <= max_permissible) {
18960                 UV this_start, this_end;
18961                 UV lowest_cp = UV_MAX;  /* inited to suppress compiler warn */
18962                 U8 bits_differing = 0;
18963                 Size_t full_cp_count = 0;
18964                 bool first_time = TRUE;
18965 
18966                 /* Go through the bytes and find the bit positions that differ
18967                  * */
18968                 invlist_iterinit(cp_list);
18969                 while (invlist_iternext(cp_list, &this_start, &this_end)) {
18970                     unsigned int i = this_start;
18971 
18972                     if (first_time) {
18973                         if (! UVCHR_IS_INVARIANT(i)) {
18974                             goto done_anyofm;
18975                         }
18976 
18977                         first_time = FALSE;
18978                         lowest_cp = this_start;
18979 
18980                         /* We have set up the code point to compare with.
18981                          * Don't compare it with itself */
18982                         i++;
18983                     }
18984 
18985                     /* Find the bit positions that differ from the lowest code
18986                      * point in the node.  Keep track of all such positions by
18987                      * OR'ing */
18988                     for (; i <= this_end; i++) {
18989                         if (! UVCHR_IS_INVARIANT(i)) {
18990                             goto done_anyofm;
18991                         }
18992 
18993                         bits_differing  |= i ^ lowest_cp;
18994                     }
18995 
18996                     full_cp_count += this_end - this_start + 1;
18997                 }
18998                 invlist_iterfinish(cp_list);
18999 
19000                 /* At the end of the loop, we count how many bits differ from
19001                  * the bits in lowest code point, call the count 'd'.  If the
19002                  * set we found contains 2**d elements, it is the closure of
19003                  * all code points that differ only in those bit positions.  To
19004                  * convince yourself of that, first note that the number in the
19005                  * closure must be a power of 2, which we test for.  The only
19006                  * way we could have that count and it be some differing set,
19007                  * is if we got some code points that don't differ from the
19008                  * lowest code point in any position, but do differ from each
19009                  * other in some other position.  That means one code point has
19010                  * a 1 in that position, and another has a 0.  But that would
19011                  * mean that one of them differs from the lowest code point in
19012                  * that position, which possibility we've already excluded.  */
19013                 if (  (inverted || full_cp_count > 1)
19014                     && full_cp_count == 1U << PL_bitcount[bits_differing])
19015                 {
19016                     U8 ANYOFM_mask;
19017 
19018                     op = ANYOFM + inverted;;
19019 
19020                     /* We need to make the bits that differ be 0's */
19021                     ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19022 
19023                     /* The argument is the lowest code point */
19024                     ret = reganode(pRExC_state, op, lowest_cp);
19025                     FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19026                 }
19027             }
19028           done_anyofm:
19029 
19030             if (inverted) {
19031                 _invlist_invert(cp_list);
19032             }
19033 
19034             if (op != END) {
19035                 goto not_anyof;
19036             }
19037         }
19038 
19039         if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19040             PERL_UINT_FAST8_T type;
19041             SV * intersection = NULL;
19042             SV* d_invlist = NULL;
19043 
19044             /* See if this matches any of the POSIX classes.  The POSIXA and
19045              * POSIXD ones are about the same speed as ANYOF ops, but take less
19046              * room; the ones that have above-Latin1 code point matches are
19047              * somewhat faster than ANYOF.  */
19048 
19049             for (type = POSIXA; type >= POSIXD; type--) {
19050                 int posix_class;
19051 
19052                 if (type == POSIXL) {   /* But not /l posix classes */
19053                     continue;
19054                 }
19055 
19056                 for (posix_class = 0;
19057                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19058                      posix_class++)
19059                 {
19060                     SV** our_code_points = &cp_list;
19061                     SV** official_code_points;
19062                     int try_inverted;
19063 
19064                     if (type == POSIXA) {
19065                         official_code_points = &PL_Posix_ptrs[posix_class];
19066                     }
19067                     else {
19068                         official_code_points = &PL_XPosix_ptrs[posix_class];
19069                     }
19070 
19071                     /* Skip non-existent classes of this type.  e.g. \v only
19072                      * has an entry in PL_XPosix_ptrs */
19073                     if (! *official_code_points) {
19074                         continue;
19075                     }
19076 
19077                     /* Try both the regular class, and its inversion */
19078                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19079                         bool this_inverted = invert ^ try_inverted;
19080 
19081                         if (type != POSIXD) {
19082 
19083                             /* This class that isn't /d can't match if we have
19084                              * /d dependencies */
19085                             if (has_runtime_dependency
19086                                                     & HAS_D_RUNTIME_DEPENDENCY)
19087                             {
19088                                 continue;
19089                             }
19090                         }
19091                         else /* is /d */ if (! this_inverted) {
19092 
19093                             /* /d classes don't match anything non-ASCII below
19094                              * 256 unconditionally (which cp_list contains) */
19095                             _invlist_intersection(cp_list, PL_UpperLatin1,
19096                                                            &intersection);
19097                             if (_invlist_len(intersection) != 0) {
19098                                 continue;
19099                             }
19100 
19101                             SvREFCNT_dec(d_invlist);
19102                             d_invlist = invlist_clone(cp_list, NULL);
19103 
19104                             /* But under UTF-8 it turns into using /u rules.
19105                              * Add the things it matches under these conditions
19106                              * so that we check below that these are identical
19107                              * to what the tested class should match */
19108                             if (upper_latin1_only_utf8_matches) {
19109                                 _invlist_union(
19110                                             d_invlist,
19111                                             upper_latin1_only_utf8_matches,
19112                                             &d_invlist);
19113                             }
19114                             our_code_points = &d_invlist;
19115                         }
19116                         else {  /* POSIXD, inverted.  If this doesn't have this
19117                                    flag set, it isn't /d. */
19118                             if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19119                             {
19120                                 continue;
19121                             }
19122                             our_code_points = &cp_list;
19123                         }
19124 
19125                         /* Here, have weeded out some things.  We want to see
19126                          * if the list of characters this node contains
19127                          * ('*our_code_points') precisely matches those of the
19128                          * class we are currently checking against
19129                          * ('*official_code_points'). */
19130                         if (_invlistEQ(*our_code_points,
19131                                        *official_code_points,
19132                                        try_inverted))
19133                         {
19134                             /* Here, they precisely match.  Optimize this ANYOF
19135                              * node into its equivalent POSIX one of the
19136                              * correct type, possibly inverted */
19137                             ret = reg_node(pRExC_state, (try_inverted)
19138                                                         ? type + NPOSIXA
19139                                                                 - POSIXA
19140                                                         : type);
19141                             FLAGS(REGNODE_p(ret)) = posix_class;
19142                             SvREFCNT_dec(d_invlist);
19143                             SvREFCNT_dec(intersection);
19144                             goto not_anyof;
19145                         }
19146                     }
19147                 }
19148             }
19149             SvREFCNT_dec(d_invlist);
19150             SvREFCNT_dec(intersection);
19151         }
19152 
19153         /* If didn't find an optimization and there is no need for a
19154         * bitmap, optimize to indicate that */
19155         if (     start[0] >= NUM_ANYOF_CODE_POINTS
19156             && ! LOC
19157             && ! upper_latin1_only_utf8_matches
19158             &&   anyof_flags == 0)
19159         {
19160             UV highest_cp = invlist_highest(cp_list);
19161 
19162             /* If the lowest and highest code point in the class have the same
19163              * UTF-8 first byte, then all do, and we can store that byte for
19164              * regexec.c to use so that it can more quickly scan the target
19165              * string for potential matches for this class.  We co-opt the the
19166              * flags field for this.  Zero means, they don't have the same
19167              * first byte.  We do accept here very large code points (for
19168              * future use), but don't bother with this optimization for them,
19169              * as it would cause other complications */
19170             if (highest_cp > IV_MAX) {
19171                 anyof_flags = 0;
19172             }
19173             else {
19174                 U8 low_utf8[UTF8_MAXBYTES+1];
19175                 U8 high_utf8[UTF8_MAXBYTES+1];
19176 
19177                 (void) uvchr_to_utf8(low_utf8, start[0]);
19178                 (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list));
19179 
19180                 anyof_flags = (low_utf8[0] == high_utf8[0])
19181                             ? low_utf8[0]
19182                             : 0;
19183             }
19184 
19185             op = ANYOFH;
19186         }
19187     }   /* End of seeing if can optimize it into a different node */
19188 
19189   is_anyof: /* It's going to be an ANYOF node. */
19190     if (op != ANYOFH) {
19191         op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19192              ? ANYOFD
19193              : ((posixl)
19194                 ? ANYOFPOSIXL
19195                 : ((LOC)
19196                    ? ANYOFL
19197                    : ANYOF));
19198     }
19199 
19200     ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19201     FILL_NODE(ret, op);        /* We set the argument later */
19202     RExC_emit += 1 + regarglen[op];
19203     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19204 
19205     /* Here, <cp_list> contains all the code points we can determine at
19206      * compile time that match under all conditions.  Go through it, and
19207      * for things that belong in the bitmap, put them there, and delete from
19208      * <cp_list>.  While we are at it, see if everything above 255 is in the
19209      * list, and if so, set a flag to speed up execution */
19210 
19211     populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19212 
19213     if (posixl) {
19214         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19215     }
19216 
19217     if (invert) {
19218         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19219     }
19220 
19221     /* Here, the bitmap has been populated with all the Latin1 code points that
19222      * always match.  Can now add to the overall list those that match only
19223      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19224      * */
19225     if (upper_latin1_only_utf8_matches) {
19226 	if (cp_list) {
19227 	    _invlist_union(cp_list,
19228                            upper_latin1_only_utf8_matches,
19229                            &cp_list);
19230 	    SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19231 	}
19232 	else {
19233 	    cp_list = upper_latin1_only_utf8_matches;
19234 	}
19235         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19236     }
19237 
19238     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19239                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19240                    ? listsv : NULL,
19241                   only_utf8_locale_list);
19242     return ret;
19243 
19244   not_anyof:
19245 
19246     /* Here, the node is getting optimized into something that's not an ANYOF
19247      * one.  Finish up. */
19248 
19249     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19250                                            RExC_parse - orig_parse);;
19251     SvREFCNT_dec(cp_list);;
19252     return ret;
19253 }
19254 
19255 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19256 
19257 STATIC void
S_set_ANYOF_arg(pTHX_ RExC_state_t * const pRExC_state,regnode * const node,SV * const cp_list,SV * const runtime_defns,SV * const only_utf8_locale_list)19258 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19259                 regnode* const node,
19260                 SV* const cp_list,
19261                 SV* const runtime_defns,
19262                 SV* const only_utf8_locale_list)
19263 {
19264     /* Sets the arg field of an ANYOF-type node 'node', using information about
19265      * the node passed-in.  If there is nothing outside the node's bitmap, the
19266      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
19267      * the count returned by add_data(), having allocated and stored an array,
19268      * av, as follows:
19269      *
19270      *  av[0] stores the inversion list defining this class as far as known at
19271      *        this time, or PL_sv_undef if nothing definite is now known.
19272      *  av[1] stores the inversion list of code points that match only if the
19273      *        current locale is UTF-8, or if none, PL_sv_undef if there is an
19274      *        av[2], or no entry otherwise.
19275      *  av[2] stores the list of user-defined properties whose subroutine
19276      *        definitions aren't known at this time, or no entry if none. */
19277 
19278     UV n;
19279 
19280     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19281 
19282     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19283         assert(! (ANYOF_FLAGS(node)
19284                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19285 	ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19286     }
19287     else {
19288 	AV * const av = newAV();
19289 	SV *rv;
19290 
19291         if (cp_list) {
19292             av_store(av, INVLIST_INDEX, cp_list);
19293         }
19294 
19295         if (only_utf8_locale_list) {
19296             av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list);
19297         }
19298 
19299         if (runtime_defns) {
19300             av_store(av, DEFERRED_USER_DEFINED_INDEX, SvREFCNT_inc(runtime_defns));
19301         }
19302 
19303 	rv = newRV_noinc(MUTABLE_SV(av));
19304 	n = add_data(pRExC_state, STR_WITH_LEN("s"));
19305 	RExC_rxi->data->data[n] = (void*)rv;
19306 	ARG_SET(node, n);
19307     }
19308 }
19309 
19310 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
19311 SV *
Perl__get_regclass_nonbitmap_data(pTHX_ const regexp * prog,const regnode * node,bool doinit,SV ** listsvp,SV ** only_utf8_locale_ptr,SV ** output_invlist)19312 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
19313                                         const regnode* node,
19314                                         bool doinit,
19315                                         SV** listsvp,
19316                                         SV** only_utf8_locale_ptr,
19317                                         SV** output_invlist)
19318 
19319 {
19320     /* For internal core use only.
19321      * Returns the inversion list for the input 'node' in the regex 'prog'.
19322      * If <doinit> is 'true', will attempt to create the inversion list if not
19323      *    already done.
19324      * If <listsvp> is non-null, will return the printable contents of the
19325      *    property definition.  This can be used to get debugging information
19326      *    even before the inversion list exists, by calling this function with
19327      *    'doinit' set to false, in which case the components that will be used
19328      *    to eventually create the inversion list are returned  (in a printable
19329      *    form).
19330      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
19331      *    store an inversion list of code points that should match only if the
19332      *    execution-time locale is a UTF-8 one.
19333      * If <output_invlist> is not NULL, it is where this routine is to store an
19334      *    inversion list of the code points that would be instead returned in
19335      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
19336      *    when this parameter is used, is just the non-code point data that
19337      *    will go into creating the inversion list.  This currently should be just
19338      *    user-defined properties whose definitions were not known at compile
19339      *    time.  Using this parameter allows for easier manipulation of the
19340      *    inversion list's data by the caller.  It is illegal to call this
19341      *    function with this parameter set, but not <listsvp>
19342      *
19343      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
19344      * that, in spite of this function's name, the inversion list it returns
19345      * may include the bitmap data as well */
19346 
19347     SV *si  = NULL;         /* Input initialization string */
19348     SV* invlist = NULL;
19349 
19350     RXi_GET_DECL(prog, progi);
19351     const struct reg_data * const data = prog ? progi->data : NULL;
19352 
19353     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
19354     assert(! output_invlist || listsvp);
19355 
19356     if (data && data->count) {
19357 	const U32 n = ARG(node);
19358 
19359 	if (data->what[n] == 's') {
19360 	    SV * const rv = MUTABLE_SV(data->data[n]);
19361 	    AV * const av = MUTABLE_AV(SvRV(rv));
19362 	    SV **const ary = AvARRAY(av);
19363 
19364             invlist = ary[INVLIST_INDEX];
19365 
19366             if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
19367                 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
19368             }
19369 
19370             if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
19371                 si = ary[DEFERRED_USER_DEFINED_INDEX];
19372             }
19373 
19374 	    if (doinit && (si || invlist)) {
19375                 if (si) {
19376                     bool user_defined;
19377                     SV * msg = newSVpvs_flags("", SVs_TEMP);
19378 
19379                     SV * prop_definition = handle_user_defined_property(
19380                             "", 0, FALSE,   /* There is no \p{}, \P{} */
19381                             SvPVX_const(si)[1] - '0',   /* /i or not has been
19382                                                            stored here for just
19383                                                            this occasion */
19384                             TRUE,           /* run time */
19385                             FALSE,          /* This call must find the defn */
19386                             si,             /* The property definition  */
19387                             &user_defined,
19388                             msg,
19389                             0               /* base level call */
19390                            );
19391 
19392                     if (SvCUR(msg)) {
19393                         assert(prop_definition == NULL);
19394 
19395                         Perl_croak(aTHX_ "%" UTF8f,
19396                                 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
19397                     }
19398 
19399                     if (invlist) {
19400                         _invlist_union(invlist, prop_definition, &invlist);
19401                         SvREFCNT_dec_NN(prop_definition);
19402                     }
19403                     else {
19404                         invlist = prop_definition;
19405                     }
19406 
19407                     STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
19408                     STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
19409 
19410                     av_store(av, INVLIST_INDEX, invlist);
19411                     av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
19412                                  ? ONLY_LOCALE_MATCHES_INDEX:
19413                                  INVLIST_INDEX);
19414                     si = NULL;
19415                 }
19416 	    }
19417 	}
19418     }
19419 
19420     /* If requested, return a printable version of what this ANYOF node matches
19421      * */
19422     if (listsvp) {
19423 	SV* matches_string = NULL;
19424 
19425         /* This function can be called at compile-time, before everything gets
19426          * resolved, in which case we return the currently best available
19427          * information, which is the string that will eventually be used to do
19428          * that resolving, 'si' */
19429 	if (si) {
19430             /* Here, we only have 'si' (and possibly some passed-in data in
19431              * 'invlist', which is handled below)  If the caller only wants
19432              * 'si', use that.  */
19433             if (! output_invlist) {
19434                 matches_string = newSVsv(si);
19435             }
19436             else {
19437                 /* But if the caller wants an inversion list of the node, we
19438                  * need to parse 'si' and place as much as possible in the
19439                  * desired output inversion list, making 'matches_string' only
19440                  * contain the currently unresolvable things */
19441                 const char *si_string = SvPVX(si);
19442                 STRLEN remaining = SvCUR(si);
19443                 UV prev_cp = 0;
19444                 U8 count = 0;
19445 
19446                 /* Ignore everything before the first new-line */
19447                 while (*si_string != '\n' && remaining > 0) {
19448                     si_string++;
19449                     remaining--;
19450                 }
19451                 assert(remaining > 0);
19452 
19453                 si_string++;
19454                 remaining--;
19455 
19456                 while (remaining > 0) {
19457 
19458                     /* The data consists of just strings defining user-defined
19459                      * property names, but in prior incarnations, and perhaps
19460                      * somehow from pluggable regex engines, it could still
19461                      * hold hex code point definitions.  Each component of a
19462                      * range would be separated by a tab, and each range by a
19463                      * new-line.  If these are found, instead add them to the
19464                      * inversion list */
19465                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
19466                                      |PERL_SCAN_SILENT_NON_PORTABLE;
19467                     STRLEN len = remaining;
19468                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
19469 
19470                     /* If the hex decode routine found something, it should go
19471                      * up to the next \n */
19472                     if (   *(si_string + len) == '\n') {
19473                         if (count) {    /* 2nd code point on line */
19474                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
19475                         }
19476                         else {
19477                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
19478                         }
19479                         count = 0;
19480                         goto prepare_for_next_iteration;
19481                     }
19482 
19483                     /* If the hex decode was instead for the lower range limit,
19484                      * save it, and go parse the upper range limit */
19485                     if (*(si_string + len) == '\t') {
19486                         assert(count == 0);
19487 
19488                         prev_cp = cp;
19489                         count = 1;
19490                       prepare_for_next_iteration:
19491                         si_string += len + 1;
19492                         remaining -= len + 1;
19493                         continue;
19494                     }
19495 
19496                     /* Here, didn't find a legal hex number.  Just add it from
19497                      * here to the next \n */
19498 
19499                     remaining -= len;
19500                     while (*(si_string + len) != '\n' && remaining > 0) {
19501                         remaining--;
19502                         len++;
19503                     }
19504                     if (*(si_string + len) == '\n') {
19505                         len++;
19506                         remaining--;
19507                     }
19508                     if (matches_string) {
19509                         sv_catpvn(matches_string, si_string, len - 1);
19510                     }
19511                     else {
19512                         matches_string = newSVpvn(si_string, len - 1);
19513                     }
19514                     si_string += len;
19515                     sv_catpvs(matches_string, " ");
19516                 } /* end of loop through the text */
19517 
19518                 assert(matches_string);
19519                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
19520                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
19521                 }
19522             } /* end of has an 'si' */
19523 	}
19524 
19525         /* Add the stuff that's already known */
19526         if (invlist) {
19527 
19528             /* Again, if the caller doesn't want the output inversion list, put
19529              * everything in 'matches-string' */
19530             if (! output_invlist) {
19531                 if ( ! matches_string) {
19532                     matches_string = newSVpvs("\n");
19533                 }
19534                 sv_catsv(matches_string, invlist_contents(invlist,
19535                                                   TRUE /* traditional style */
19536                                                   ));
19537             }
19538             else if (! *output_invlist) {
19539                 *output_invlist = invlist_clone(invlist, NULL);
19540             }
19541             else {
19542                 _invlist_union(*output_invlist, invlist, output_invlist);
19543             }
19544         }
19545 
19546 	*listsvp = matches_string;
19547     }
19548 
19549     return invlist;
19550 }
19551 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
19552 
19553 /* reg_skipcomment()
19554 
19555    Absorbs an /x style # comment from the input stream,
19556    returning a pointer to the first character beyond the comment, or if the
19557    comment terminates the pattern without anything following it, this returns
19558    one past the final character of the pattern (in other words, RExC_end) and
19559    sets the REG_RUN_ON_COMMENT_SEEN flag.
19560 
19561    Note it's the callers responsibility to ensure that we are
19562    actually in /x mode
19563 
19564 */
19565 
19566 PERL_STATIC_INLINE char*
S_reg_skipcomment(RExC_state_t * pRExC_state,char * p)19567 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
19568 {
19569     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
19570 
19571     assert(*p == '#');
19572 
19573     while (p < RExC_end) {
19574         if (*(++p) == '\n') {
19575             return p+1;
19576         }
19577     }
19578 
19579     /* we ran off the end of the pattern without ending the comment, so we have
19580      * to add an \n when wrapping */
19581     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
19582     return p;
19583 }
19584 
19585 STATIC void
S_skip_to_be_ignored_text(pTHX_ RExC_state_t * pRExC_state,char ** p,const bool force_to_xmod)19586 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
19587                                 char ** p,
19588                                 const bool force_to_xmod
19589                          )
19590 {
19591     /* If the text at the current parse position '*p' is a '(?#...)' comment,
19592      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
19593      * is /x whitespace, advance '*p' so that on exit it points to the first
19594      * byte past all such white space and comments */
19595 
19596     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
19597 
19598     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
19599 
19600     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
19601 
19602     for (;;) {
19603 	if (RExC_end - (*p) >= 3
19604 	    && *(*p)     == '('
19605 	    && *(*p + 1) == '?'
19606 	    && *(*p + 2) == '#')
19607 	{
19608 	    while (*(*p) != ')') {
19609 		if ((*p) == RExC_end)
19610 		    FAIL("Sequence (?#... not terminated");
19611 		(*p)++;
19612 	    }
19613 	    (*p)++;
19614 	    continue;
19615 	}
19616 
19617 	if (use_xmod) {
19618             const char * save_p = *p;
19619             while ((*p) < RExC_end) {
19620                 STRLEN len;
19621                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
19622                     (*p) += len;
19623                 }
19624                 else if (*(*p) == '#') {
19625                     (*p) = reg_skipcomment(pRExC_state, (*p));
19626                 }
19627                 else {
19628                     break;
19629                 }
19630             }
19631             if (*p != save_p) {
19632                 continue;
19633             }
19634 	}
19635 
19636         break;
19637     }
19638 
19639     return;
19640 }
19641 
19642 /* nextchar()
19643 
19644    Advances the parse position by one byte, unless that byte is the beginning
19645    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
19646    those two cases, the parse position is advanced beyond all such comments and
19647    white space.
19648 
19649    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
19650 */
19651 
19652 STATIC void
S_nextchar(pTHX_ RExC_state_t * pRExC_state)19653 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
19654 {
19655     PERL_ARGS_ASSERT_NEXTCHAR;
19656 
19657     if (RExC_parse < RExC_end) {
19658         assert(   ! UTF
19659                || UTF8_IS_INVARIANT(*RExC_parse)
19660                || UTF8_IS_START(*RExC_parse));
19661 
19662         RExC_parse += (UTF)
19663                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
19664                       : 1;
19665 
19666         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
19667                                 FALSE /* Don't force /x */ );
19668     }
19669 }
19670 
19671 STATIC void
S_change_engine_size(pTHX_ RExC_state_t * pRExC_state,const Ptrdiff_t size)19672 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
19673 {
19674     /* 'size' is the delta to add or subtract from the current memory allocated
19675      * to the regex engine being constructed */
19676 
19677     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
19678 
19679     RExC_size += size;
19680 
19681     Renewc(RExC_rxi,
19682            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
19683                                                 /* +1 for REG_MAGIC */
19684            char,
19685            regexp_internal);
19686     if ( RExC_rxi == NULL )
19687 	FAIL("Regexp out of space");
19688     RXi_SET(RExC_rx, RExC_rxi);
19689 
19690     RExC_emit_start = RExC_rxi->program;
19691     if (size > 0) {
19692         Zero(REGNODE_p(RExC_emit), size, regnode);
19693     }
19694 
19695 #ifdef RE_TRACK_PATTERN_OFFSETS
19696     Renew(RExC_offsets, 2*RExC_size+1, U32);
19697     if (size > 0) {
19698         Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
19699     }
19700     RExC_offsets[0] = RExC_size;
19701 #endif
19702 }
19703 
19704 STATIC regnode_offset
S_regnode_guts(pTHX_ RExC_state_t * pRExC_state,const U8 op,const STRLEN extra_size,const char * const name)19705 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
19706 {
19707     /* Allocate a regnode for 'op', with 'extra_size' extra space.  It aligns
19708      * and increments RExC_size and RExC_emit
19709      *
19710      * It returns the regnode's offset into the regex engine program */
19711 
19712     const regnode_offset ret = RExC_emit;
19713 
19714     GET_RE_DEBUG_FLAGS_DECL;
19715 
19716     PERL_ARGS_ASSERT_REGNODE_GUTS;
19717 
19718     SIZE_ALIGN(RExC_size);
19719     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
19720     NODE_ALIGN_FILL(REGNODE_p(ret));
19721 #ifndef RE_TRACK_PATTERN_OFFSETS
19722     PERL_UNUSED_ARG(name);
19723     PERL_UNUSED_ARG(op);
19724 #else
19725     assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
19726 
19727     if (RExC_offsets) {         /* MJD */
19728 	MJD_OFFSET_DEBUG(
19729               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
19730               name, __LINE__,
19731               PL_reg_name[op],
19732               (UV)(RExC_emit) > RExC_offsets[0]
19733 		? "Overwriting end of array!\n" : "OK",
19734               (UV)(RExC_emit),
19735               (UV)(RExC_parse - RExC_start),
19736               (UV)RExC_offsets[0]));
19737 	Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
19738     }
19739 #endif
19740     return(ret);
19741 }
19742 
19743 /*
19744 - reg_node - emit a node
19745 */
19746 STATIC regnode_offset /* Location. */
S_reg_node(pTHX_ RExC_state_t * pRExC_state,U8 op)19747 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
19748 {
19749     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
19750     regnode_offset ptr = ret;
19751 
19752     PERL_ARGS_ASSERT_REG_NODE;
19753 
19754     assert(regarglen[op] == 0);
19755 
19756     FILL_ADVANCE_NODE(ptr, op);
19757     RExC_emit = ptr;
19758     return(ret);
19759 }
19760 
19761 /*
19762 - reganode - emit a node with an argument
19763 */
19764 STATIC regnode_offset /* Location. */
S_reganode(pTHX_ RExC_state_t * pRExC_state,U8 op,U32 arg)19765 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
19766 {
19767     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
19768     regnode_offset ptr = ret;
19769 
19770     PERL_ARGS_ASSERT_REGANODE;
19771 
19772     /* ANYOF are special cased to allow non-length 1 args */
19773     assert(regarglen[op] == 1);
19774 
19775     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
19776     RExC_emit = ptr;
19777     return(ret);
19778 }
19779 
19780 STATIC regnode_offset
S_reg2Lanode(pTHX_ RExC_state_t * pRExC_state,const U8 op,const U32 arg1,const I32 arg2)19781 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
19782 {
19783     /* emit a node with U32 and I32 arguments */
19784 
19785     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
19786     regnode_offset ptr = ret;
19787 
19788     PERL_ARGS_ASSERT_REG2LANODE;
19789 
19790     assert(regarglen[op] == 2);
19791 
19792     FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
19793     RExC_emit = ptr;
19794     return(ret);
19795 }
19796 
19797 /*
19798 - reginsert - insert an operator in front of already-emitted operand
19799 *
19800 * That means that on exit 'operand' is the offset of the newly inserted
19801 * operator, and the original operand has been relocated.
19802 *
19803 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
19804 * set up NEXT_OFF() of the inserted node if needed. Something like this:
19805 *
19806 *   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
19807 *   NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
19808 *
19809 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
19810 */
19811 STATIC void
S_reginsert(pTHX_ RExC_state_t * pRExC_state,const U8 op,const regnode_offset operand,const U32 depth)19812 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
19813                   const regnode_offset operand, const U32 depth)
19814 {
19815     regnode *src;
19816     regnode *dst;
19817     regnode *place;
19818     const int offset = regarglen[(U8)op];
19819     const int size = NODE_STEP_REGNODE + offset;
19820     GET_RE_DEBUG_FLAGS_DECL;
19821 
19822     PERL_ARGS_ASSERT_REGINSERT;
19823     PERL_UNUSED_CONTEXT;
19824     PERL_UNUSED_ARG(depth);
19825 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
19826     DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
19827     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
19828                                     studying. If this is wrong then we need to adjust RExC_recurse
19829                                     below like we do with RExC_open_parens/RExC_close_parens. */
19830     change_engine_size(pRExC_state, (Ptrdiff_t) size);
19831     src = REGNODE_p(RExC_emit);
19832     RExC_emit += size;
19833     dst = REGNODE_p(RExC_emit);
19834 
19835     /* If we are in a "count the parentheses" pass, the numbers are unreliable,
19836      * and [perl #133871] shows this can lead to problems, so skip this
19837      * realignment of parens until a later pass when they are reliable */
19838     if (! IN_PARENS_PASS && RExC_open_parens) {
19839         int paren;
19840         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
19841         /* remember that RExC_npar is rex->nparens + 1,
19842          * iow it is 1 more than the number of parens seen in
19843          * the pattern so far. */
19844         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
19845             /* note, RExC_open_parens[0] is the start of the
19846              * regex, it can't move. RExC_close_parens[0] is the end
19847              * of the regex, it *can* move. */
19848             if ( paren && RExC_open_parens[paren] >= operand ) {
19849                 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
19850                 RExC_open_parens[paren] += size;
19851             } else {
19852                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
19853             }
19854             if ( RExC_close_parens[paren] >= operand ) {
19855                 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
19856                 RExC_close_parens[paren] += size;
19857             } else {
19858                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
19859             }
19860         }
19861     }
19862     if (RExC_end_op)
19863         RExC_end_op += size;
19864 
19865     while (src > REGNODE_p(operand)) {
19866 	StructCopy(--src, --dst, regnode);
19867 #ifdef RE_TRACK_PATTERN_OFFSETS
19868         if (RExC_offsets) {     /* MJD 20010112 */
19869 	    MJD_OFFSET_DEBUG(
19870                  ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
19871                   "reginsert",
19872 		  __LINE__,
19873 		  PL_reg_name[op],
19874                   (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
19875 		    ? "Overwriting end of array!\n" : "OK",
19876                   (UV)REGNODE_OFFSET(src),
19877                   (UV)REGNODE_OFFSET(dst),
19878                   (UV)RExC_offsets[0]));
19879 	    Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
19880 	    Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
19881         }
19882 #endif
19883     }
19884 
19885     place = REGNODE_p(operand);	/* Op node, where operand used to be. */
19886 #ifdef RE_TRACK_PATTERN_OFFSETS
19887     if (RExC_offsets) {         /* MJD */
19888 	MJD_OFFSET_DEBUG(
19889               ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
19890               "reginsert",
19891 	      __LINE__,
19892 	      PL_reg_name[op],
19893               (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
19894               ? "Overwriting end of array!\n" : "OK",
19895               (UV)REGNODE_OFFSET(place),
19896               (UV)(RExC_parse - RExC_start),
19897               (UV)RExC_offsets[0]));
19898 	Set_Node_Offset(place, RExC_parse);
19899 	Set_Node_Length(place, 1);
19900     }
19901 #endif
19902     src = NEXTOPER(place);
19903     FLAGS(place) = 0;
19904     FILL_NODE(operand, op);
19905 
19906     /* Zero out any arguments in the new node */
19907     Zero(src, offset, regnode);
19908 }
19909 
19910 /*
19911 - regtail - set the next-pointer at the end of a node chain of p to val.  If
19912             that value won't fit in the space available, instead returns FALSE.
19913             (Except asserts if we can't fit in the largest space the regex
19914             engine is designed for.)
19915 - SEE ALSO: regtail_study
19916 */
19917 STATIC bool
S_regtail(pTHX_ RExC_state_t * pRExC_state,const regnode_offset p,const regnode_offset val,const U32 depth)19918 S_regtail(pTHX_ RExC_state_t * pRExC_state,
19919                 const regnode_offset p,
19920                 const regnode_offset val,
19921                 const U32 depth)
19922 {
19923     regnode_offset scan;
19924     GET_RE_DEBUG_FLAGS_DECL;
19925 
19926     PERL_ARGS_ASSERT_REGTAIL;
19927 #ifndef DEBUGGING
19928     PERL_UNUSED_ARG(depth);
19929 #endif
19930 
19931     /* Find last node. */
19932     scan = (regnode_offset) p;
19933     for (;;) {
19934 	regnode * const temp = regnext(REGNODE_p(scan));
19935         DEBUG_PARSE_r({
19936             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
19937             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
19938             Perl_re_printf( aTHX_  "~ %s (%d) %s %s\n",
19939                 SvPV_nolen_const(RExC_mysv), scan,
19940                     (temp == NULL ? "->" : ""),
19941                     (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
19942             );
19943         });
19944         if (temp == NULL)
19945             break;
19946         scan = REGNODE_OFFSET(temp);
19947     }
19948 
19949     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
19950         assert((UV) (val - scan) <= U32_MAX);
19951         ARG_SET(REGNODE_p(scan), val - scan);
19952     }
19953     else {
19954         if (val - scan > U16_MAX) {
19955             /* Populate this with something that won't loop and will likely
19956              * lead to a crash if the caller ignores the failure return, and
19957              * execution continues */
19958             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
19959             return FALSE;
19960         }
19961         NEXT_OFF(REGNODE_p(scan)) = val - scan;
19962     }
19963 
19964     return TRUE;
19965 }
19966 
19967 #ifdef DEBUGGING
19968 /*
19969 - regtail_study - set the next-pointer at the end of a node chain of p to val.
19970 - Look for optimizable sequences at the same time.
19971 - currently only looks for EXACT chains.
19972 
19973 This is experimental code. The idea is to use this routine to perform
19974 in place optimizations on branches and groups as they are constructed,
19975 with the long term intention of removing optimization from study_chunk so
19976 that it is purely analytical.
19977 
19978 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
19979 to control which is which.
19980 
19981 This used to return a value that was ignored.  It was a problem that it is
19982 #ifdef'd to be another function that didn't return a value.  khw has changed it
19983 so both currently return a pass/fail return.
19984 
19985 */
19986 /* TODO: All four parms should be const */
19987 
19988 STATIC bool
S_regtail_study(pTHX_ RExC_state_t * pRExC_state,regnode_offset p,const regnode_offset val,U32 depth)19989 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
19990                       const regnode_offset val, U32 depth)
19991 {
19992     regnode_offset scan;
19993     U8 exact = PSEUDO;
19994 #ifdef EXPERIMENTAL_INPLACESCAN
19995     I32 min = 0;
19996 #endif
19997     GET_RE_DEBUG_FLAGS_DECL;
19998 
19999     PERL_ARGS_ASSERT_REGTAIL_STUDY;
20000 
20001 
20002     /* Find last node. */
20003 
20004     scan = p;
20005     for (;;) {
20006         regnode * const temp = regnext(REGNODE_p(scan));
20007 #ifdef EXPERIMENTAL_INPLACESCAN
20008         if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20009 	    bool unfolded_multi_char;	/* Unexamined in this routine */
20010             if (join_exact(pRExC_state, scan, &min,
20011                            &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20012                 return TRUE; /* Was return EXACT */
20013 	}
20014 #endif
20015         if ( exact ) {
20016             switch (OP(REGNODE_p(scan))) {
20017                 case EXACT:
20018                 case EXACT_ONLY8:
20019                 case EXACTL:
20020                 case EXACTF:
20021                 case EXACTFU_S_EDGE:
20022                 case EXACTFAA_NO_TRIE:
20023                 case EXACTFAA:
20024                 case EXACTFU:
20025                 case EXACTFU_ONLY8:
20026                 case EXACTFLU8:
20027                 case EXACTFUP:
20028                 case EXACTFL:
20029                         if( exact == PSEUDO )
20030                             exact= OP(REGNODE_p(scan));
20031                         else if ( exact != OP(REGNODE_p(scan)) )
20032                             exact= 0;
20033                 case NOTHING:
20034                     break;
20035                 default:
20036                     exact= 0;
20037             }
20038         }
20039         DEBUG_PARSE_r({
20040             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20041             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20042             Perl_re_printf( aTHX_  "~ %s (%d) -> %s\n",
20043                 SvPV_nolen_const(RExC_mysv),
20044                 scan,
20045                 PL_reg_name[exact]);
20046         });
20047 	if (temp == NULL)
20048 	    break;
20049 	scan = REGNODE_OFFSET(temp);
20050     }
20051     DEBUG_PARSE_r({
20052         DEBUG_PARSE_MSG("");
20053         regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20054         Perl_re_printf( aTHX_
20055                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20056 		      SvPV_nolen_const(RExC_mysv),
20057 		      (IV)val,
20058 		      (IV)(val - scan)
20059         );
20060     });
20061     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20062         assert((UV) (val - scan) <= U32_MAX);
20063 	ARG_SET(REGNODE_p(scan), val - scan);
20064     }
20065     else {
20066         if (val - scan > U16_MAX) {
20067             /* Populate this with something that won't loop and will likely
20068              * lead to a crash if the caller ignores the failure return, and
20069              * execution continues */
20070             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20071             return FALSE;
20072         }
20073 	NEXT_OFF(REGNODE_p(scan)) = val - scan;
20074     }
20075 
20076     return TRUE; /* Was 'return exact' */
20077 }
20078 #endif
20079 
20080 STATIC SV*
S_get_ANYOFM_contents(pTHX_ const regnode * n)20081 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20082 
20083     /* Returns an inversion list of all the code points matched by the
20084      * ANYOFM/NANYOFM node 'n' */
20085 
20086     SV * cp_list = _new_invlist(-1);
20087     const U8 lowest = (U8) ARG(n);
20088     unsigned int i;
20089     U8 count = 0;
20090     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20091 
20092     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20093 
20094     /* Starting with the lowest code point, any code point that ANDed with the
20095      * mask yields the lowest code point is in the set */
20096     for (i = lowest; i <= 0xFF; i++) {
20097         if ((i & FLAGS(n)) == ARG(n)) {
20098             cp_list = add_cp_to_invlist(cp_list, i);
20099             count++;
20100 
20101             /* We know how many code points (a power of two) that are in the
20102              * set.  No use looking once we've got that number */
20103             if (count >= needed) break;
20104         }
20105     }
20106 
20107     if (OP(n) == NANYOFM) {
20108         _invlist_invert(cp_list);
20109     }
20110     return cp_list;
20111 }
20112 
20113 /*
20114  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20115  */
20116 #ifdef DEBUGGING
20117 
20118 static void
S_regdump_intflags(pTHX_ const char * lead,const U32 flags)20119 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20120 {
20121     int bit;
20122     int set=0;
20123 
20124     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20125 
20126     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20127         if (flags & (1<<bit)) {
20128             if (!set++ && lead)
20129                 Perl_re_printf( aTHX_  "%s", lead);
20130             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
20131         }
20132     }
20133     if (lead)  {
20134         if (set)
20135             Perl_re_printf( aTHX_  "\n");
20136         else
20137             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20138     }
20139 }
20140 
20141 static void
S_regdump_extflags(pTHX_ const char * lead,const U32 flags)20142 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20143 {
20144     int bit;
20145     int set=0;
20146     regex_charset cs;
20147 
20148     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20149 
20150     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20151         if (flags & (1<<bit)) {
20152 	    if ((1<<bit) & RXf_PMf_CHARSET) {	/* Output separately, below */
20153 		continue;
20154 	    }
20155             if (!set++ && lead)
20156                 Perl_re_printf( aTHX_  "%s", lead);
20157             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
20158         }
20159     }
20160     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20161             if (!set++ && lead) {
20162                 Perl_re_printf( aTHX_  "%s", lead);
20163             }
20164             switch (cs) {
20165                 case REGEX_UNICODE_CHARSET:
20166                     Perl_re_printf( aTHX_  "UNICODE");
20167                     break;
20168                 case REGEX_LOCALE_CHARSET:
20169                     Perl_re_printf( aTHX_  "LOCALE");
20170                     break;
20171                 case REGEX_ASCII_RESTRICTED_CHARSET:
20172                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
20173                     break;
20174                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20175                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
20176                     break;
20177                 default:
20178                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
20179                     break;
20180             }
20181     }
20182     if (lead)  {
20183         if (set)
20184             Perl_re_printf( aTHX_  "\n");
20185         else
20186             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20187     }
20188 }
20189 #endif
20190 
20191 void
Perl_regdump(pTHX_ const regexp * r)20192 Perl_regdump(pTHX_ const regexp *r)
20193 {
20194 #ifdef DEBUGGING
20195     int i;
20196     SV * const sv = sv_newmortal();
20197     SV *dsv= sv_newmortal();
20198     RXi_GET_DECL(r, ri);
20199     GET_RE_DEBUG_FLAGS_DECL;
20200 
20201     PERL_ARGS_ASSERT_REGDUMP;
20202 
20203     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20204 
20205     /* Header fields of interest. */
20206     for (i = 0; i < 2; i++) {
20207         if (r->substrs->data[i].substr) {
20208             RE_PV_QUOTED_DECL(s, 0, dsv,
20209                             SvPVX_const(r->substrs->data[i].substr),
20210                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
20211                             PL_dump_re_max_len);
20212             Perl_re_printf( aTHX_
20213                           "%s %s%s at %" IVdf "..%" UVuf " ",
20214                           i ? "floating" : "anchored",
20215                           s,
20216                           RE_SV_TAIL(r->substrs->data[i].substr),
20217                           (IV)r->substrs->data[i].min_offset,
20218                           (UV)r->substrs->data[i].max_offset);
20219         }
20220         else if (r->substrs->data[i].utf8_substr) {
20221             RE_PV_QUOTED_DECL(s, 1, dsv,
20222                             SvPVX_const(r->substrs->data[i].utf8_substr),
20223                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20224                             30);
20225             Perl_re_printf( aTHX_
20226                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20227                           i ? "floating" : "anchored",
20228                           s,
20229                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20230                           (IV)r->substrs->data[i].min_offset,
20231                           (UV)r->substrs->data[i].max_offset);
20232         }
20233     }
20234 
20235     if (r->check_substr || r->check_utf8)
20236         Perl_re_printf( aTHX_
20237 		      (const char *)
20238 		      (   r->check_substr == r->substrs->data[1].substr
20239 		       && r->check_utf8   == r->substrs->data[1].utf8_substr
20240 		       ? "(checking floating" : "(checking anchored"));
20241     if (r->intflags & PREGf_NOSCAN)
20242         Perl_re_printf( aTHX_  " noscan");
20243     if (r->extflags & RXf_CHECK_ALL)
20244         Perl_re_printf( aTHX_  " isall");
20245     if (r->check_substr || r->check_utf8)
20246         Perl_re_printf( aTHX_  ") ");
20247 
20248     if (ri->regstclass) {
20249         regprop(r, sv, ri->regstclass, NULL, NULL);
20250         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
20251     }
20252     if (r->intflags & PREGf_ANCH) {
20253         Perl_re_printf( aTHX_  "anchored");
20254         if (r->intflags & PREGf_ANCH_MBOL)
20255             Perl_re_printf( aTHX_  "(MBOL)");
20256         if (r->intflags & PREGf_ANCH_SBOL)
20257             Perl_re_printf( aTHX_  "(SBOL)");
20258         if (r->intflags & PREGf_ANCH_GPOS)
20259             Perl_re_printf( aTHX_  "(GPOS)");
20260         Perl_re_printf( aTHX_ " ");
20261     }
20262     if (r->intflags & PREGf_GPOS_SEEN)
20263         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
20264     if (r->intflags & PREGf_SKIP)
20265         Perl_re_printf( aTHX_  "plus ");
20266     if (r->intflags & PREGf_IMPLICIT)
20267         Perl_re_printf( aTHX_  "implicit ");
20268     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
20269     if (r->extflags & RXf_EVAL_SEEN)
20270         Perl_re_printf( aTHX_  "with eval ");
20271     Perl_re_printf( aTHX_  "\n");
20272     DEBUG_FLAGS_r({
20273         regdump_extflags("r->extflags: ", r->extflags);
20274         regdump_intflags("r->intflags: ", r->intflags);
20275     });
20276 #else
20277     PERL_ARGS_ASSERT_REGDUMP;
20278     PERL_UNUSED_CONTEXT;
20279     PERL_UNUSED_ARG(r);
20280 #endif	/* DEBUGGING */
20281 }
20282 
20283 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
20284 #ifdef DEBUGGING
20285 
20286 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
20287      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
20288      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
20289      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
20290      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
20291      || _CC_VERTSPACE != 15
20292 #   error Need to adjust order of anyofs[]
20293 #  endif
20294 static const char * const anyofs[] = {
20295     "\\w",
20296     "\\W",
20297     "\\d",
20298     "\\D",
20299     "[:alpha:]",
20300     "[:^alpha:]",
20301     "[:lower:]",
20302     "[:^lower:]",
20303     "[:upper:]",
20304     "[:^upper:]",
20305     "[:punct:]",
20306     "[:^punct:]",
20307     "[:print:]",
20308     "[:^print:]",
20309     "[:alnum:]",
20310     "[:^alnum:]",
20311     "[:graph:]",
20312     "[:^graph:]",
20313     "[:cased:]",
20314     "[:^cased:]",
20315     "\\s",
20316     "\\S",
20317     "[:blank:]",
20318     "[:^blank:]",
20319     "[:xdigit:]",
20320     "[:^xdigit:]",
20321     "[:cntrl:]",
20322     "[:^cntrl:]",
20323     "[:ascii:]",
20324     "[:^ascii:]",
20325     "\\v",
20326     "\\V"
20327 };
20328 #endif
20329 
20330 /*
20331 - regprop - printable representation of opcode, with run time support
20332 */
20333 
20334 void
Perl_regprop(pTHX_ const regexp * prog,SV * sv,const regnode * o,const regmatch_info * reginfo,const RExC_state_t * pRExC_state)20335 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
20336 {
20337 #ifdef DEBUGGING
20338     dVAR;
20339     int k;
20340     RXi_GET_DECL(prog, progi);
20341     GET_RE_DEBUG_FLAGS_DECL;
20342 
20343     PERL_ARGS_ASSERT_REGPROP;
20344 
20345     SvPVCLEAR(sv);
20346 
20347     if (OP(o) > REGNODE_MAX)		/* regnode.type is unsigned */
20348 	/* It would be nice to FAIL() here, but this may be called from
20349 	   regexec.c, and it would be hard to supply pRExC_state. */
20350 	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
20351                                               (int)OP(o), (int)REGNODE_MAX);
20352     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
20353 
20354     k = PL_regkind[OP(o)];
20355 
20356     if (k == EXACT) {
20357 	sv_catpvs(sv, " ");
20358 	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
20359 	 * is a crude hack but it may be the best for now since
20360 	 * we have no flag "this EXACTish node was UTF-8"
20361 	 * --jhi */
20362 	pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
20363                   PL_colors[0], PL_colors[1],
20364 		  PERL_PV_ESCAPE_UNI_DETECT |
20365 		  PERL_PV_ESCAPE_NONASCII   |
20366 		  PERL_PV_PRETTY_ELLIPSES   |
20367 		  PERL_PV_PRETTY_LTGT       |
20368 		  PERL_PV_PRETTY_NOCLEAR
20369 		  );
20370     } else if (k == TRIE) {
20371 	/* print the details of the trie in dumpuntil instead, as
20372 	 * progi->data isn't available here */
20373         const char op = OP(o);
20374         const U32 n = ARG(o);
20375         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
20376                (reg_ac_data *)progi->data->data[n] :
20377                NULL;
20378         const reg_trie_data * const trie
20379 	    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
20380 
20381         Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
20382         DEBUG_TRIE_COMPILE_r({
20383           if (trie->jump)
20384             sv_catpvs(sv, "(JUMP)");
20385           Perl_sv_catpvf(aTHX_ sv,
20386             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
20387             (UV)trie->startstate,
20388             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
20389             (UV)trie->wordcount,
20390             (UV)trie->minlen,
20391             (UV)trie->maxlen,
20392             (UV)TRIE_CHARCOUNT(trie),
20393             (UV)trie->uniquecharcount
20394           );
20395         });
20396         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
20397             sv_catpvs(sv, "[");
20398             (void) put_charclass_bitmap_innards(sv,
20399                                                 ((IS_ANYOF_TRIE(op))
20400                                                  ? ANYOF_BITMAP(o)
20401                                                  : TRIE_BITMAP(trie)),
20402                                                 NULL,
20403                                                 NULL,
20404                                                 NULL,
20405                                                 FALSE
20406                                                );
20407             sv_catpvs(sv, "]");
20408         }
20409     } else if (k == CURLY) {
20410         U32 lo = ARG1(o), hi = ARG2(o);
20411 	if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
20412 	    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
20413         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
20414         if (hi == REG_INFTY)
20415             sv_catpvs(sv, "INFTY");
20416         else
20417             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
20418         sv_catpvs(sv, "}");
20419     }
20420     else if (k == WHILEM && o->flags)			/* Ordinal/of */
20421 	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
20422     else if (k == REF || k == OPEN || k == CLOSE
20423              || k == GROUPP || OP(o)==ACCEPT)
20424     {
20425         AV *name_list= NULL;
20426         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
20427         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
20428 	if ( RXp_PAREN_NAMES(prog) ) {
20429             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20430         } else if ( pRExC_state ) {
20431             name_list= RExC_paren_name_list;
20432         }
20433         if (name_list) {
20434             if ( k != REF || (OP(o) < NREF)) {
20435                 SV **name= av_fetch(name_list, parno, 0 );
20436 	        if (name)
20437 	            Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20438             }
20439             else {
20440                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
20441                 I32 *nums=(I32*)SvPVX(sv_dat);
20442                 SV **name= av_fetch(name_list, nums[0], 0 );
20443                 I32 n;
20444                 if (name) {
20445                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
20446                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
20447 			   	    (n ? "," : ""), (IV)nums[n]);
20448                     }
20449                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20450                 }
20451             }
20452         }
20453         if ( k == REF && reginfo) {
20454             U32 n = ARG(o);  /* which paren pair */
20455             I32 ln = prog->offs[n].start;
20456             if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
20457                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
20458             else if (ln == prog->offs[n].end)
20459                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
20460             else {
20461                 const char *s = reginfo->strbeg + ln;
20462                 Perl_sv_catpvf(aTHX_ sv, ": ");
20463                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
20464                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
20465             }
20466         }
20467     } else if (k == GOSUB) {
20468         AV *name_list= NULL;
20469         if ( RXp_PAREN_NAMES(prog) ) {
20470             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20471         } else if ( pRExC_state ) {
20472             name_list= RExC_paren_name_list;
20473         }
20474 
20475         /* Paren and offset */
20476         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
20477                 (int)((o + (int)ARG2L(o)) - progi->program) );
20478         if (name_list) {
20479             SV **name= av_fetch(name_list, ARG(o), 0 );
20480             if (name)
20481                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20482         }
20483     }
20484     else if (k == LOGICAL)
20485         /* 2: embedded, otherwise 1 */
20486 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
20487     else if (k == ANYOF) {
20488 	const U8 flags = (OP(o) == ANYOFH) ? 0 : ANYOF_FLAGS(o);
20489         bool do_sep = FALSE;    /* Do we need to separate various components of
20490                                    the output? */
20491         /* Set if there is still an unresolved user-defined property */
20492         SV *unresolved                = NULL;
20493 
20494         /* Things that are ignored except when the runtime locale is UTF-8 */
20495         SV *only_utf8_locale_invlist = NULL;
20496 
20497         /* Code points that don't fit in the bitmap */
20498         SV *nonbitmap_invlist = NULL;
20499 
20500         /* And things that aren't in the bitmap, but are small enough to be */
20501         SV* bitmap_range_not_in_bitmap = NULL;
20502 
20503         const bool inverted = flags & ANYOF_INVERT;
20504 
20505 	if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
20506             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
20507                 sv_catpvs(sv, "{utf8-locale-reqd}");
20508             }
20509             if (flags & ANYOFL_FOLD) {
20510                 sv_catpvs(sv, "{i}");
20511             }
20512         }
20513 
20514         /* If there is stuff outside the bitmap, get it */
20515         if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
20516             (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
20517                                                 &unresolved,
20518                                                 &only_utf8_locale_invlist,
20519                                                 &nonbitmap_invlist);
20520             /* The non-bitmap data may contain stuff that could fit in the
20521              * bitmap.  This could come from a user-defined property being
20522              * finally resolved when this call was done; or much more likely
20523              * because there are matches that require UTF-8 to be valid, and so
20524              * aren't in the bitmap.  This is teased apart later */
20525             _invlist_intersection(nonbitmap_invlist,
20526                                   PL_InBitmap,
20527                                   &bitmap_range_not_in_bitmap);
20528             /* Leave just the things that don't fit into the bitmap */
20529             _invlist_subtract(nonbitmap_invlist,
20530                               PL_InBitmap,
20531                               &nonbitmap_invlist);
20532         }
20533 
20534         /* Obey this flag to add all above-the-bitmap code points */
20535         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
20536             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
20537                                                       NUM_ANYOF_CODE_POINTS,
20538                                                       UV_MAX);
20539         }
20540 
20541         /* Ready to start outputting.  First, the initial left bracket */
20542 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20543 
20544         if (OP(o) != ANYOFH) {
20545             /* Then all the things that could fit in the bitmap */
20546             do_sep = put_charclass_bitmap_innards(sv,
20547                                                   ANYOF_BITMAP(o),
20548                                                   bitmap_range_not_in_bitmap,
20549                                                   only_utf8_locale_invlist,
20550                                                   o,
20551 
20552                                                   /* Can't try inverting for a
20553                                                    * better display if there
20554                                                    * are things that haven't
20555                                                    * been resolved */
20556                                                   unresolved != NULL);
20557             SvREFCNT_dec(bitmap_range_not_in_bitmap);
20558 
20559             /* If there are user-defined properties which haven't been defined
20560              * yet, output them.  If the result is not to be inverted, it is
20561              * clearest to output them in a separate [] from the bitmap range
20562              * stuff.  If the result is to be complemented, we have to show
20563              * everything in one [], as the inversion applies to the whole
20564              * thing.  Use {braces} to separate them from anything in the
20565              * bitmap and anything above the bitmap. */
20566             if (unresolved) {
20567                 if (inverted) {
20568                     if (! do_sep) { /* If didn't output anything in the bitmap
20569                                      */
20570                         sv_catpvs(sv, "^");
20571                     }
20572                     sv_catpvs(sv, "{");
20573                 }
20574                 else if (do_sep) {
20575                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
20576                                                       PL_colors[0]);
20577                 }
20578                 sv_catsv(sv, unresolved);
20579                 if (inverted) {
20580                     sv_catpvs(sv, "}");
20581                 }
20582                 do_sep = ! inverted;
20583             }
20584         }
20585 
20586         /* And, finally, add the above-the-bitmap stuff */
20587         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
20588             SV* contents;
20589 
20590             /* See if truncation size is overridden */
20591             const STRLEN dump_len = (PL_dump_re_max_len > 256)
20592                                     ? PL_dump_re_max_len
20593                                     : 256;
20594 
20595             /* This is output in a separate [] */
20596             if (do_sep) {
20597                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
20598             }
20599 
20600             /* And, for easy of understanding, it is shown in the
20601              * uncomplemented form if possible.  The one exception being if
20602              * there are unresolved items, where the inversion has to be
20603              * delayed until runtime */
20604             if (inverted && ! unresolved) {
20605                 _invlist_invert(nonbitmap_invlist);
20606                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
20607             }
20608 
20609             contents = invlist_contents(nonbitmap_invlist,
20610                                         FALSE /* output suitable for catsv */
20611                                        );
20612 
20613             /* If the output is shorter than the permissible maximum, just do it. */
20614             if (SvCUR(contents) <= dump_len) {
20615                 sv_catsv(sv, contents);
20616             }
20617             else {
20618                 const char * contents_string = SvPVX(contents);
20619                 STRLEN i = dump_len;
20620 
20621                 /* Otherwise, start at the permissible max and work back to the
20622                  * first break possibility */
20623                 while (i > 0 && contents_string[i] != ' ') {
20624                     i--;
20625                 }
20626                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
20627                                        find a legal break */
20628                     i = dump_len;
20629                 }
20630 
20631                 sv_catpvn(sv, contents_string, i);
20632                 sv_catpvs(sv, "...");
20633             }
20634 
20635             SvREFCNT_dec_NN(contents);
20636             SvREFCNT_dec_NN(nonbitmap_invlist);
20637         }
20638 
20639         /* And finally the matching, closing ']' */
20640 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
20641 
20642         if (OP(o) == ANYOFH && FLAGS(o) != 0) {
20643             Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o));
20644         }
20645 
20646 
20647         SvREFCNT_dec(unresolved);
20648     }
20649     else if (k == ANYOFM) {
20650         SV * cp_list = get_ANYOFM_contents(o);
20651 
20652 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20653         if (OP(o) == NANYOFM) {
20654             _invlist_invert(cp_list);
20655         }
20656 
20657         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
20658 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
20659 
20660         SvREFCNT_dec(cp_list);
20661     }
20662     else if (k == POSIXD || k == NPOSIXD) {
20663         U8 index = FLAGS(o) * 2;
20664         if (index < C_ARRAY_LENGTH(anyofs)) {
20665             if (*anyofs[index] != '[')  {
20666                 sv_catpvs(sv, "[");
20667             }
20668             sv_catpv(sv, anyofs[index]);
20669             if (*anyofs[index] != '[')  {
20670                 sv_catpvs(sv, "]");
20671             }
20672         }
20673         else {
20674             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
20675         }
20676     }
20677     else if (k == BOUND || k == NBOUND) {
20678         /* Must be synced with order of 'bound_type' in regcomp.h */
20679         const char * const bounds[] = {
20680             "",      /* Traditional */
20681             "{gcb}",
20682             "{lb}",
20683             "{sb}",
20684             "{wb}"
20685         };
20686         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
20687         sv_catpv(sv, bounds[FLAGS(o)]);
20688     }
20689     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
20690 	Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
20691         if (o->next_off) {
20692             Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
20693         }
20694 	Perl_sv_catpvf(aTHX_ sv, "]");
20695     }
20696     else if (OP(o) == SBOL)
20697         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
20698 
20699     /* add on the verb argument if there is one */
20700     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
20701         if ( ARG(o) )
20702             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
20703                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
20704         else
20705             sv_catpvs(sv, ":NULL");
20706     }
20707 #else
20708     PERL_UNUSED_CONTEXT;
20709     PERL_UNUSED_ARG(sv);
20710     PERL_UNUSED_ARG(o);
20711     PERL_UNUSED_ARG(prog);
20712     PERL_UNUSED_ARG(reginfo);
20713     PERL_UNUSED_ARG(pRExC_state);
20714 #endif	/* DEBUGGING */
20715 }
20716 
20717 
20718 
20719 SV *
Perl_re_intuit_string(pTHX_ REGEXP * const r)20720 Perl_re_intuit_string(pTHX_ REGEXP * const r)
20721 {				/* Assume that RE_INTUIT is set */
20722     struct regexp *const prog = ReANY(r);
20723     GET_RE_DEBUG_FLAGS_DECL;
20724 
20725     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
20726     PERL_UNUSED_CONTEXT;
20727 
20728     DEBUG_COMPILE_r(
20729 	{
20730 	    const char * const s = SvPV_nolen_const(RX_UTF8(r)
20731 		      ? prog->check_utf8 : prog->check_substr);
20732 
20733 	    if (!PL_colorset) reginitcolors();
20734             Perl_re_printf( aTHX_
20735 		      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
20736 		      PL_colors[4],
20737 		      RX_UTF8(r) ? "utf8 " : "",
20738 		      PL_colors[5], PL_colors[0],
20739 		      s,
20740 		      PL_colors[1],
20741 		      (strlen(s) > PL_dump_re_max_len ? "..." : ""));
20742 	} );
20743 
20744     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
20745     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
20746 }
20747 
20748 /*
20749    pregfree()
20750 
20751    handles refcounting and freeing the perl core regexp structure. When
20752    it is necessary to actually free the structure the first thing it
20753    does is call the 'free' method of the regexp_engine associated to
20754    the regexp, allowing the handling of the void *pprivate; member
20755    first. (This routine is not overridable by extensions, which is why
20756    the extensions free is called first.)
20757 
20758    See regdupe and regdupe_internal if you change anything here.
20759 */
20760 #ifndef PERL_IN_XSUB_RE
20761 void
Perl_pregfree(pTHX_ REGEXP * r)20762 Perl_pregfree(pTHX_ REGEXP *r)
20763 {
20764     SvREFCNT_dec(r);
20765 }
20766 
20767 void
Perl_pregfree2(pTHX_ REGEXP * rx)20768 Perl_pregfree2(pTHX_ REGEXP *rx)
20769 {
20770     struct regexp *const r = ReANY(rx);
20771     GET_RE_DEBUG_FLAGS_DECL;
20772 
20773     PERL_ARGS_ASSERT_PREGFREE2;
20774 
20775     if (! r)
20776         return;
20777 
20778     if (r->mother_re) {
20779         ReREFCNT_dec(r->mother_re);
20780     } else {
20781         CALLREGFREE_PVT(rx); /* free the private data */
20782         SvREFCNT_dec(RXp_PAREN_NAMES(r));
20783     }
20784     if (r->substrs) {
20785         int i;
20786         for (i = 0; i < 2; i++) {
20787             SvREFCNT_dec(r->substrs->data[i].substr);
20788             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
20789         }
20790 	Safefree(r->substrs);
20791     }
20792     RX_MATCH_COPY_FREE(rx);
20793 #ifdef PERL_ANY_COW
20794     SvREFCNT_dec(r->saved_copy);
20795 #endif
20796     Safefree(r->offs);
20797     SvREFCNT_dec(r->qr_anoncv);
20798     if (r->recurse_locinput)
20799         Safefree(r->recurse_locinput);
20800 }
20801 
20802 
20803 /*  reg_temp_copy()
20804 
20805     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
20806     except that dsv will be created if NULL.
20807 
20808     This function is used in two main ways. First to implement
20809         $r = qr/....; $s = $$r;
20810 
20811     Secondly, it is used as a hacky workaround to the structural issue of
20812     match results
20813     being stored in the regexp structure which is in turn stored in
20814     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
20815     could be PL_curpm in multiple contexts, and could require multiple
20816     result sets being associated with the pattern simultaneously, such
20817     as when doing a recursive match with (??{$qr})
20818 
20819     The solution is to make a lightweight copy of the regexp structure
20820     when a qr// is returned from the code executed by (??{$qr}) this
20821     lightweight copy doesn't actually own any of its data except for
20822     the starp/end and the actual regexp structure itself.
20823 
20824 */
20825 
20826 
20827 REGEXP *
Perl_reg_temp_copy(pTHX_ REGEXP * dsv,REGEXP * ssv)20828 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
20829 {
20830     struct regexp *drx;
20831     struct regexp *const srx = ReANY(ssv);
20832     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
20833 
20834     PERL_ARGS_ASSERT_REG_TEMP_COPY;
20835 
20836     if (!dsv)
20837 	dsv = (REGEXP*) newSV_type(SVt_REGEXP);
20838     else {
20839         assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
20840 
20841         /* our only valid caller, sv_setsv_flags(), should have done
20842          * a SV_CHECK_THINKFIRST_COW_DROP() by now */
20843         assert(!SvOOK(dsv));
20844         assert(!SvIsCOW(dsv));
20845         assert(!SvROK(dsv));
20846 
20847         if (SvPVX_const(dsv)) {
20848             if (SvLEN(dsv))
20849                 Safefree(SvPVX(dsv));
20850             SvPVX(dsv) = NULL;
20851         }
20852         SvLEN_set(dsv, 0);
20853         SvCUR_set(dsv, 0);
20854 	SvOK_off((SV *)dsv);
20855 
20856 	if (islv) {
20857 	    /* For PVLVs, the head (sv_any) points to an XPVLV, while
20858              * the LV's xpvlenu_rx will point to a regexp body, which
20859              * we allocate here */
20860 	    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
20861 	    assert(!SvPVX(dsv));
20862             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
20863 	    temp->sv_any = NULL;
20864 	    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
20865 	    SvREFCNT_dec_NN(temp);
20866 	    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
20867 	       ing below will not set it. */
20868 	    SvCUR_set(dsv, SvCUR(ssv));
20869 	}
20870     }
20871     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
20872        sv_force_normal(sv) is called.  */
20873     SvFAKE_on(dsv);
20874     drx = ReANY(dsv);
20875 
20876     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
20877     SvPV_set(dsv, RX_WRAPPED(ssv));
20878     /* We share the same string buffer as the original regexp, on which we
20879        hold a reference count, incremented when mother_re is set below.
20880        The string pointer is copied here, being part of the regexp struct.
20881      */
20882     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
20883 	   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
20884     if (!islv)
20885         SvLEN_set(dsv, 0);
20886     if (srx->offs) {
20887         const I32 npar = srx->nparens+1;
20888         Newx(drx->offs, npar, regexp_paren_pair);
20889         Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
20890     }
20891     if (srx->substrs) {
20892         int i;
20893         Newx(drx->substrs, 1, struct reg_substr_data);
20894 	StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
20895 
20896         for (i = 0; i < 2; i++) {
20897             SvREFCNT_inc_void(drx->substrs->data[i].substr);
20898             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
20899         }
20900 
20901 	/* check_substr and check_utf8, if non-NULL, point to either their
20902 	   anchored or float namesakes, and don't hold a second reference.  */
20903     }
20904     RX_MATCH_COPIED_off(dsv);
20905 #ifdef PERL_ANY_COW
20906     drx->saved_copy = NULL;
20907 #endif
20908     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
20909     SvREFCNT_inc_void(drx->qr_anoncv);
20910     if (srx->recurse_locinput)
20911         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
20912 
20913     return dsv;
20914 }
20915 #endif
20916 
20917 
20918 /* regfree_internal()
20919 
20920    Free the private data in a regexp. This is overloadable by
20921    extensions. Perl takes care of the regexp structure in pregfree(),
20922    this covers the *pprivate pointer which technically perl doesn't
20923    know about, however of course we have to handle the
20924    regexp_internal structure when no extension is in use.
20925 
20926    Note this is called before freeing anything in the regexp
20927    structure.
20928  */
20929 
20930 void
Perl_regfree_internal(pTHX_ REGEXP * const rx)20931 Perl_regfree_internal(pTHX_ REGEXP * const rx)
20932 {
20933     struct regexp *const r = ReANY(rx);
20934     RXi_GET_DECL(r, ri);
20935     GET_RE_DEBUG_FLAGS_DECL;
20936 
20937     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
20938 
20939     if (! ri) {
20940         return;
20941     }
20942 
20943     DEBUG_COMPILE_r({
20944 	if (!PL_colorset)
20945 	    reginitcolors();
20946 	{
20947 	    SV *dsv= sv_newmortal();
20948             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
20949                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
20950             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
20951                 PL_colors[4], PL_colors[5], s);
20952         }
20953     });
20954 
20955 #ifdef RE_TRACK_PATTERN_OFFSETS
20956     if (ri->u.offsets)
20957         Safefree(ri->u.offsets);             /* 20010421 MJD */
20958 #endif
20959     if (ri->code_blocks)
20960         S_free_codeblocks(aTHX_ ri->code_blocks);
20961 
20962     if (ri->data) {
20963 	int n = ri->data->count;
20964 
20965 	while (--n >= 0) {
20966           /* If you add a ->what type here, update the comment in regcomp.h */
20967 	    switch (ri->data->what[n]) {
20968 	    case 'a':
20969 	    case 'r':
20970 	    case 's':
20971 	    case 'S':
20972 	    case 'u':
20973 		SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
20974 		break;
20975 	    case 'f':
20976 		Safefree(ri->data->data[n]);
20977 		break;
20978 	    case 'l':
20979 	    case 'L':
20980 	        break;
20981             case 'T':
20982                 { /* Aho Corasick add-on structure for a trie node.
20983                      Used in stclass optimization only */
20984                     U32 refcount;
20985                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
20986 #ifdef USE_ITHREADS
20987                     dVAR;
20988 #endif
20989                     OP_REFCNT_LOCK;
20990                     refcount = --aho->refcount;
20991                     OP_REFCNT_UNLOCK;
20992                     if ( !refcount ) {
20993                         PerlMemShared_free(aho->states);
20994                         PerlMemShared_free(aho->fail);
20995 			 /* do this last!!!! */
20996                         PerlMemShared_free(ri->data->data[n]);
20997                         /* we should only ever get called once, so
20998                          * assert as much, and also guard the free
20999                          * which /might/ happen twice. At the least
21000                          * it will make code anlyzers happy and it
21001                          * doesn't cost much. - Yves */
21002                         assert(ri->regstclass);
21003                         if (ri->regstclass) {
21004                             PerlMemShared_free(ri->regstclass);
21005                             ri->regstclass = 0;
21006                         }
21007                     }
21008                 }
21009                 break;
21010 	    case 't':
21011 	        {
21012 	            /* trie structure. */
21013 	            U32 refcount;
21014 	            reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21015 #ifdef USE_ITHREADS
21016                     dVAR;
21017 #endif
21018                     OP_REFCNT_LOCK;
21019                     refcount = --trie->refcount;
21020                     OP_REFCNT_UNLOCK;
21021                     if ( !refcount ) {
21022                         PerlMemShared_free(trie->charmap);
21023                         PerlMemShared_free(trie->states);
21024                         PerlMemShared_free(trie->trans);
21025                         if (trie->bitmap)
21026                             PerlMemShared_free(trie->bitmap);
21027                         if (trie->jump)
21028                             PerlMemShared_free(trie->jump);
21029 			PerlMemShared_free(trie->wordinfo);
21030                         /* do this last!!!! */
21031                         PerlMemShared_free(ri->data->data[n]);
21032 		    }
21033 		}
21034 		break;
21035 	    default:
21036 		Perl_croak(aTHX_ "panic: regfree data code '%c'",
21037                                                     ri->data->what[n]);
21038 	    }
21039 	}
21040 	Safefree(ri->data->what);
21041 	Safefree(ri->data);
21042     }
21043 
21044     Safefree(ri);
21045 }
21046 
21047 #define av_dup_inc(s, t)	MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21048 #define hv_dup_inc(s, t)	MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21049 #define SAVEPVN(p, n)	((p) ? savepvn(p, n) : NULL)
21050 
21051 /*
21052    re_dup_guts - duplicate a regexp.
21053 
21054    This routine is expected to clone a given regexp structure. It is only
21055    compiled under USE_ITHREADS.
21056 
21057    After all of the core data stored in struct regexp is duplicated
21058    the regexp_engine.dupe method is used to copy any private data
21059    stored in the *pprivate pointer. This allows extensions to handle
21060    any duplication it needs to do.
21061 
21062    See pregfree() and regfree_internal() if you change anything here.
21063 */
21064 #if defined(USE_ITHREADS)
21065 #ifndef PERL_IN_XSUB_RE
21066 void
Perl_re_dup_guts(pTHX_ const REGEXP * sstr,REGEXP * dstr,CLONE_PARAMS * param)21067 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21068 {
21069     dVAR;
21070     I32 npar;
21071     const struct regexp *r = ReANY(sstr);
21072     struct regexp *ret = ReANY(dstr);
21073 
21074     PERL_ARGS_ASSERT_RE_DUP_GUTS;
21075 
21076     npar = r->nparens+1;
21077     Newx(ret->offs, npar, regexp_paren_pair);
21078     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21079 
21080     if (ret->substrs) {
21081 	/* Do it this way to avoid reading from *r after the StructCopy().
21082 	   That way, if any of the sv_dup_inc()s dislodge *r from the L1
21083 	   cache, it doesn't matter.  */
21084         int i;
21085 	const bool anchored = r->check_substr
21086 	    ? r->check_substr == r->substrs->data[0].substr
21087 	    : r->check_utf8   == r->substrs->data[0].utf8_substr;
21088         Newx(ret->substrs, 1, struct reg_substr_data);
21089 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21090 
21091         for (i = 0; i < 2; i++) {
21092             ret->substrs->data[i].substr =
21093                         sv_dup_inc(ret->substrs->data[i].substr, param);
21094             ret->substrs->data[i].utf8_substr =
21095                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21096         }
21097 
21098 	/* check_substr and check_utf8, if non-NULL, point to either their
21099 	   anchored or float namesakes, and don't hold a second reference.  */
21100 
21101 	if (ret->check_substr) {
21102 	    if (anchored) {
21103 		assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21104 
21105 		ret->check_substr = ret->substrs->data[0].substr;
21106 		ret->check_utf8   = ret->substrs->data[0].utf8_substr;
21107 	    } else {
21108 		assert(r->check_substr == r->substrs->data[1].substr);
21109 		assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
21110 
21111 		ret->check_substr = ret->substrs->data[1].substr;
21112 		ret->check_utf8   = ret->substrs->data[1].utf8_substr;
21113 	    }
21114 	} else if (ret->check_utf8) {
21115 	    if (anchored) {
21116 		ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21117 	    } else {
21118 		ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21119 	    }
21120 	}
21121     }
21122 
21123     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21124     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21125     if (r->recurse_locinput)
21126         Newx(ret->recurse_locinput, r->nparens + 1, char *);
21127 
21128     if (ret->pprivate)
21129 	RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21130 
21131     if (RX_MATCH_COPIED(dstr))
21132 	ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
21133     else
21134 	ret->subbeg = NULL;
21135 #ifdef PERL_ANY_COW
21136     ret->saved_copy = NULL;
21137 #endif
21138 
21139     /* Whether mother_re be set or no, we need to copy the string.  We
21140        cannot refrain from copying it when the storage points directly to
21141        our mother regexp, because that's
21142 	       1: a buffer in a different thread
21143 	       2: something we no longer hold a reference on
21144 	       so we need to copy it locally.  */
21145     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21146     /* set malloced length to a non-zero value so it will be freed
21147      * (otherwise in combination with SVf_FAKE it looks like an alien
21148      * buffer). It doesn't have to be the actual malloced size, since it
21149      * should never be grown */
21150     SvLEN_set(dstr, SvCUR(sstr)+1);
21151     ret->mother_re   = NULL;
21152 }
21153 #endif /* PERL_IN_XSUB_RE */
21154 
21155 /*
21156    regdupe_internal()
21157 
21158    This is the internal complement to regdupe() which is used to copy
21159    the structure pointed to by the *pprivate pointer in the regexp.
21160    This is the core version of the extension overridable cloning hook.
21161    The regexp structure being duplicated will be copied by perl prior
21162    to this and will be provided as the regexp *r argument, however
21163    with the /old/ structures pprivate pointer value. Thus this routine
21164    may override any copying normally done by perl.
21165 
21166    It returns a pointer to the new regexp_internal structure.
21167 */
21168 
21169 void *
Perl_regdupe_internal(pTHX_ REGEXP * const rx,CLONE_PARAMS * param)21170 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21171 {
21172     dVAR;
21173     struct regexp *const r = ReANY(rx);
21174     regexp_internal *reti;
21175     int len;
21176     RXi_GET_DECL(r, ri);
21177 
21178     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21179 
21180     len = ProgLen(ri);
21181 
21182     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
21183           char, regexp_internal);
21184     Copy(ri->program, reti->program, len+1, regnode);
21185 
21186 
21187     if (ri->code_blocks) {
21188 	int n;
21189 	Newx(reti->code_blocks, 1, struct reg_code_blocks);
21190 	Newx(reti->code_blocks->cb, ri->code_blocks->count,
21191                     struct reg_code_block);
21192 	Copy(ri->code_blocks->cb, reti->code_blocks->cb,
21193              ri->code_blocks->count, struct reg_code_block);
21194 	for (n = 0; n < ri->code_blocks->count; n++)
21195 	     reti->code_blocks->cb[n].src_regex = (REGEXP*)
21196 		    sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
21197         reti->code_blocks->count = ri->code_blocks->count;
21198         reti->code_blocks->refcnt = 1;
21199     }
21200     else
21201 	reti->code_blocks = NULL;
21202 
21203     reti->regstclass = NULL;
21204 
21205     if (ri->data) {
21206 	struct reg_data *d;
21207         const int count = ri->data->count;
21208 	int i;
21209 
21210 	Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
21211 		char, struct reg_data);
21212 	Newx(d->what, count, U8);
21213 
21214 	d->count = count;
21215 	for (i = 0; i < count; i++) {
21216 	    d->what[i] = ri->data->what[i];
21217 	    switch (d->what[i]) {
21218 	        /* see also regcomp.h and regfree_internal() */
21219             case 'a': /* actually an AV, but the dup function is identical.
21220                          values seem to be "plain sv's" generally. */
21221             case 'r': /* a compiled regex (but still just another SV) */
21222             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
21223                          this use case should go away, the code could have used
21224                          'a' instead - see S_set_ANYOF_arg() for array contents. */
21225             case 'S': /* actually an SV, but the dup function is identical.  */
21226             case 'u': /* actually an HV, but the dup function is identical.
21227                          values are "plain sv's" */
21228 		d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
21229 		break;
21230 	    case 'f':
21231                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
21232                  * patterns which could start with several different things. Pre-TRIE
21233                  * this was more important than it is now, however this still helps
21234                  * in some places, for instance /x?a+/ might produce a SSC equivalent
21235                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
21236                  * in regexec.c
21237                  */
21238 		/* This is cheating. */
21239 		Newx(d->data[i], 1, regnode_ssc);
21240 		StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
21241 		reti->regstclass = (regnode*)d->data[i];
21242 		break;
21243 	    case 'T':
21244                 /* AHO-CORASICK fail table */
21245                 /* Trie stclasses are readonly and can thus be shared
21246 		 * without duplication. We free the stclass in pregfree
21247 		 * when the corresponding reg_ac_data struct is freed.
21248 		 */
21249 		reti->regstclass= ri->regstclass;
21250 		/* FALLTHROUGH */
21251 	    case 't':
21252                 /* TRIE transition table */
21253 		OP_REFCNT_LOCK;
21254 		((reg_trie_data*)ri->data->data[i])->refcount++;
21255 		OP_REFCNT_UNLOCK;
21256 		/* FALLTHROUGH */
21257             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
21258             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
21259                          is not from another regexp */
21260 		d->data[i] = ri->data->data[i];
21261 		break;
21262             default:
21263                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
21264                                                            ri->data->what[i]);
21265 	    }
21266 	}
21267 
21268 	reti->data = d;
21269     }
21270     else
21271 	reti->data = NULL;
21272 
21273     reti->name_list_idx = ri->name_list_idx;
21274 
21275 #ifdef RE_TRACK_PATTERN_OFFSETS
21276     if (ri->u.offsets) {
21277         Newx(reti->u.offsets, 2*len+1, U32);
21278         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
21279     }
21280 #else
21281     SetProgLen(reti, len);
21282 #endif
21283 
21284     return (void*)reti;
21285 }
21286 
21287 #endif    /* USE_ITHREADS */
21288 
21289 #ifndef PERL_IN_XSUB_RE
21290 
21291 /*
21292  - regnext - dig the "next" pointer out of a node
21293  */
21294 regnode *
Perl_regnext(pTHX_ regnode * p)21295 Perl_regnext(pTHX_ regnode *p)
21296 {
21297     I32 offset;
21298 
21299     if (!p)
21300 	return(NULL);
21301 
21302     if (OP(p) > REGNODE_MAX) {		/* regnode.type is unsigned */
21303 	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
21304                                                 (int)OP(p), (int)REGNODE_MAX);
21305     }
21306 
21307     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
21308     if (offset == 0)
21309 	return(NULL);
21310 
21311     return(p+offset);
21312 }
21313 
21314 #endif
21315 
21316 STATIC void
S_re_croak2(pTHX_ bool utf8,const char * pat1,const char * pat2,...)21317 S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...)
21318 {
21319     va_list args;
21320     STRLEN l1 = strlen(pat1);
21321     STRLEN l2 = strlen(pat2);
21322     char buf[512];
21323     SV *msv;
21324     const char *message;
21325 
21326     PERL_ARGS_ASSERT_RE_CROAK2;
21327 
21328     if (l1 > 510)
21329 	l1 = 510;
21330     if (l1 + l2 > 510)
21331 	l2 = 510 - l1;
21332     Copy(pat1, buf, l1 , char);
21333     Copy(pat2, buf + l1, l2 , char);
21334     buf[l1 + l2] = '\n';
21335     buf[l1 + l2 + 1] = '\0';
21336     va_start(args, pat2);
21337     msv = vmess(buf, &args);
21338     va_end(args);
21339     message = SvPV_const(msv, l1);
21340     if (l1 > 512)
21341 	l1 = 512;
21342     Copy(message, buf, l1 , char);
21343     /* l1-1 to avoid \n */
21344     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
21345 }
21346 
21347 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
21348 
21349 #ifndef PERL_IN_XSUB_RE
21350 void
Perl_save_re_context(pTHX)21351 Perl_save_re_context(pTHX)
21352 {
21353     I32 nparens = -1;
21354     I32 i;
21355 
21356     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
21357 
21358     if (PL_curpm) {
21359 	const REGEXP * const rx = PM_GETRE(PL_curpm);
21360 	if (rx)
21361             nparens = RX_NPARENS(rx);
21362     }
21363 
21364     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
21365      * that PL_curpm will be null, but that utf8.pm and the modules it
21366      * loads will only use $1..$3.
21367      * The t/porting/re_context.t test file checks this assumption.
21368      */
21369     if (nparens == -1)
21370         nparens = 3;
21371 
21372     for (i = 1; i <= nparens; i++) {
21373         char digits[TYPE_CHARS(long)];
21374         const STRLEN len = my_snprintf(digits, sizeof(digits),
21375                                        "%lu", (long)i);
21376         GV *const *const gvp
21377             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
21378 
21379         if (gvp) {
21380             GV * const gv = *gvp;
21381             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
21382                 save_scalar(gv);
21383         }
21384     }
21385 }
21386 #endif
21387 
21388 #ifdef DEBUGGING
21389 
21390 STATIC void
S_put_code_point(pTHX_ SV * sv,UV c)21391 S_put_code_point(pTHX_ SV *sv, UV c)
21392 {
21393     PERL_ARGS_ASSERT_PUT_CODE_POINT;
21394 
21395     if (c > 255) {
21396         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
21397     }
21398     else if (isPRINT(c)) {
21399 	const char string = (char) c;
21400 
21401         /* We use {phrase} as metanotation in the class, so also escape literal
21402          * braces */
21403 	if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
21404 	    sv_catpvs(sv, "\\");
21405 	sv_catpvn(sv, &string, 1);
21406     }
21407     else if (isMNEMONIC_CNTRL(c)) {
21408         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
21409     }
21410     else {
21411         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
21412     }
21413 }
21414 
21415 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
21416 
21417 STATIC void
S_put_range(pTHX_ SV * sv,UV start,const UV end,const bool allow_literals)21418 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
21419 {
21420     /* Appends to 'sv' a displayable version of the range of code points from
21421      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
21422      * that have them, when they occur at the beginning or end of the range.
21423      * It uses hex to output the remaining code points, unless 'allow_literals'
21424      * is true, in which case the printable ASCII ones are output as-is (though
21425      * some of these will be escaped by put_code_point()).
21426      *
21427      * NOTE:  This is designed only for printing ranges of code points that fit
21428      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
21429      */
21430 
21431     const unsigned int min_range_count = 3;
21432 
21433     assert(start <= end);
21434 
21435     PERL_ARGS_ASSERT_PUT_RANGE;
21436 
21437     while (start <= end) {
21438         UV this_end;
21439         const char * format;
21440 
21441         if (end - start < min_range_count) {
21442 
21443             /* Output chars individually when they occur in short ranges */
21444             for (; start <= end; start++) {
21445                 put_code_point(sv, start);
21446             }
21447             break;
21448         }
21449 
21450         /* If permitted by the input options, and there is a possibility that
21451          * this range contains a printable literal, look to see if there is
21452          * one. */
21453         if (allow_literals && start <= MAX_PRINT_A) {
21454 
21455             /* If the character at the beginning of the range isn't an ASCII
21456              * printable, effectively split the range into two parts:
21457              *  1) the portion before the first such printable,
21458              *  2) the rest
21459              * and output them separately. */
21460             if (! isPRINT_A(start)) {
21461                 UV temp_end = start + 1;
21462 
21463                 /* There is no point looking beyond the final possible
21464                  * printable, in MAX_PRINT_A */
21465                 UV max = MIN(end, MAX_PRINT_A);
21466 
21467                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
21468                     temp_end++;
21469                 }
21470 
21471                 /* Here, temp_end points to one beyond the first printable if
21472                  * found, or to one beyond 'max' if not.  If none found, make
21473                  * sure that we use the entire range */
21474                 if (temp_end > MAX_PRINT_A) {
21475                     temp_end = end + 1;
21476                 }
21477 
21478                 /* Output the first part of the split range: the part that
21479                  * doesn't have printables, with the parameter set to not look
21480                  * for literals (otherwise we would infinitely recurse) */
21481                 put_range(sv, start, temp_end - 1, FALSE);
21482 
21483                 /* The 2nd part of the range (if any) starts here. */
21484                 start = temp_end;
21485 
21486                 /* We do a continue, instead of dropping down, because even if
21487                  * the 2nd part is non-empty, it could be so short that we want
21488                  * to output it as individual characters, as tested for at the
21489                  * top of this loop.  */
21490                 continue;
21491             }
21492 
21493             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
21494              * output a sub-range of just the digits or letters, then process
21495              * the remaining portion as usual. */
21496             if (isALPHANUMERIC_A(start)) {
21497                 UV mask = (isDIGIT_A(start))
21498                            ? _CC_DIGIT
21499                              : isUPPER_A(start)
21500                                ? _CC_UPPER
21501                                : _CC_LOWER;
21502                 UV temp_end = start + 1;
21503 
21504                 /* Find the end of the sub-range that includes just the
21505                  * characters in the same class as the first character in it */
21506                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
21507                     temp_end++;
21508                 }
21509                 temp_end--;
21510 
21511                 /* For short ranges, don't duplicate the code above to output
21512                  * them; just call recursively */
21513                 if (temp_end - start < min_range_count) {
21514                     put_range(sv, start, temp_end, FALSE);
21515                 }
21516                 else {  /* Output as a range */
21517                     put_code_point(sv, start);
21518                     sv_catpvs(sv, "-");
21519                     put_code_point(sv, temp_end);
21520                 }
21521                 start = temp_end + 1;
21522                 continue;
21523             }
21524 
21525             /* We output any other printables as individual characters */
21526             if (isPUNCT_A(start) || isSPACE_A(start)) {
21527                 while (start <= end && (isPUNCT_A(start)
21528                                         || isSPACE_A(start)))
21529                 {
21530                     put_code_point(sv, start);
21531                     start++;
21532                 }
21533                 continue;
21534             }
21535         } /* End of looking for literals */
21536 
21537         /* Here is not to output as a literal.  Some control characters have
21538          * mnemonic names.  Split off any of those at the beginning and end of
21539          * the range to print mnemonically.  It isn't possible for many of
21540          * these to be in a row, so this won't overwhelm with output */
21541         if (   start <= end
21542             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
21543         {
21544             while (isMNEMONIC_CNTRL(start) && start <= end) {
21545                 put_code_point(sv, start);
21546                 start++;
21547             }
21548 
21549             /* If this didn't take care of the whole range ... */
21550             if (start <= end) {
21551 
21552                 /* Look backwards from the end to find the final non-mnemonic
21553                  * */
21554                 UV temp_end = end;
21555                 while (isMNEMONIC_CNTRL(temp_end)) {
21556                     temp_end--;
21557                 }
21558 
21559                 /* And separately output the interior range that doesn't start
21560                  * or end with mnemonics */
21561                 put_range(sv, start, temp_end, FALSE);
21562 
21563                 /* Then output the mnemonic trailing controls */
21564                 start = temp_end + 1;
21565                 while (start <= end) {
21566                     put_code_point(sv, start);
21567                     start++;
21568                 }
21569                 break;
21570             }
21571         }
21572 
21573         /* As a final resort, output the range or subrange as hex. */
21574 
21575         this_end = (end < NUM_ANYOF_CODE_POINTS)
21576                     ? end
21577                     : NUM_ANYOF_CODE_POINTS - 1;
21578 #if NUM_ANYOF_CODE_POINTS > 256
21579         format = (this_end < 256)
21580                  ? "\\x%02" UVXf "-\\x%02" UVXf
21581                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
21582 #else
21583         format = "\\x%02" UVXf "-\\x%02" UVXf;
21584 #endif
21585         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
21586         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
21587         GCC_DIAG_RESTORE_STMT;
21588         break;
21589     }
21590 }
21591 
21592 STATIC void
S_put_charclass_bitmap_innards_invlist(pTHX_ SV * sv,SV * invlist)21593 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
21594 {
21595     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
21596      * 'invlist' */
21597 
21598     UV start, end;
21599     bool allow_literals = TRUE;
21600 
21601     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
21602 
21603     /* Generally, it is more readable if printable characters are output as
21604      * literals, but if a range (nearly) spans all of them, it's best to output
21605      * it as a single range.  This code will use a single range if all but 2
21606      * ASCII printables are in it */
21607     invlist_iterinit(invlist);
21608     while (invlist_iternext(invlist, &start, &end)) {
21609 
21610         /* If the range starts beyond the final printable, it doesn't have any
21611          * in it */
21612         if (start > MAX_PRINT_A) {
21613             break;
21614         }
21615 
21616         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
21617          * all but two, the range must start and end no later than 2 from
21618          * either end */
21619         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
21620             if (end > MAX_PRINT_A) {
21621                 end = MAX_PRINT_A;
21622             }
21623             if (start < ' ') {
21624                 start = ' ';
21625             }
21626             if (end - start >= MAX_PRINT_A - ' ' - 2) {
21627                 allow_literals = FALSE;
21628             }
21629             break;
21630         }
21631     }
21632     invlist_iterfinish(invlist);
21633 
21634     /* Here we have figured things out.  Output each range */
21635     invlist_iterinit(invlist);
21636     while (invlist_iternext(invlist, &start, &end)) {
21637         if (start >= NUM_ANYOF_CODE_POINTS) {
21638             break;
21639         }
21640         put_range(sv, start, end, allow_literals);
21641     }
21642     invlist_iterfinish(invlist);
21643 
21644     return;
21645 }
21646 
21647 STATIC SV*
S_put_charclass_bitmap_innards_common(pTHX_ SV * invlist,SV * posixes,SV * only_utf8,SV * not_utf8,SV * only_utf8_locale,const bool invert)21648 S_put_charclass_bitmap_innards_common(pTHX_
21649         SV* invlist,            /* The bitmap */
21650         SV* posixes,            /* Under /l, things like [:word:], \S */
21651         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
21652         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
21653         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
21654         const bool invert       /* Is the result to be inverted? */
21655 )
21656 {
21657     /* Create and return an SV containing a displayable version of the bitmap
21658      * and associated information determined by the input parameters.  If the
21659      * output would have been only the inversion indicator '^', NULL is instead
21660      * returned. */
21661 
21662     dVAR;
21663     SV * output;
21664 
21665     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
21666 
21667     if (invert) {
21668         output = newSVpvs("^");
21669     }
21670     else {
21671         output = newSVpvs("");
21672     }
21673 
21674     /* First, the code points in the bitmap that are unconditionally there */
21675     put_charclass_bitmap_innards_invlist(output, invlist);
21676 
21677     /* Traditionally, these have been placed after the main code points */
21678     if (posixes) {
21679         sv_catsv(output, posixes);
21680     }
21681 
21682     if (only_utf8 && _invlist_len(only_utf8)) {
21683         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
21684         put_charclass_bitmap_innards_invlist(output, only_utf8);
21685     }
21686 
21687     if (not_utf8 && _invlist_len(not_utf8)) {
21688         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
21689         put_charclass_bitmap_innards_invlist(output, not_utf8);
21690     }
21691 
21692     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
21693         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
21694         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
21695 
21696         /* This is the only list in this routine that can legally contain code
21697          * points outside the bitmap range.  The call just above to
21698          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
21699          * output them here.  There's about a half-dozen possible, and none in
21700          * contiguous ranges longer than 2 */
21701         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21702             UV start, end;
21703             SV* above_bitmap = NULL;
21704 
21705             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
21706 
21707             invlist_iterinit(above_bitmap);
21708             while (invlist_iternext(above_bitmap, &start, &end)) {
21709                 UV i;
21710 
21711                 for (i = start; i <= end; i++) {
21712                     put_code_point(output, i);
21713                 }
21714             }
21715             invlist_iterfinish(above_bitmap);
21716             SvREFCNT_dec_NN(above_bitmap);
21717         }
21718     }
21719 
21720     if (invert && SvCUR(output) == 1) {
21721         return NULL;
21722     }
21723 
21724     return output;
21725 }
21726 
21727 STATIC bool
S_put_charclass_bitmap_innards(pTHX_ SV * sv,char * bitmap,SV * nonbitmap_invlist,SV * only_utf8_locale_invlist,const regnode * const node,const bool force_as_is_display)21728 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
21729                                      char *bitmap,
21730                                      SV *nonbitmap_invlist,
21731                                      SV *only_utf8_locale_invlist,
21732                                      const regnode * const node,
21733                                      const bool force_as_is_display)
21734 {
21735     /* Appends to 'sv' a displayable version of the innards of the bracketed
21736      * character class defined by the other arguments:
21737      *  'bitmap' points to the bitmap, or NULL if to ignore that.
21738      *  'nonbitmap_invlist' is an inversion list of the code points that are in
21739      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
21740      *      none.  The reasons for this could be that they require some
21741      *      condition such as the target string being or not being in UTF-8
21742      *      (under /d), or because they came from a user-defined property that
21743      *      was not resolved at the time of the regex compilation (under /u)
21744      *  'only_utf8_locale_invlist' is an inversion list of the code points that
21745      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
21746      *  'node' is the regex pattern ANYOF node.  It is needed only when the
21747      *      above two parameters are not null, and is passed so that this
21748      *      routine can tease apart the various reasons for them.
21749      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
21750      *      to invert things to see if that leads to a cleaner display.  If
21751      *      FALSE, this routine is free to use its judgment about doing this.
21752      *
21753      * It returns TRUE if there was actually something output.  (It may be that
21754      * the bitmap, etc is empty.)
21755      *
21756      * When called for outputting the bitmap of a non-ANYOF node, just pass the
21757      * bitmap, with the succeeding parameters set to NULL, and the final one to
21758      * FALSE.
21759      */
21760 
21761     /* In general, it tries to display the 'cleanest' representation of the
21762      * innards, choosing whether to display them inverted or not, regardless of
21763      * whether the class itself is to be inverted.  However,  there are some
21764      * cases where it can't try inverting, as what actually matches isn't known
21765      * until runtime, and hence the inversion isn't either. */
21766 
21767     dVAR;
21768     bool inverting_allowed = ! force_as_is_display;
21769 
21770     int i;
21771     STRLEN orig_sv_cur = SvCUR(sv);
21772 
21773     SV* invlist;            /* Inversion list we accumulate of code points that
21774                                are unconditionally matched */
21775     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
21776                                UTF-8 */
21777     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
21778                              */
21779     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
21780     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
21781                                        is UTF-8 */
21782 
21783     SV* as_is_display;      /* The output string when we take the inputs
21784                                literally */
21785     SV* inverted_display;   /* The output string when we invert the inputs */
21786 
21787     U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
21788 
21789     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
21790                                                    to match? */
21791     /* We are biased in favor of displaying things without them being inverted,
21792      * as that is generally easier to understand */
21793     const int bias = 5;
21794 
21795     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
21796 
21797     /* Start off with whatever code points are passed in.  (We clone, so we
21798      * don't change the caller's list) */
21799     if (nonbitmap_invlist) {
21800         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
21801         invlist = invlist_clone(nonbitmap_invlist, NULL);
21802     }
21803     else {  /* Worst case size is every other code point is matched */
21804         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
21805     }
21806 
21807     if (flags) {
21808         if (OP(node) == ANYOFD) {
21809 
21810             /* This flag indicates that the code points below 0x100 in the
21811              * nonbitmap list are precisely the ones that match only when the
21812              * target is UTF-8 (they should all be non-ASCII). */
21813             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
21814             {
21815                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
21816                 _invlist_subtract(invlist, only_utf8, &invlist);
21817             }
21818 
21819             /* And this flag for matching all non-ASCII 0xFF and below */
21820             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
21821             {
21822                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
21823             }
21824         }
21825         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
21826 
21827             /* If either of these flags are set, what matches isn't
21828              * determinable except during execution, so don't know enough here
21829              * to invert */
21830             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
21831                 inverting_allowed = FALSE;
21832             }
21833 
21834             /* What the posix classes match also varies at runtime, so these
21835              * will be output symbolically. */
21836             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
21837                 int i;
21838 
21839                 posixes = newSVpvs("");
21840                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
21841                     if (ANYOF_POSIXL_TEST(node, i)) {
21842                         sv_catpv(posixes, anyofs[i]);
21843                     }
21844                 }
21845             }
21846         }
21847     }
21848 
21849     /* Accumulate the bit map into the unconditional match list */
21850     if (bitmap) {
21851         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
21852             if (BITMAP_TEST(bitmap, i)) {
21853                 int start = i++;
21854                 for (;
21855                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
21856                      i++)
21857                 { /* empty */ }
21858                 invlist = _add_range_to_invlist(invlist, start, i-1);
21859             }
21860         }
21861     }
21862 
21863     /* Make sure that the conditional match lists don't have anything in them
21864      * that match unconditionally; otherwise the output is quite confusing.
21865      * This could happen if the code that populates these misses some
21866      * duplication. */
21867     if (only_utf8) {
21868         _invlist_subtract(only_utf8, invlist, &only_utf8);
21869     }
21870     if (not_utf8) {
21871         _invlist_subtract(not_utf8, invlist, &not_utf8);
21872     }
21873 
21874     if (only_utf8_locale_invlist) {
21875 
21876         /* Since this list is passed in, we have to make a copy before
21877          * modifying it */
21878         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
21879 
21880         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
21881 
21882         /* And, it can get really weird for us to try outputting an inverted
21883          * form of this list when it has things above the bitmap, so don't even
21884          * try */
21885         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21886             inverting_allowed = FALSE;
21887         }
21888     }
21889 
21890     /* Calculate what the output would be if we take the input as-is */
21891     as_is_display = put_charclass_bitmap_innards_common(invlist,
21892                                                     posixes,
21893                                                     only_utf8,
21894                                                     not_utf8,
21895                                                     only_utf8_locale,
21896                                                     invert);
21897 
21898     /* If have to take the output as-is, just do that */
21899     if (! inverting_allowed) {
21900         if (as_is_display) {
21901             sv_catsv(sv, as_is_display);
21902             SvREFCNT_dec_NN(as_is_display);
21903         }
21904     }
21905     else { /* But otherwise, create the output again on the inverted input, and
21906               use whichever version is shorter */
21907 
21908         int inverted_bias, as_is_bias;
21909 
21910         /* We will apply our bias to whichever of the the results doesn't have
21911          * the '^' */
21912         if (invert) {
21913             invert = FALSE;
21914             as_is_bias = bias;
21915             inverted_bias = 0;
21916         }
21917         else {
21918             invert = TRUE;
21919             as_is_bias = 0;
21920             inverted_bias = bias;
21921         }
21922 
21923         /* Now invert each of the lists that contribute to the output,
21924          * excluding from the result things outside the possible range */
21925 
21926         /* For the unconditional inversion list, we have to add in all the
21927          * conditional code points, so that when inverted, they will be gone
21928          * from it */
21929         _invlist_union(only_utf8, invlist, &invlist);
21930         _invlist_union(not_utf8, invlist, &invlist);
21931         _invlist_union(only_utf8_locale, invlist, &invlist);
21932         _invlist_invert(invlist);
21933         _invlist_intersection(invlist, PL_InBitmap, &invlist);
21934 
21935         if (only_utf8) {
21936             _invlist_invert(only_utf8);
21937             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
21938         }
21939         else if (not_utf8) {
21940 
21941             /* If a code point matches iff the target string is not in UTF-8,
21942              * then complementing the result has it not match iff not in UTF-8,
21943              * which is the same thing as matching iff it is UTF-8. */
21944             only_utf8 = not_utf8;
21945             not_utf8 = NULL;
21946         }
21947 
21948         if (only_utf8_locale) {
21949             _invlist_invert(only_utf8_locale);
21950             _invlist_intersection(only_utf8_locale,
21951                                   PL_InBitmap,
21952                                   &only_utf8_locale);
21953         }
21954 
21955         inverted_display = put_charclass_bitmap_innards_common(
21956                                             invlist,
21957                                             posixes,
21958                                             only_utf8,
21959                                             not_utf8,
21960                                             only_utf8_locale, invert);
21961 
21962         /* Use the shortest representation, taking into account our bias
21963          * against showing it inverted */
21964         if (   inverted_display
21965             && (   ! as_is_display
21966                 || (  SvCUR(inverted_display) + inverted_bias
21967                     < SvCUR(as_is_display)    + as_is_bias)))
21968         {
21969 	    sv_catsv(sv, inverted_display);
21970         }
21971         else if (as_is_display) {
21972 	    sv_catsv(sv, as_is_display);
21973         }
21974 
21975         SvREFCNT_dec(as_is_display);
21976         SvREFCNT_dec(inverted_display);
21977     }
21978 
21979     SvREFCNT_dec_NN(invlist);
21980     SvREFCNT_dec(only_utf8);
21981     SvREFCNT_dec(not_utf8);
21982     SvREFCNT_dec(posixes);
21983     SvREFCNT_dec(only_utf8_locale);
21984 
21985     return SvCUR(sv) > orig_sv_cur;
21986 }
21987 
21988 #define CLEAR_OPTSTART                                                       \
21989     if (optstart) STMT_START {                                               \
21990         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
21991                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
21992         optstart=NULL;                                                       \
21993     } STMT_END
21994 
21995 #define DUMPUNTIL(b,e)                                                       \
21996                     CLEAR_OPTSTART;                                          \
21997                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
21998 
21999 STATIC const regnode *
S_dumpuntil(pTHX_ const regexp * r,const regnode * start,const regnode * node,const regnode * last,const regnode * plast,SV * sv,I32 indent,U32 depth)22000 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22001 	    const regnode *last, const regnode *plast,
22002 	    SV* sv, I32 indent, U32 depth)
22003 {
22004     U8 op = PSEUDO;	/* Arbitrary non-END op. */
22005     const regnode *next;
22006     const regnode *optstart= NULL;
22007 
22008     RXi_GET_DECL(r, ri);
22009     GET_RE_DEBUG_FLAGS_DECL;
22010 
22011     PERL_ARGS_ASSERT_DUMPUNTIL;
22012 
22013 #ifdef DEBUG_DUMPUNTIL
22014     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
22015         last ? last-start : 0, plast ? plast-start : 0);
22016 #endif
22017 
22018     if (plast && plast < last)
22019         last= plast;
22020 
22021     while (PL_regkind[op] != END && (!last || node < last)) {
22022         assert(node);
22023 	/* While that wasn't END last time... */
22024 	NODE_ALIGN(node);
22025 	op = OP(node);
22026 	if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22027 	    indent--;
22028 	next = regnext((regnode *)node);
22029 
22030 	/* Where, what. */
22031 	if (OP(node) == OPTIMIZED) {
22032 	    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22033 	        optstart = node;
22034 	    else
22035 		goto after_print;
22036 	} else
22037 	    CLEAR_OPTSTART;
22038 
22039         regprop(r, sv, node, NULL, NULL);
22040         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
22041 		      (int)(2*indent + 1), "", SvPVX_const(sv));
22042 
22043         if (OP(node) != OPTIMIZED) {
22044             if (next == NULL)		/* Next ptr. */
22045                 Perl_re_printf( aTHX_  " (0)");
22046             else if (PL_regkind[(U8)op] == BRANCH
22047                      && PL_regkind[OP(next)] != BRANCH )
22048                 Perl_re_printf( aTHX_  " (FAIL)");
22049             else
22050                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
22051             Perl_re_printf( aTHX_ "\n");
22052         }
22053 
22054       after_print:
22055 	if (PL_regkind[(U8)op] == BRANCHJ) {
22056 	    assert(next);
22057 	    {
22058                 const regnode *nnode = (OP(next) == LONGJMP
22059                                        ? regnext((regnode *)next)
22060                                        : next);
22061                 if (last && nnode > last)
22062                     nnode = last;
22063                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22064 	    }
22065 	}
22066 	else if (PL_regkind[(U8)op] == BRANCH) {
22067 	    assert(next);
22068 	    DUMPUNTIL(NEXTOPER(node), next);
22069 	}
22070 	else if ( PL_regkind[(U8)op]  == TRIE ) {
22071 	    const regnode *this_trie = node;
22072 	    const char op = OP(node);
22073             const U32 n = ARG(node);
22074 	    const reg_ac_data * const ac = op>=AHOCORASICK ?
22075                (reg_ac_data *)ri->data->data[n] :
22076                NULL;
22077 	    const reg_trie_data * const trie =
22078 	        (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22079 #ifdef DEBUGGING
22080 	    AV *const trie_words
22081                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22082 #endif
22083 	    const regnode *nextbranch= NULL;
22084 	    I32 word_idx;
22085             SvPVCLEAR(sv);
22086 	    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22087 		SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22088 
22089                 Perl_re_indentf( aTHX_  "%s ",
22090                     indent+3,
22091                     elem_ptr
22092                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22093                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
22094                                 PL_colors[0], PL_colors[1],
22095                                 (SvUTF8(*elem_ptr)
22096                                  ? PERL_PV_ESCAPE_UNI
22097                                  : 0)
22098                                 | PERL_PV_PRETTY_ELLIPSES
22099                                 | PERL_PV_PRETTY_LTGT
22100                             )
22101                     : "???"
22102                 );
22103                 if (trie->jump) {
22104                     U16 dist= trie->jump[word_idx+1];
22105                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
22106                                (UV)((dist ? this_trie + dist : next) - start));
22107                     if (dist) {
22108                         if (!nextbranch)
22109                             nextbranch= this_trie + trie->jump[0];
22110 			DUMPUNTIL(this_trie + dist, nextbranch);
22111                     }
22112                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22113                         nextbranch= regnext((regnode *)nextbranch);
22114                 } else {
22115                     Perl_re_printf( aTHX_  "\n");
22116 		}
22117 	    }
22118 	    if (last && next > last)
22119 	        node= last;
22120 	    else
22121 	        node= next;
22122 	}
22123 	else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
22124 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22125                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22126 	}
22127 	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22128 	    assert(next);
22129 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22130 	}
22131 	else if ( op == PLUS || op == STAR) {
22132 	    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22133 	}
22134 	else if (PL_regkind[(U8)op] == EXACT) {
22135             /* Literal string, where present. */
22136 	    node += NODE_SZ_STR(node) - 1;
22137 	    node = NEXTOPER(node);
22138 	}
22139 	else {
22140 	    node = NEXTOPER(node);
22141 	    node += regarglen[(U8)op];
22142 	}
22143 	if (op == CURLYX || op == OPEN || op == SROPEN)
22144 	    indent++;
22145     }
22146     CLEAR_OPTSTART;
22147 #ifdef DEBUG_DUMPUNTIL
22148     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
22149 #endif
22150     return node;
22151 }
22152 
22153 #endif	/* DEBUGGING */
22154 
22155 #ifndef PERL_IN_XSUB_RE
22156 
22157 #include "uni_keywords.h"
22158 
22159 void
Perl_init_uniprops(pTHX)22160 Perl_init_uniprops(pTHX)
22161 {
22162     dVAR;
22163 
22164     PL_user_def_props = newHV();
22165 
22166 #ifdef USE_ITHREADS
22167 
22168     HvSHAREKEYS_off(PL_user_def_props);
22169     PL_user_def_props_aTHX = aTHX;
22170 
22171 #endif
22172 
22173     /* Set up the inversion list global variables */
22174 
22175     PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22176     PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
22177     PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
22178     PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
22179     PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
22180     PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
22181     PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
22182     PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
22183     PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
22184     PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
22185     PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
22186     PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
22187     PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
22188     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
22189     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
22190     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
22191 
22192     PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22193     PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
22194     PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
22195     PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
22196     PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
22197     PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
22198     PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
22199     PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
22200     PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
22201     PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
22202     PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
22203     PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
22204     PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
22205     PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
22206     PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
22207     PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
22208 
22209     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
22210     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
22211     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
22212     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
22213     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
22214 
22215     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
22216     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
22217     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
22218 
22219     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
22220 
22221     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
22222     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
22223 
22224     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
22225     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
22226 
22227     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
22228     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
22229                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
22230     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
22231                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
22232     PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[
22233                                             UNI__PERL_NON_FINAL_FOLDS]);
22234 
22235     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
22236     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
22237     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
22238     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
22239     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
22240     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
22241     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
22242     PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
22243     PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
22244 
22245 #ifdef UNI_XIDC
22246     /* The below are used only by deprecated functions.  They could be removed */
22247     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
22248     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
22249     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
22250 #endif
22251 }
22252 
22253 #if 0
22254 
22255 This code was mainly added for backcompat to give a warning for non-portable
22256 code points in user-defined properties.  But experiments showed that the
22257 warning in earlier perls were only omitted on overflow, which should be an
22258 error, so there really isnt a backcompat issue, and actually adding the
22259 warning when none was present before might cause breakage, for little gain.  So
22260 khw left this code in, but not enabled.  Tests were never added.
22261 
22262 embed.fnc entry:
22263 Ei	|const char *|get_extended_utf8_msg|const UV cp
22264 
22265 PERL_STATIC_INLINE const char *
22266 S_get_extended_utf8_msg(pTHX_ const UV cp)
22267 {
22268     U8 dummy[UTF8_MAXBYTES + 1];
22269     HV *msgs;
22270     SV **msg;
22271 
22272     uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
22273                              &msgs);
22274 
22275     msg = hv_fetchs(msgs, "text", 0);
22276     assert(msg);
22277 
22278     (void) sv_2mortal((SV *) msgs);
22279 
22280     return SvPVX(*msg);
22281 }
22282 
22283 #endif
22284 
22285 SV *
Perl_handle_user_defined_property(pTHX_ const char * name,const STRLEN name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,SV * contents,bool * user_defined_ptr,SV * msg,const STRLEN level)22286 Perl_handle_user_defined_property(pTHX_
22287 
22288     /* Parses the contents of a user-defined property definition; returning the
22289      * expanded definition if possible.  If so, the return is an inversion
22290      * list.
22291      *
22292      * If there are subroutines that are part of the expansion and which aren't
22293      * known at the time of the call to this function, this returns what
22294      * parse_uniprop_string() returned for the first one encountered.
22295      *
22296      * If an error was found, NULL is returned, and 'msg' gets a suitable
22297      * message appended to it.  (Appending allows the back trace of how we got
22298      * to the faulty definition to be displayed through nested calls of
22299      * user-defined subs.)
22300      *
22301      * The caller IS responsible for freeing any returned SV.
22302      *
22303      * The syntax of the contents is pretty much described in perlunicode.pod,
22304      * but we also allow comments on each line */
22305 
22306     const char * name,          /* Name of property */
22307     const STRLEN name_len,      /* The name's length in bytes */
22308     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
22309     const bool to_fold,         /* ? Is this under /i */
22310     const bool runtime,         /* ? Are we in compile- or run-time */
22311     const bool deferrable,      /* Is it ok for this property's full definition
22312                                    to be deferred until later? */
22313     SV* contents,               /* The property's definition */
22314     bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
22315                                    getting called unless this is thought to be
22316                                    a user-defined property */
22317     SV * msg,                   /* Any error or warning msg(s) are appended to
22318                                    this */
22319     const STRLEN level)         /* Recursion level of this call */
22320 {
22321     STRLEN len;
22322     const char * string         = SvPV_const(contents, len);
22323     const char * const e        = string + len;
22324     const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
22325     const STRLEN msgs_length_on_entry = SvCUR(msg);
22326 
22327     const char * s0 = string;   /* Points to first byte in the current line
22328                                    being parsed in 'string' */
22329     const char overflow_msg[] = "Code point too large in \"";
22330     SV* running_definition = NULL;
22331 
22332     PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
22333 
22334     *user_defined_ptr = TRUE;
22335 
22336     /* Look at each line */
22337     while (s0 < e) {
22338         const char * s;     /* Current byte */
22339         char op = '+';      /* Default operation is 'union' */
22340         IV   min = 0;       /* range begin code point */
22341         IV   max = -1;      /* and range end */
22342         SV* this_definition;
22343 
22344         /* Skip comment lines */
22345         if (*s0 == '#') {
22346             s0 = strchr(s0, '\n');
22347             if (s0 == NULL) {
22348                 break;
22349             }
22350             s0++;
22351             continue;
22352         }
22353 
22354         /* For backcompat, allow an empty first line */
22355         if (*s0 == '\n') {
22356             s0++;
22357             continue;
22358         }
22359 
22360         /* First character in the line may optionally be the operation */
22361         if (   *s0 == '+'
22362             || *s0 == '!'
22363             || *s0 == '-'
22364             || *s0 == '&')
22365         {
22366             op = *s0++;
22367         }
22368 
22369         /* If the line is one or two hex digits separated by blank space, its
22370          * a range; otherwise it is either another user-defined property or an
22371          * error */
22372 
22373         s = s0;
22374 
22375         if (! isXDIGIT(*s)) {
22376             goto check_if_property;
22377         }
22378 
22379         do { /* Each new hex digit will add 4 bits. */
22380             if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
22381                 s = strchr(s, '\n');
22382                 if (s == NULL) {
22383                     s = e;
22384                 }
22385                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22386                 sv_catpv(msg, overflow_msg);
22387                 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22388                                      UTF8fARG(is_contents_utf8, s - s0, s0));
22389                 sv_catpvs(msg, "\"");
22390                 goto return_failure;
22391             }
22392 
22393             /* Accumulate this digit into the value */
22394             min = (min << 4) + READ_XDIGIT(s);
22395         } while (isXDIGIT(*s));
22396 
22397         while (isBLANK(*s)) { s++; }
22398 
22399         /* We allow comments at the end of the line */
22400         if (*s == '#') {
22401             s = strchr(s, '\n');
22402             if (s == NULL) {
22403                 s = e;
22404             }
22405             s++;
22406         }
22407         else if (s < e && *s != '\n') {
22408             if (! isXDIGIT(*s)) {
22409                 goto check_if_property;
22410             }
22411 
22412             /* Look for the high point of the range */
22413             max = 0;
22414             do {
22415                 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
22416                     s = strchr(s, '\n');
22417                     if (s == NULL) {
22418                         s = e;
22419                     }
22420                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22421                     sv_catpv(msg, overflow_msg);
22422                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22423                                       UTF8fARG(is_contents_utf8, s - s0, s0));
22424                     sv_catpvs(msg, "\"");
22425                     goto return_failure;
22426                 }
22427 
22428                 max = (max << 4) + READ_XDIGIT(s);
22429             } while (isXDIGIT(*s));
22430 
22431             while (isBLANK(*s)) { s++; }
22432 
22433             if (*s == '#') {
22434                 s = strchr(s, '\n');
22435                 if (s == NULL) {
22436                     s = e;
22437                 }
22438             }
22439             else if (s < e && *s != '\n') {
22440                 goto check_if_property;
22441             }
22442         }
22443 
22444         if (max == -1) {    /* The line only had one entry */
22445             max = min;
22446         }
22447         else if (max < min) {
22448             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22449             sv_catpvs(msg, "Illegal range in \"");
22450             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22451                                 UTF8fARG(is_contents_utf8, s - s0, s0));
22452             sv_catpvs(msg, "\"");
22453             goto return_failure;
22454         }
22455 
22456 #if 0   /* See explanation at definition above of get_extended_utf8_msg() */
22457 
22458         if (   UNICODE_IS_PERL_EXTENDED(min)
22459             || UNICODE_IS_PERL_EXTENDED(max))
22460         {
22461             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22462 
22463             /* If both code points are non-portable, warn only on the lower
22464              * one. */
22465             sv_catpv(msg, get_extended_utf8_msg(
22466                                             (UNICODE_IS_PERL_EXTENDED(min))
22467                                             ? min : max));
22468             sv_catpvs(msg, " in \"");
22469             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22470                                  UTF8fARG(is_contents_utf8, s - s0, s0));
22471             sv_catpvs(msg, "\"");
22472         }
22473 
22474 #endif
22475 
22476         /* Here, this line contains a legal range */
22477         this_definition = sv_2mortal(_new_invlist(2));
22478         this_definition = _add_range_to_invlist(this_definition, min, max);
22479         goto calculate;
22480 
22481       check_if_property:
22482 
22483         /* Here it isn't a legal range line.  See if it is a legal property
22484          * line.  First find the end of the meat of the line */
22485         s = strpbrk(s, "#\n");
22486         if (s == NULL) {
22487             s = e;
22488         }
22489 
22490         /* Ignore trailing blanks in keeping with the requirements of
22491          * parse_uniprop_string() */
22492         s--;
22493         while (s > s0 && isBLANK_A(*s)) {
22494             s--;
22495         }
22496         s++;
22497 
22498         this_definition = parse_uniprop_string(s0, s - s0,
22499                                                is_utf8, to_fold, runtime,
22500                                                deferrable,
22501                                                user_defined_ptr, msg,
22502                                                (name_len == 0)
22503                                                 ? level /* Don't increase level
22504                                                            if input is empty */
22505                                                 : level + 1
22506                                               );
22507         if (this_definition == NULL) {
22508             goto return_failure;    /* 'msg' should have had the reason
22509                                        appended to it by the above call */
22510         }
22511 
22512         if (! is_invlist(this_definition)) {    /* Unknown at this time */
22513             return newSVsv(this_definition);
22514         }
22515 
22516         if (*s != '\n') {
22517             s = strchr(s, '\n');
22518             if (s == NULL) {
22519                 s = e;
22520             }
22521         }
22522 
22523       calculate:
22524 
22525         switch (op) {
22526             case '+':
22527                 _invlist_union(running_definition, this_definition,
22528                                                         &running_definition);
22529                 break;
22530             case '-':
22531                 _invlist_subtract(running_definition, this_definition,
22532                                                         &running_definition);
22533                 break;
22534             case '&':
22535                 _invlist_intersection(running_definition, this_definition,
22536                                                         &running_definition);
22537                 break;
22538             case '!':
22539                 _invlist_union_complement_2nd(running_definition,
22540                                         this_definition, &running_definition);
22541                 break;
22542             default:
22543                 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
22544                                  __FILE__, __LINE__, op);
22545                 break;
22546         }
22547 
22548         /* Position past the '\n' */
22549         s0 = s + 1;
22550     }   /* End of loop through the lines of 'contents' */
22551 
22552     /* Here, we processed all the lines in 'contents' without error.  If we
22553      * didn't add any warnings, simply return success */
22554     if (msgs_length_on_entry == SvCUR(msg)) {
22555 
22556         /* If the expansion was empty, the answer isn't nothing: its an empty
22557          * inversion list */
22558         if (running_definition == NULL) {
22559             running_definition = _new_invlist(1);
22560         }
22561 
22562         return running_definition;
22563     }
22564 
22565     /* Otherwise, add some explanatory text, but we will return success */
22566     goto return_msg;
22567 
22568   return_failure:
22569     running_definition = NULL;
22570 
22571   return_msg:
22572 
22573     if (name_len > 0) {
22574         sv_catpvs(msg, " in expansion of ");
22575         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
22576     }
22577 
22578     return running_definition;
22579 }
22580 
22581 /* As explained below, certain operations need to take place in the first
22582  * thread created.  These macros switch contexts */
22583 #ifdef USE_ITHREADS
22584 #  define DECLARATION_FOR_GLOBAL_CONTEXT                                    \
22585                                         PerlInterpreter * save_aTHX = aTHX;
22586 #  define SWITCH_TO_GLOBAL_CONTEXT                                          \
22587                            PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
22588 #  define RESTORE_CONTEXT  PERL_SET_CONTEXT((aTHX = save_aTHX));
22589 #  define CUR_CONTEXT      aTHX
22590 #  define ORIGINAL_CONTEXT save_aTHX
22591 #else
22592 #  define DECLARATION_FOR_GLOBAL_CONTEXT
22593 #  define SWITCH_TO_GLOBAL_CONTEXT          NOOP
22594 #  define RESTORE_CONTEXT                   NOOP
22595 #  define CUR_CONTEXT                       NULL
22596 #  define ORIGINAL_CONTEXT                  NULL
22597 #endif
22598 
22599 STATIC void
S_delete_recursion_entry(pTHX_ void * key)22600 S_delete_recursion_entry(pTHX_ void *key)
22601 {
22602     /* Deletes the entry used to detect recursion when expanding user-defined
22603      * properties.  This is a function so it can be set up to be called even if
22604      * the program unexpectedly quits */
22605 
22606     dVAR;
22607     SV ** current_entry;
22608     const STRLEN key_len = strlen((const char *) key);
22609     DECLARATION_FOR_GLOBAL_CONTEXT;
22610 
22611     SWITCH_TO_GLOBAL_CONTEXT;
22612 
22613     /* If the entry is one of these types, it is a permanent entry, and not the
22614      * one used to detect recursions.  This function should delete only the
22615      * recursion entry */
22616     current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
22617     if (     current_entry
22618         && ! is_invlist(*current_entry)
22619         && ! SvPOK(*current_entry))
22620     {
22621         (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
22622                                                                     G_DISCARD);
22623     }
22624 
22625     RESTORE_CONTEXT;
22626 }
22627 
22628 STATIC SV *
S_get_fq_name(pTHX_ const char * const name,const Size_t name_len,const bool is_utf8,const bool has_colon_colon)22629 S_get_fq_name(pTHX_
22630               const char * const name,    /* The first non-blank in the \p{}, \P{} */
22631               const Size_t name_len,      /* Its length in bytes, not including any trailing space */
22632               const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
22633               const bool has_colon_colon
22634              )
22635 {
22636     /* Returns a mortal SV containing the fully qualified version of the input
22637      * name */
22638 
22639     SV * fq_name;
22640 
22641     fq_name = newSVpvs_flags("", SVs_TEMP);
22642 
22643     /* Use the current package if it wasn't included in our input */
22644     if (! has_colon_colon) {
22645         const HV * pkg = (IN_PERL_COMPILETIME)
22646                          ? PL_curstash
22647                          : CopSTASH(PL_curcop);
22648         const char* pkgname = HvNAME(pkg);
22649 
22650         Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
22651                       UTF8fARG(is_utf8, strlen(pkgname), pkgname));
22652         sv_catpvs(fq_name, "::");
22653     }
22654 
22655     Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
22656                          UTF8fARG(is_utf8, name_len, name));
22657     return fq_name;
22658 }
22659 
22660 SV *
Perl_parse_uniprop_string(pTHX_ const char * const name,const Size_t name_len,const bool is_utf8,const bool to_fold,const bool runtime,const bool deferrable,bool * user_defined_ptr,SV * msg,const STRLEN level)22661 Perl_parse_uniprop_string(pTHX_
22662 
22663     /* Parse the interior of a \p{}, \P{}.  Returns its definition if knowable
22664      * now.  If so, the return is an inversion list.
22665      *
22666      * If the property is user-defined, it is a subroutine, which in turn
22667      * may call other subroutines.  This function will call the whole nest of
22668      * them to get the definition they return; if some aren't known at the time
22669      * of the call to this function, the fully qualified name of the highest
22670      * level sub is returned.  It is an error to call this function at runtime
22671      * without every sub defined.
22672      *
22673      * If an error was found, NULL is returned, and 'msg' gets a suitable
22674      * message appended to it.  (Appending allows the back trace of how we got
22675      * to the faulty definition to be displayed through nested calls of
22676      * user-defined subs.)
22677      *
22678      * The caller should NOT try to free any returned inversion list.
22679      *
22680      * Other parameters will be set on return as described below */
22681 
22682     const char * const name,    /* The first non-blank in the \p{}, \P{} */
22683     const Size_t name_len,      /* Its length in bytes, not including any
22684                                    trailing space */
22685     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
22686     const bool to_fold,         /* ? Is this under /i */
22687     const bool runtime,         /* TRUE if this is being called at run time */
22688     const bool deferrable,      /* TRUE if it's ok for the definition to not be
22689                                    known at this call */
22690     bool *user_defined_ptr,     /* Upon return from this function it will be
22691                                    set to TRUE if any component is a
22692                                    user-defined property */
22693     SV * msg,                   /* Any error or warning msg(s) are appended to
22694                                    this */
22695    const STRLEN level)          /* Recursion level of this call */
22696 {
22697     dVAR;
22698     char* lookup_name;          /* normalized name for lookup in our tables */
22699     unsigned lookup_len;        /* Its length */
22700     bool stricter = FALSE;      /* Some properties have stricter name
22701                                    normalization rules, which we decide upon
22702                                    based on parsing */
22703 
22704     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
22705      * (though it requires extra effort to download them from Unicode and
22706      * compile perl to know about them) */
22707     bool is_nv_type = FALSE;
22708 
22709     unsigned int i, j = 0;
22710     int equals_pos = -1;    /* Where the '=' is found, or negative if none */
22711     int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
22712     int table_index = 0;    /* The entry number for this property in the table
22713                                of all Unicode property names */
22714     bool starts_with_In_or_Is = FALSE;  /* ? Does the name start with 'In' or
22715                                              'Is' */
22716     Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
22717                                    the normalized name in certain situations */
22718     Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
22719                                    part of a package name */
22720     bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
22721                                              property rather than a Unicode
22722                                              one. */
22723     SV * prop_definition = NULL;  /* The returned definition of 'name' or NULL
22724                                      if an error.  If it is an inversion list,
22725                                      it is the definition.  Otherwise it is a
22726                                      string containing the fully qualified sub
22727                                      name of 'name' */
22728     SV * fq_name = NULL;        /* For user-defined properties, the fully
22729                                    qualified name */
22730     bool invert_return = FALSE; /* ? Do we need to complement the result before
22731                                      returning it */
22732 
22733     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
22734 
22735     /* The input will be normalized into 'lookup_name' */
22736     Newx(lookup_name, name_len, char);
22737     SAVEFREEPV(lookup_name);
22738 
22739     /* Parse the input. */
22740     for (i = 0; i < name_len; i++) {
22741         char cur = name[i];
22742 
22743         /* Most of the characters in the input will be of this ilk, being parts
22744          * of a name */
22745         if (isIDCONT_A(cur)) {
22746 
22747             /* Case differences are ignored.  Our lookup routine assumes
22748              * everything is lowercase, so normalize to that */
22749             if (isUPPER_A(cur)) {
22750                 lookup_name[j++] = toLOWER_A(cur);
22751                 continue;
22752             }
22753 
22754             if (cur == '_') { /* Don't include these in the normalized name */
22755                 continue;
22756             }
22757 
22758             lookup_name[j++] = cur;
22759 
22760             /* The first character in a user-defined name must be of this type.
22761              * */
22762             if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
22763                 could_be_user_defined = FALSE;
22764             }
22765 
22766             continue;
22767         }
22768 
22769         /* Here, the character is not something typically in a name,  But these
22770          * two types of characters (and the '_' above) can be freely ignored in
22771          * most situations.  Later it may turn out we shouldn't have ignored
22772          * them, and we have to reparse, but we don't have enough information
22773          * yet to make that decision */
22774         if (cur == '-' || isSPACE_A(cur)) {
22775             could_be_user_defined = FALSE;
22776             continue;
22777         }
22778 
22779         /* An equals sign or single colon mark the end of the first part of
22780          * the property name */
22781         if (    cur == '='
22782             || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
22783         {
22784             lookup_name[j++] = '='; /* Treat the colon as an '=' */
22785             equals_pos = j; /* Note where it occurred in the input */
22786             could_be_user_defined = FALSE;
22787             break;
22788         }
22789 
22790         /* Otherwise, this character is part of the name. */
22791         lookup_name[j++] = cur;
22792 
22793         /* Here it isn't a single colon, so if it is a colon, it must be a
22794          * double colon */
22795         if (cur == ':') {
22796 
22797             /* A double colon should be a package qualifier.  We note its
22798              * position and continue.  Note that one could have
22799              *      pkg1::pkg2::...::foo
22800              * so that the position at the end of the loop will be just after
22801              * the final qualifier */
22802 
22803             i++;
22804             non_pkg_begin = i + 1;
22805             lookup_name[j++] = ':';
22806         }
22807         else { /* Only word chars (and '::') can be in a user-defined name */
22808             could_be_user_defined = FALSE;
22809         }
22810     } /* End of parsing through the lhs of the property name (or all of it if
22811          no rhs) */
22812 
22813 #define STRLENs(s)  (sizeof("" s "") - 1)
22814 
22815     /* If there is a single package name 'utf8::', it is ambiguous.  It could
22816      * be for a user-defined property, or it could be a Unicode property, as
22817      * all of them are considered to be for that package.  For the purposes of
22818      * parsing the rest of the property, strip it off */
22819     if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
22820         lookup_name +=  STRLENs("utf8::");
22821         j -=  STRLENs("utf8::");
22822         equals_pos -=  STRLENs("utf8::");
22823     }
22824 
22825     /* Here, we are either done with the whole property name, if it was simple;
22826      * or are positioned just after the '=' if it is compound. */
22827 
22828     if (equals_pos >= 0) {
22829         assert(! stricter); /* We shouldn't have set this yet */
22830 
22831         /* Space immediately after the '=' is ignored */
22832         i++;
22833         for (; i < name_len; i++) {
22834             if (! isSPACE_A(name[i])) {
22835                 break;
22836             }
22837         }
22838 
22839         /* Most punctuation after the equals indicates a subpattern, like
22840          * \p{foo=/bar/} */
22841         if (   isPUNCT_A(name[i])
22842             && name[i] != '-'
22843             && name[i] != '+'
22844             && name[i] != '_'
22845             && name[i] != '{')
22846         {
22847             /* Find the property.  The table includes the equals sign, so we
22848              * use 'j' as-is */
22849             table_index = match_uniprop((U8 *) lookup_name, j);
22850             if (table_index) {
22851                 const char * const * prop_values
22852                                             = UNI_prop_value_ptrs[table_index];
22853                 SV * subpattern;
22854                 Size_t subpattern_len;
22855                 REGEXP * subpattern_re;
22856                 char open = name[i++];
22857                 char close;
22858                 const char * pos_in_brackets;
22859                 bool escaped = 0;
22860 
22861                 /* A backslash means the real delimitter is the next character.
22862                  * */
22863                 if (open == '\\') {
22864                     open = name[i++];
22865                     escaped = 1;
22866                 }
22867 
22868                 /* This data structure is constructed so that the matching
22869                  * closing bracket is 3 past its matching opening.  The second
22870                  * set of closing is so that if the opening is something like
22871                  * ']', the closing will be that as well.  Something similar is
22872                  * done in toke.c */
22873                 pos_in_brackets = strchr("([<)]>)]>", open);
22874                 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
22875 
22876                 if (    i >= name_len
22877                     ||  name[name_len-1] != close
22878                     || (escaped && name[name_len-2] != '\\'))
22879                 {
22880                     sv_catpvs(msg, "Unicode property wildcard not terminated");
22881                     goto append_name_to_msg;
22882                 }
22883 
22884                 Perl_ck_warner_d(aTHX_
22885                     packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
22886                     "The Unicode property wildcards feature is experimental");
22887 
22888                 /* Now create and compile the wildcard subpattern.  Use /iaa
22889                  * because nothing outside of ASCII will match, and it the
22890                  * property values should all match /i.  Note that when the
22891                  * pattern fails to compile, our added text to the user's
22892                  * pattern will be displayed to the user, which is not so
22893                  * desirable. */
22894                 subpattern_len = name_len - i - 1 - escaped;
22895                 subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)",
22896                                               (unsigned) subpattern_len,
22897                                               name + i);
22898                 subpattern = sv_2mortal(subpattern);
22899                 subpattern_re = re_compile(subpattern, 0);
22900                 assert(subpattern_re);  /* Should have died if didn't compile
22901                                          successfully */
22902 
22903                 /* For each legal property value, see if the supplied pattern
22904                  * matches it. */
22905                 while (*prop_values) {
22906                     const char * const entry = *prop_values;
22907                     const Size_t len = strlen(entry);
22908                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
22909 
22910                     if (pregexec(subpattern_re,
22911                                  (char *) entry,
22912                                  (char *) entry + len,
22913                                  (char *) entry, 0,
22914                                  entry_sv,
22915                                  0))
22916                     { /* Here, matched.  Add to the returned list */
22917                         Size_t total_len = j + len;
22918                         SV * sub_invlist = NULL;
22919                         char * this_string;
22920 
22921                         /* We know this is a legal \p{property=value}.  Call
22922                          * the function to return the list of code points that
22923                          * match it */
22924                         Newxz(this_string, total_len + 1, char);
22925                         Copy(lookup_name, this_string, j, char);
22926                         my_strlcat(this_string, entry, total_len + 1);
22927                         SAVEFREEPV(this_string);
22928                         sub_invlist = parse_uniprop_string(this_string,
22929                                                            total_len,
22930                                                            is_utf8,
22931                                                            to_fold,
22932                                                            runtime,
22933                                                            deferrable,
22934                                                            user_defined_ptr,
22935                                                            msg,
22936                                                            level + 1);
22937                         _invlist_union(prop_definition, sub_invlist,
22938                                        &prop_definition);
22939                     }
22940 
22941                     prop_values++;  /* Next iteration, look at next propvalue */
22942                 } /* End of looking through property values; (the data
22943                      structure is terminated by a NULL ptr) */
22944 
22945                 SvREFCNT_dec_NN(subpattern_re);
22946 
22947                 if (prop_definition) {
22948                     return prop_definition;
22949                 }
22950 
22951                 sv_catpvs(msg, "No Unicode property value wildcard matches:");
22952                 goto append_name_to_msg;
22953             }
22954 
22955             /* Here's how khw thinks we should proceed to handle the properties
22956              * not yet done:    Bidi Mirroring Glyph
22957                                 Bidi Paired Bracket
22958                                 Case Folding  (both full and simple)
22959                                 Decomposition Mapping
22960                                 Equivalent Unified Ideograph
22961                                 Name
22962                                 Name Alias
22963                                 Lowercase Mapping  (both full and simple)
22964                                 NFKC Case Fold
22965                                 Titlecase Mapping  (both full and simple)
22966                                 Uppercase Mapping  (both full and simple)
22967              * Move the part that looks at the property values into a perl
22968              * script, like utf8_heavy.pl is done.  This makes things somewhat
22969              * easier, but most importantly, it avoids always adding all these
22970              * strings to the memory usage when the feature is little-used.
22971              *
22972              * The property values would all be concatenated into a single
22973              * string per property with each value on a separate line, and the
22974              * code point it's for on alternating lines.  Then we match the
22975              * user's input pattern m//mg, without having to worry about their
22976              * uses of '^' and '$'.  Only the values that aren't the default
22977              * would be in the strings.  Code points would be in UTF-8.  The
22978              * search pattern that we would construct would look like
22979              * (?: \n (code-point_re) \n (?aam: user-re ) \n )
22980              * And so $1 would contain the code point that matched the user-re.
22981              * For properties where the default is the code point itself, such
22982              * as any of the case changing mappings, the string would otherwise
22983              * consist of all Unicode code points in UTF-8 strung together.
22984              * This would be impractical.  So instead, examine their compiled
22985              * pattern, looking at the ssc.  If none, reject the pattern as an
22986              * error.  Otherwise run the pattern against every code point in
22987              * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
22988              * And it might be good to create an API to return the ssc.
22989              *
22990              * For the name properties, a new function could be created in
22991              * charnames which essentially does the same thing as above,
22992              * sharing Name.pl with the other charname functions.  Don't know
22993              * about loose name matching, or algorithmically determined names.
22994              * Decomposition.pl similarly.
22995              *
22996              * It might be that a new pattern modifier would have to be
22997              * created, like /t for resTricTed, which changed the behavior of
22998              * some constructs in their subpattern, like \A. */
22999         } /* End of is a wildcard subppattern */
23000 
23001 
23002         /* Certain properties whose values are numeric need special handling.
23003          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
23004          * purposes of checking if this is one of those properties */
23005         if (memBEGINPs(lookup_name, j, "is")) {
23006             lookup_offset = 2;
23007         }
23008 
23009         /* Then check if it is one of these specially-handled properties.  The
23010          * possibilities are hard-coded because easier this way, and the list
23011          * is unlikely to change.
23012          *
23013          * All numeric value type properties are of this ilk, and are also
23014          * special in a different way later on.  So find those first.  There
23015          * are several numeric value type properties in the Unihan DB (which is
23016          * unlikely to be compiled with perl, but we handle it here in case it
23017          * does get compiled).  They all end with 'numeric'.  The interiors
23018          * aren't checked for the precise property.  This would stop working if
23019          * a cjk property were to be created that ended with 'numeric' and
23020          * wasn't a numeric type */
23021         is_nv_type = memEQs(lookup_name + lookup_offset,
23022                        j - 1 - lookup_offset, "numericvalue")
23023                   || memEQs(lookup_name + lookup_offset,
23024                       j - 1 - lookup_offset, "nv")
23025                   || (   memENDPs(lookup_name + lookup_offset,
23026                             j - 1 - lookup_offset, "numeric")
23027                       && (   memBEGINPs(lookup_name + lookup_offset,
23028                                       j - 1 - lookup_offset, "cjk")
23029                           || memBEGINPs(lookup_name + lookup_offset,
23030                                       j - 1 - lookup_offset, "k")));
23031         if (   is_nv_type
23032             || memEQs(lookup_name + lookup_offset,
23033                       j - 1 - lookup_offset, "canonicalcombiningclass")
23034             || memEQs(lookup_name + lookup_offset,
23035                       j - 1 - lookup_offset, "ccc")
23036             || memEQs(lookup_name + lookup_offset,
23037                       j - 1 - lookup_offset, "age")
23038             || memEQs(lookup_name + lookup_offset,
23039                       j - 1 - lookup_offset, "in")
23040             || memEQs(lookup_name + lookup_offset,
23041                       j - 1 - lookup_offset, "presentin"))
23042         {
23043             unsigned int k;
23044 
23045             /* Since the stuff after the '=' is a number, we can't throw away
23046              * '-' willy-nilly, as those could be a minus sign.  Other stricter
23047              * rules also apply.  However, these properties all can have the
23048              * rhs not be a number, in which case they contain at least one
23049              * alphabetic.  In those cases, the stricter rules don't apply.
23050              * But the numeric type properties can have the alphas [Ee] to
23051              * signify an exponent, and it is still a number with stricter
23052              * rules.  So look for an alpha that signifies not-strict */
23053             stricter = TRUE;
23054             for (k = i; k < name_len; k++) {
23055                 if (   isALPHA_A(name[k])
23056                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
23057                 {
23058                     stricter = FALSE;
23059                     break;
23060                 }
23061             }
23062         }
23063 
23064         if (stricter) {
23065 
23066             /* A number may have a leading '+' or '-'.  The latter is retained
23067              * */
23068             if (name[i] == '+') {
23069                 i++;
23070             }
23071             else if (name[i] == '-') {
23072                 lookup_name[j++] = '-';
23073                 i++;
23074             }
23075 
23076             /* Skip leading zeros including single underscores separating the
23077              * zeros, or between the final leading zero and the first other
23078              * digit */
23079             for (; i < name_len - 1; i++) {
23080                 if (    name[i] != '0'
23081                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
23082                 {
23083                     break;
23084                 }
23085             }
23086         }
23087     }
23088     else {  /* No '=' */
23089 
23090        /* Only a few properties without an '=' should be parsed with stricter
23091         * rules.  The list is unlikely to change. */
23092         if (   memBEGINPs(lookup_name, j, "perl")
23093             && memNEs(lookup_name + 4, j - 4, "space")
23094             && memNEs(lookup_name + 4, j - 4, "word"))
23095         {
23096             stricter = TRUE;
23097 
23098             /* We set the inputs back to 0 and the code below will reparse,
23099              * using strict */
23100             i = j = 0;
23101         }
23102     }
23103 
23104     /* Here, we have either finished the property, or are positioned to parse
23105      * the remainder, and we know if stricter rules apply.  Finish out, if not
23106      * already done */
23107     for (; i < name_len; i++) {
23108         char cur = name[i];
23109 
23110         /* In all instances, case differences are ignored, and we normalize to
23111          * lowercase */
23112         if (isUPPER_A(cur)) {
23113             lookup_name[j++] = toLOWER(cur);
23114             continue;
23115         }
23116 
23117         /* An underscore is skipped, but not under strict rules unless it
23118          * separates two digits */
23119         if (cur == '_') {
23120             if (    stricter
23121                 && (     i == 0 || (int) i == equals_pos || i == name_len- 1
23122                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
23123             {
23124                 lookup_name[j++] = '_';
23125             }
23126             continue;
23127         }
23128 
23129         /* Hyphens are skipped except under strict */
23130         if (cur == '-' && ! stricter) {
23131             continue;
23132         }
23133 
23134         /* XXX Bug in documentation.  It says white space skipped adjacent to
23135          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
23136          * in a number */
23137         if (isSPACE_A(cur) && ! stricter) {
23138             continue;
23139         }
23140 
23141         lookup_name[j++] = cur;
23142 
23143         /* Unless this is a non-trailing slash, we are done with it */
23144         if (i >= name_len - 1 || cur != '/') {
23145             continue;
23146         }
23147 
23148         slash_pos = j;
23149 
23150         /* A slash in the 'numeric value' property indicates that what follows
23151          * is a denominator.  It can have a leading '+' and '0's that should be
23152          * skipped.  But we have never allowed a negative denominator, so treat
23153          * a minus like every other character.  (No need to rule out a second
23154          * '/', as that won't match anything anyway */
23155         if (is_nv_type) {
23156             i++;
23157             if (i < name_len && name[i] == '+') {
23158                 i++;
23159             }
23160 
23161             /* Skip leading zeros including underscores separating digits */
23162             for (; i < name_len - 1; i++) {
23163                 if (   name[i] != '0'
23164                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
23165                 {
23166                     break;
23167                 }
23168             }
23169 
23170             /* Store the first real character in the denominator */
23171             lookup_name[j++] = name[i];
23172         }
23173     }
23174 
23175     /* Here are completely done parsing the input 'name', and 'lookup_name'
23176      * contains a copy, normalized.
23177      *
23178      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
23179      * different from without the underscores.  */
23180     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
23181            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
23182         && UNLIKELY(name[name_len-1] == '_'))
23183     {
23184         lookup_name[j++] = '&';
23185     }
23186 
23187     /* If the original input began with 'In' or 'Is', it could be a subroutine
23188      * call to a user-defined property instead of a Unicode property name. */
23189     if (    non_pkg_begin + name_len > 2
23190         &&  name[non_pkg_begin+0] == 'I'
23191         && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
23192     {
23193         starts_with_In_or_Is = TRUE;
23194     }
23195     else {
23196         could_be_user_defined = FALSE;
23197     }
23198 
23199     if (could_be_user_defined) {
23200         CV* user_sub;
23201 
23202         /* If the user defined property returns the empty string, it could
23203          * easily be because the pattern is being compiled before the data it
23204          * actually needs to compile is available.  This could be argued to be
23205          * a bug in the perl code, but this is a change of behavior for Perl,
23206          * so we handle it.  This means that intentionally returning nothing
23207          * will not be resolved until runtime */
23208         bool empty_return = FALSE;
23209 
23210         /* Here, the name could be for a user defined property, which are
23211          * implemented as subs. */
23212         user_sub = get_cvn_flags(name, name_len, 0);
23213         if (user_sub) {
23214             const char insecure[] = "Insecure user-defined property";
23215 
23216             /* Here, there is a sub by the correct name.  Normally we call it
23217              * to get the property definition */
23218             dSP;
23219             SV * user_sub_sv = MUTABLE_SV(user_sub);
23220             SV * error;     /* Any error returned by calling 'user_sub' */
23221             SV * key;       /* The key into the hash of user defined sub names
23222                              */
23223             SV * placeholder;
23224             SV ** saved_user_prop_ptr;      /* Hash entry for this property */
23225 
23226             /* How many times to retry when another thread is in the middle of
23227              * expanding the same definition we want */
23228             PERL_INT_FAST8_T retry_countdown = 10;
23229 
23230             DECLARATION_FOR_GLOBAL_CONTEXT;
23231 
23232             /* If we get here, we know this property is user-defined */
23233             *user_defined_ptr = TRUE;
23234 
23235             /* We refuse to call a potentially tainted subroutine; returning an
23236              * error instead */
23237             if (TAINT_get) {
23238                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23239                 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
23240                 goto append_name_to_msg;
23241             }
23242 
23243             /* In principal, we only call each subroutine property definition
23244              * once during the life of the program.  This guarantees that the
23245              * property definition never changes.  The results of the single
23246              * sub call are stored in a hash, which is used instead for future
23247              * references to this property.  The property definition is thus
23248              * immutable.  But, to allow the user to have a /i-dependent
23249              * definition, we call the sub once for non-/i, and once for /i,
23250              * should the need arise, passing the /i status as a parameter.
23251              *
23252              * We start by constructing the hash key name, consisting of the
23253              * fully qualified subroutine name, preceded by the /i status, so
23254              * that there is a key for /i and a different key for non-/i */
23255             key = newSVpvn(((to_fold) ? "1" : "0"), 1);
23256             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
23257                                           non_pkg_begin != 0);
23258             sv_catsv(key, fq_name);
23259             sv_2mortal(key);
23260 
23261             /* We only call the sub once throughout the life of the program
23262              * (with the /i, non-/i exception noted above).  That means the
23263              * hash must be global and accessible to all threads.  It is
23264              * created at program start-up, before any threads are created, so
23265              * is accessible to all children.  But this creates some
23266              * complications.
23267              *
23268              * 1) The keys can't be shared, or else problems arise; sharing is
23269              *    turned off at hash creation time
23270              * 2) All SVs in it are there for the remainder of the life of the
23271              *    program, and must be created in the same interpreter context
23272              *    as the hash, or else they will be freed from the wrong pool
23273              *    at global destruction time.  This is handled by switching to
23274              *    the hash's context to create each SV going into it, and then
23275              *    immediately switching back
23276              * 3) All accesses to the hash must be controlled by a mutex, to
23277              *    prevent two threads from getting an unstable state should
23278              *    they simultaneously be accessing it.  The code below is
23279              *    crafted so that the mutex is locked whenever there is an
23280              *    access and unlocked only when the next stable state is
23281              *    achieved.
23282              *
23283              * The hash stores either the definition of the property if it was
23284              * valid, or, if invalid, the error message that was raised.  We
23285              * use the type of SV to distinguish.
23286              *
23287              * There's also the need to guard against the definition expansion
23288              * from infinitely recursing.  This is handled by storing the aTHX
23289              * of the expanding thread during the expansion.  Again the SV type
23290              * is used to distinguish this from the other two cases.  If we
23291              * come to here and the hash entry for this property is our aTHX,
23292              * it means we have recursed, and the code assumes that we would
23293              * infinitely recurse, so instead stops and raises an error.
23294              * (Any recursion has always been treated as infinite recursion in
23295              * this feature.)
23296              *
23297              * If instead, the entry is for a different aTHX, it means that
23298              * that thread has gotten here first, and hasn't finished expanding
23299              * the definition yet.  We just have to wait until it is done.  We
23300              * sleep and retry a few times, returning an error if the other
23301              * thread doesn't complete. */
23302 
23303           re_fetch:
23304             USER_PROP_MUTEX_LOCK;
23305 
23306             /* If we have an entry for this key, the subroutine has already
23307              * been called once with this /i status. */
23308             saved_user_prop_ptr = hv_fetch(PL_user_def_props,
23309                                                    SvPVX(key), SvCUR(key), 0);
23310             if (saved_user_prop_ptr) {
23311 
23312                 /* If the saved result is an inversion list, it is the valid
23313                  * definition of this property */
23314                 if (is_invlist(*saved_user_prop_ptr)) {
23315                     prop_definition = *saved_user_prop_ptr;
23316 
23317                     /* The SV in the hash won't be removed until global
23318                      * destruction, so it is stable and we can unlock */
23319                     USER_PROP_MUTEX_UNLOCK;
23320 
23321                     /* The caller shouldn't try to free this SV */
23322                     return prop_definition;
23323                 }
23324 
23325                 /* Otherwise, if it is a string, it is the error message
23326                  * that was returned when we first tried to evaluate this
23327                  * property.  Fail, and append the message */
23328                 if (SvPOK(*saved_user_prop_ptr)) {
23329                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23330                     sv_catsv(msg, *saved_user_prop_ptr);
23331 
23332                     /* The SV in the hash won't be removed until global
23333                      * destruction, so it is stable and we can unlock */
23334                     USER_PROP_MUTEX_UNLOCK;
23335 
23336                     return NULL;
23337                 }
23338 
23339                 assert(SvIOK(*saved_user_prop_ptr));
23340 
23341                 /* Here, we have an unstable entry in the hash.  Either another
23342                  * thread is in the middle of expanding the property's
23343                  * definition, or we are ourselves recursing.  We use the aTHX
23344                  * in it to distinguish */
23345                 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
23346 
23347                     /* Here, it's another thread doing the expanding.  We've
23348                      * looked as much as we are going to at the contents of the
23349                      * hash entry.  It's safe to unlock. */
23350                     USER_PROP_MUTEX_UNLOCK;
23351 
23352                     /* Retry a few times */
23353                     if (retry_countdown-- > 0) {
23354                         PerlProc_sleep(1);
23355                         goto re_fetch;
23356                     }
23357 
23358                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23359                     sv_catpvs(msg, "Timeout waiting for another thread to "
23360                                    "define");
23361                     goto append_name_to_msg;
23362                 }
23363 
23364                 /* Here, we are recursing; don't dig any deeper */
23365                 USER_PROP_MUTEX_UNLOCK;
23366 
23367                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23368                 sv_catpvs(msg,
23369                           "Infinite recursion in user-defined property");
23370                 goto append_name_to_msg;
23371             }
23372 
23373             /* Here, this thread has exclusive control, and there is no entry
23374              * for this property in the hash.  So we have the go ahead to
23375              * expand the definition ourselves. */
23376 
23377             PUSHSTACKi(PERLSI_MAGIC);
23378             ENTER;
23379 
23380             /* Create a temporary placeholder in the hash to detect recursion
23381              * */
23382             SWITCH_TO_GLOBAL_CONTEXT;
23383             placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
23384             (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
23385             RESTORE_CONTEXT;
23386 
23387             /* Now that we have a placeholder, we can let other threads
23388              * continue */
23389             USER_PROP_MUTEX_UNLOCK;
23390 
23391             /* Make sure the placeholder always gets destroyed */
23392             SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
23393 
23394             PUSHMARK(SP);
23395             SAVETMPS;
23396 
23397             /* Call the user's function, with the /i status as a parameter.
23398              * Note that we have gone to a lot of trouble to keep this call
23399              * from being within the locked mutex region. */
23400             XPUSHs(boolSV(to_fold));
23401             PUTBACK;
23402 
23403             /* The following block was taken from swash_init().  Presumably
23404              * they apply to here as well, though we no longer use a swash --
23405              * khw */
23406             SAVEHINTS();
23407             save_re_context();
23408             /* We might get here via a subroutine signature which uses a utf8
23409              * parameter name, at which point PL_subname will have been set
23410              * but not yet used. */
23411             save_item(PL_subname);
23412 
23413             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
23414 
23415             SPAGAIN;
23416 
23417             error = ERRSV;
23418             if (TAINT_get || SvTRUE(error)) {
23419                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23420                 if (SvTRUE(error)) {
23421                     sv_catpvs(msg, "Error \"");
23422                     sv_catsv(msg, error);
23423                     sv_catpvs(msg, "\"");
23424                 }
23425                 if (TAINT_get) {
23426                     if (SvTRUE(error)) sv_catpvs(msg, "; ");
23427                     sv_catpvn(msg, insecure, sizeof(insecure) - 1);
23428                 }
23429 
23430                 if (name_len > 0) {
23431                     sv_catpvs(msg, " in expansion of ");
23432                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
23433                                                                   name_len,
23434                                                                   name));
23435                 }
23436 
23437                 (void) POPs;
23438                 prop_definition = NULL;
23439             }
23440             else {  /* G_SCALAR guarantees a single return value */
23441                 SV * contents = POPs;
23442 
23443                 /* The contents is supposed to be the expansion of the property
23444                  * definition.  If the definition is deferrable, and we got an
23445                  * empty string back, set a flag to later defer it (after clean
23446                  * up below). */
23447                 if (      deferrable
23448                     && (! SvPOK(contents) || SvCUR(contents) == 0))
23449                 {
23450                         empty_return = TRUE;
23451                 }
23452                 else { /* Otherwise, call a function to check for valid syntax,
23453                           and handle it */
23454 
23455                     prop_definition = handle_user_defined_property(
23456                                                     name, name_len,
23457                                                     is_utf8, to_fold, runtime,
23458                                                     deferrable,
23459                                                     contents, user_defined_ptr,
23460                                                     msg,
23461                                                     level);
23462                 }
23463             }
23464 
23465             /* Here, we have the results of the expansion.  Delete the
23466              * placeholder, and if the definition is now known, replace it with
23467              * that definition.  We need exclusive access to the hash, and we
23468              * can't let anyone else in, between when we delete the placeholder
23469              * and add the permanent entry */
23470             USER_PROP_MUTEX_LOCK;
23471 
23472             S_delete_recursion_entry(aTHX_ SvPVX(key));
23473 
23474             if (    ! empty_return
23475                 && (! prop_definition || is_invlist(prop_definition)))
23476             {
23477                 /* If we got success we use the inversion list defining the
23478                  * property; otherwise use the error message */
23479                 SWITCH_TO_GLOBAL_CONTEXT;
23480                 (void) hv_store_ent(PL_user_def_props,
23481                                     key,
23482                                     ((prop_definition)
23483                                      ? newSVsv(prop_definition)
23484                                      : newSVsv(msg)),
23485                                     0);
23486                 RESTORE_CONTEXT;
23487             }
23488 
23489             /* All done, and the hash now has a permanent entry for this
23490              * property.  Give up exclusive control */
23491             USER_PROP_MUTEX_UNLOCK;
23492 
23493             FREETMPS;
23494             LEAVE;
23495             POPSTACK;
23496 
23497             if (empty_return) {
23498                 goto definition_deferred;
23499             }
23500 
23501             if (prop_definition) {
23502 
23503                 /* If the definition is for something not known at this time,
23504                  * we toss it, and go return the main property name, as that's
23505                  * the one the user will be aware of */
23506                 if (! is_invlist(prop_definition)) {
23507                     SvREFCNT_dec_NN(prop_definition);
23508                     goto definition_deferred;
23509                 }
23510 
23511                 sv_2mortal(prop_definition);
23512             }
23513 
23514             /* And return */
23515             return prop_definition;
23516 
23517         }   /* End of calling the subroutine for the user-defined property */
23518     }       /* End of it could be a user-defined property */
23519 
23520     /* Here it wasn't a user-defined property that is known at this time.  See
23521      * if it is a Unicode property */
23522 
23523     lookup_len = j;     /* This is a more mnemonic name than 'j' */
23524 
23525     /* Get the index into our pointer table of the inversion list corresponding
23526      * to the property */
23527     table_index = match_uniprop((U8 *) lookup_name, lookup_len);
23528 
23529     /* If it didn't find the property ... */
23530     if (table_index == 0) {
23531 
23532         /* Try again stripping off any initial 'In' or 'Is' */
23533         if (starts_with_In_or_Is) {
23534             lookup_name += 2;
23535             lookup_len -= 2;
23536             equals_pos -= 2;
23537             slash_pos -= 2;
23538 
23539             table_index = match_uniprop((U8 *) lookup_name, lookup_len);
23540         }
23541 
23542         if (table_index == 0) {
23543             char * canonical;
23544 
23545             /* Here, we didn't find it.  If not a numeric type property, and
23546              * can't be a user-defined one, it isn't a legal property */
23547             if (! is_nv_type) {
23548                 if (! could_be_user_defined) {
23549                     goto failed;
23550                 }
23551 
23552                 /* Here, the property name is legal as a user-defined one.   At
23553                  * compile time, it might just be that the subroutine for that
23554                  * property hasn't been encountered yet, but at runtime, it's
23555                  * an error to try to use an undefined one */
23556                 if (! deferrable) {
23557                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23558                     sv_catpvs(msg, "Unknown user-defined property name");
23559                     goto append_name_to_msg;
23560                 }
23561 
23562                 goto definition_deferred;
23563             } /* End of isn't a numeric type property */
23564 
23565             /* The numeric type properties need more work to decide.  What we
23566              * do is make sure we have the number in canonical form and look
23567              * that up. */
23568 
23569             if (slash_pos < 0) {    /* No slash */
23570 
23571                 /* When it isn't a rational, take the input, convert it to a
23572                  * NV, then create a canonical string representation of that
23573                  * NV. */
23574 
23575                 NV value;
23576                 SSize_t value_len = lookup_len - equals_pos;
23577 
23578                 /* Get the value */
23579                 if (   value_len <= 0
23580                     || my_atof3(lookup_name + equals_pos, &value,
23581                                 value_len)
23582                           != lookup_name + lookup_len)
23583                 {
23584                     goto failed;
23585                 }
23586 
23587                 /* If the value is an integer, the canonical value is integral
23588                  * */
23589                 if (Perl_ceil(value) == value) {
23590                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
23591                                             equals_pos, lookup_name, value);
23592                 }
23593                 else {  /* Otherwise, it is %e with a known precision */
23594                     char * exp_ptr;
23595 
23596                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
23597                                                 equals_pos, lookup_name,
23598                                                 PL_E_FORMAT_PRECISION, value);
23599 
23600                     /* The exponent generated is expecting two digits, whereas
23601                      * %e on some systems will generate three.  Remove leading
23602                      * zeros in excess of 2 from the exponent.  We start
23603                      * looking for them after the '=' */
23604                     exp_ptr = strchr(canonical + equals_pos, 'e');
23605                     if (exp_ptr) {
23606                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
23607                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
23608 
23609                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
23610 
23611                         if (excess_exponent_len > 0) {
23612                             SSize_t leading_zeros = strspn(cur_ptr, "0");
23613                             SSize_t excess_leading_zeros
23614                                     = MIN(leading_zeros, excess_exponent_len);
23615                             if (excess_leading_zeros > 0) {
23616                                 Move(cur_ptr + excess_leading_zeros,
23617                                      cur_ptr,
23618                                      strlen(cur_ptr) - excess_leading_zeros
23619                                        + 1,  /* Copy the NUL as well */
23620                                      char);
23621                             }
23622                         }
23623                     }
23624                 }
23625             }
23626             else {  /* Has a slash.  Create a rational in canonical form  */
23627                 UV numerator, denominator, gcd, trial;
23628                 const char * end_ptr;
23629                 const char * sign = "";
23630 
23631                 /* We can't just find the numerator, denominator, and do the
23632                  * division, then use the method above, because that is
23633                  * inexact.  And the input could be a rational that is within
23634                  * epsilon (given our precision) of a valid rational, and would
23635                  * then incorrectly compare valid.
23636                  *
23637                  * We're only interested in the part after the '=' */
23638                 const char * this_lookup_name = lookup_name + equals_pos;
23639                 lookup_len -= equals_pos;
23640                 slash_pos -= equals_pos;
23641 
23642                 /* Handle any leading minus */
23643                 if (this_lookup_name[0] == '-') {
23644                     sign = "-";
23645                     this_lookup_name++;
23646                     lookup_len--;
23647                     slash_pos--;
23648                 }
23649 
23650                 /* Convert the numerator to numeric */
23651                 end_ptr = this_lookup_name + slash_pos;
23652                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
23653                     goto failed;
23654                 }
23655 
23656                 /* It better have included all characters before the slash */
23657                 if (*end_ptr != '/') {
23658                     goto failed;
23659                 }
23660 
23661                 /* Set to look at just the denominator */
23662                 this_lookup_name += slash_pos;
23663                 lookup_len -= slash_pos;
23664                 end_ptr = this_lookup_name + lookup_len;
23665 
23666                 /* Convert the denominator to numeric */
23667                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
23668                     goto failed;
23669                 }
23670 
23671                 /* It better be the rest of the characters, and don't divide by
23672                  * 0 */
23673                 if (   end_ptr != this_lookup_name + lookup_len
23674                     || denominator == 0)
23675                 {
23676                     goto failed;
23677                 }
23678 
23679                 /* Get the greatest common denominator using
23680                    http://en.wikipedia.org/wiki/Euclidean_algorithm */
23681                 gcd = numerator;
23682                 trial = denominator;
23683                 while (trial != 0) {
23684                     UV temp = trial;
23685                     trial = gcd % trial;
23686                     gcd = temp;
23687                 }
23688 
23689                 /* If already in lowest possible terms, we have already tried
23690                  * looking this up */
23691                 if (gcd == 1) {
23692                     goto failed;
23693                 }
23694 
23695                 /* Reduce the rational, which should put it in canonical form
23696                  * */
23697                 numerator /= gcd;
23698                 denominator /= gcd;
23699 
23700                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
23701                         equals_pos, lookup_name, sign, numerator, denominator);
23702             }
23703 
23704             /* Here, we have the number in canonical form.  Try that */
23705             table_index = match_uniprop((U8 *) canonical, strlen(canonical));
23706             if (table_index == 0) {
23707                 goto failed;
23708             }
23709         }   /* End of still didn't find the property in our table */
23710     }       /* End of       didn't find the property in our table */
23711 
23712     /* Here, we have a non-zero return, which is an index into a table of ptrs.
23713      * A negative return signifies that the real index is the absolute value,
23714      * but the result needs to be inverted */
23715     if (table_index < 0) {
23716         invert_return = TRUE;
23717         table_index = -table_index;
23718     }
23719 
23720     /* Out-of band indices indicate a deprecated property.  The proper index is
23721      * modulo it with the table size.  And dividing by the table size yields
23722      * an offset into a table constructed by regen/mk_invlists.pl to contain
23723      * the corresponding warning message */
23724     if (table_index > MAX_UNI_KEYWORD_INDEX) {
23725         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
23726         table_index %= MAX_UNI_KEYWORD_INDEX;
23727         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
23728                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
23729                 (int) name_len, name, deprecated_property_msgs[warning_offset]);
23730     }
23731 
23732     /* In a few properties, a different property is used under /i.  These are
23733      * unlikely to change, so are hard-coded here. */
23734     if (to_fold) {
23735         if (   table_index == UNI_XPOSIXUPPER
23736             || table_index == UNI_XPOSIXLOWER
23737             || table_index == UNI_TITLE)
23738         {
23739             table_index = UNI_CASED;
23740         }
23741         else if (   table_index == UNI_UPPERCASELETTER
23742                  || table_index == UNI_LOWERCASELETTER
23743 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
23744                  || table_index == UNI_TITLECASELETTER
23745 #  endif
23746         ) {
23747             table_index = UNI_CASEDLETTER;
23748         }
23749         else if (  table_index == UNI_POSIXUPPER
23750                 || table_index == UNI_POSIXLOWER)
23751         {
23752             table_index = UNI_POSIXALPHA;
23753         }
23754     }
23755 
23756     /* Create and return the inversion list */
23757     prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]);
23758     sv_2mortal(prop_definition);
23759 
23760 
23761     /* See if there is a private use override to add to this definition */
23762     {
23763         COPHH * hinthash = (IN_PERL_COMPILETIME)
23764                            ? CopHINTHASH_get(&PL_compiling)
23765                            : CopHINTHASH_get(PL_curcop);
23766 	SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
23767 
23768         if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
23769 
23770             /* See if there is an element in the hints hash for this table */
23771             SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
23772             const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
23773 
23774             if (pos) {
23775                 bool dummy;
23776                 SV * pu_definition;
23777                 SV * pu_invlist;
23778                 SV * expanded_prop_definition =
23779                             sv_2mortal(invlist_clone(prop_definition, NULL));
23780 
23781                 /* If so, it's definition is the string from here to the next
23782                  * \a character.  And its format is the same as a user-defined
23783                  * property */
23784                 pos += SvCUR(pu_lookup);
23785                 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
23786                 pu_invlist = handle_user_defined_property(lookup_name,
23787                                                           lookup_len,
23788                                                           0, /* Not UTF-8 */
23789                                                           0, /* Not folded */
23790                                                           runtime,
23791                                                           deferrable,
23792                                                           pu_definition,
23793                                                           &dummy,
23794                                                           msg,
23795                                                           level);
23796                 if (TAINT_get) {
23797                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23798                     sv_catpvs(msg, "Insecure private-use override");
23799                     goto append_name_to_msg;
23800                 }
23801 
23802                 /* For now, as a safety measure, make sure that it doesn't
23803                  * override non-private use code points */
23804                 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
23805 
23806                 /* Add it to the list to be returned */
23807                 _invlist_union(prop_definition, pu_invlist,
23808                                &expanded_prop_definition);
23809                 prop_definition = expanded_prop_definition;
23810                 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
23811             }
23812         }
23813     }
23814 
23815     if (invert_return) {
23816         _invlist_invert(prop_definition);
23817     }
23818     return prop_definition;
23819 
23820 
23821   failed:
23822     if (non_pkg_begin != 0) {
23823         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23824         sv_catpvs(msg, "Illegal user-defined property name");
23825     }
23826     else {
23827         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23828         sv_catpvs(msg, "Can't find Unicode property definition");
23829     }
23830     /* FALLTHROUGH */
23831 
23832   append_name_to_msg:
23833     {
23834         const char * prefix = (runtime && level == 0) ?  " \\p{" : " \"";
23835         const char * suffix = (runtime && level == 0) ?  "}" : "\"";
23836 
23837         sv_catpv(msg, prefix);
23838         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23839         sv_catpv(msg, suffix);
23840     }
23841 
23842     return NULL;
23843 
23844   definition_deferred:
23845 
23846     /* Here it could yet to be defined, so defer evaluation of this
23847      * until its needed at runtime.  We need the fully qualified property name
23848      * to avoid ambiguity, and a trailing newline */
23849     if (! fq_name) {
23850         fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
23851                                       non_pkg_begin != 0 /* If has "::" */
23852                                );
23853     }
23854     sv_catpvs(fq_name, "\n");
23855 
23856     *user_defined_ptr = TRUE;
23857     return fq_name;
23858 }
23859 
23860 #endif
23861 
23862 /*
23863  * ex: set ts=8 sts=4 sw=4 et:
23864  */
23865